diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index 8b9677709..000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,18 +0,0 @@ - - -## Your Environment - - - -- Operating System: -- Python Version Used: -- spaCy Version Used: -- Environment Information: diff --git a/.github/ISSUE_TEMPLATE/01_bugs.md b/.github/ISSUE_TEMPLATE/01_bugs.md index 9e1b35fbf..768832c24 100644 --- a/.github/ISSUE_TEMPLATE/01_bugs.md +++ b/.github/ISSUE_TEMPLATE/01_bugs.md @@ -1,6 +1,6 @@ --- -name: "\U0001F6A8 Bug Report" -about: Did you come across a bug or unexpected behaviour differing from the docs? +name: "\U0001F6A8 Submit a Bug Report" +about: Use this template if you came across a bug or unexpected behaviour differing from the docs. --- diff --git a/.github/ISSUE_TEMPLATE/03_docs.md b/.github/ISSUE_TEMPLATE/02_docs.md similarity index 86% rename from .github/ISSUE_TEMPLATE/03_docs.md rename to .github/ISSUE_TEMPLATE/02_docs.md index 4cf791330..0df41abc1 100644 --- a/.github/ISSUE_TEMPLATE/03_docs.md +++ b/.github/ISSUE_TEMPLATE/02_docs.md @@ -1,5 +1,5 @@ --- -name: "\U0001F4DA Documentation" +name: "\U0001F4DA Submit a Documentation Report" about: Did you spot a mistake in the docs, is anything unclear or do you have a suggestion? diff --git a/.github/ISSUE_TEMPLATE/02_install.md b/.github/ISSUE_TEMPLATE/02_install.md deleted file mode 100644 index d0790bbdb..000000000 --- a/.github/ISSUE_TEMPLATE/02_install.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -name: "\U000023F3 Installation Problem" -about: Do you have problems installing spaCy, and none of the suggestions in the docs - and other issues helped? - ---- - - -## How to reproduce the problem - - -```bash -# copy-paste the error message here -``` - -## Your Environment - -* Operating System: -* Python Version Used: -* spaCy Version Used: -* Environment Information: diff --git a/.github/ISSUE_TEMPLATE/04_other.md b/.github/ISSUE_TEMPLATE/04_other.md deleted file mode 100644 index 4c6ada4cc..000000000 --- a/.github/ISSUE_TEMPLATE/04_other.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -name: "\U0001F4AC Anything else?" -about: For feature and project ideas, general usage questions or help with your code, please post on the GitHub Discussions board instead. ---- - - - -## Your Environment - - - -- Operating System: -- Python Version Used: -- spaCy Version Used: -- Environment Information: diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..fce1a1064 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,17 @@ +blank_issues_enabled: false +contact_links: + - name: ⚠️ Python 3.10 Support + url: https://github.com/explosion/spaCy/discussions/9418 + about: Python 3.10 wheels haven't been released yet, see the link for details. + - name: 🗯 Discussions Forum + url: https://github.com/explosion/spaCy/discussions + about: Install issues, usage questions, general discussion and anything else that isn't a bug report. + - name: 📖 spaCy FAQ & Troubleshooting + url: https://github.com/explosion/spaCy/discussions/8226 + about: Before you post, check out the FAQ for answers to common community questions! + - name: 💫 spaCy Usage Guides & API reference + url: https://spacy.io/usage + about: Everything you need to know about spaCy and how to use it. + - name: 🛠 Submit a Pull Request + url: https://github.com/explosion/spaCy/pulls + about: Did you spot a mistake and know how to fix it? Feel free to submit a PR straight away! 
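For anyone mirroring this issue-template setup in another repository, below is a minimal sanity check for the new `.github/ISSUE_TEMPLATE/config.yml`. This is a hypothetical helper, not a file in this PR: the script name and the specific checks are assumptions, and it only relies on PyYAML being installed.

```python
# check_issue_template_config.py -- hypothetical helper, not part of this PR.
# Sanity-checks .github/ISSUE_TEMPLATE/config.yml: blank issues must stay
# disabled and every contact link needs a name, a URL and an "about" blurb.
import sys

import yaml  # PyYAML


def main(path=".github/ISSUE_TEMPLATE/config.yml"):
    with open(path, encoding="utf8") as f:
        config = yaml.safe_load(f)

    # The whole point of the config is to route people away from blank issues.
    assert config.get("blank_issues_enabled") is False, "blank issues should stay disabled"

    links = config.get("contact_links", [])
    assert links, "expected at least one contact link"
    for link in links:
        missing = {"name", "url", "about"} - set(link)
        assert not missing, f"contact link {link.get('name', '?')!r} is missing {missing}"
        assert link["url"].startswith("https://"), f"non-HTTPS url: {link['url']}"

    print(f"{path}: {len(links)} contact links look fine")


if __name__ == "__main__":
    main(*sys.argv[1:])
```

Run from the repository root, it could sit alongside the `validate_universe_json.py` step added to the Azure pipeline further down in this patch.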
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ec11b78bd..b48b2c51b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -14,6 +14,6 @@ or new feature, or a change to the documentation? --> ## Checklist -- [ ] I have submitted the spaCy Contributor Agreement. +- [ ] I confirm that I have the right to submit this contribution under the project's MIT license. - [ ] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml new file mode 100644 index 000000000..80c88b0b8 --- /dev/null +++ b/.github/azure-steps.yml @@ -0,0 +1,117 @@ +parameters: + python_version: '' + architecture: '' + prefix: '' + gpu: false + num_build_jobs: 1 + +steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: ${{ parameters.python_version }} + architecture: ${{ parameters.architecture }} + + - bash: | + echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" + displayName: 'Set variables' + + - script: | + ${{ parameters.prefix }} python -m pip install -U pip setuptools + ${{ parameters.prefix }} python -m pip install -U -r requirements.txt + displayName: "Install dependencies" + + - script: | + ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }} + ${{ parameters.prefix }} python setup.py sdist --formats=gztar + displayName: "Compile and build sdist" + + - script: python -m mypy spacy + displayName: 'Run mypy' + condition: ne(variables['python_version'], '3.10') + + - task: DeleteFiles@1 + inputs: + contents: "spacy" + displayName: "Delete source directory" + + - script: | + ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt + ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt + displayName: "Uninstall all packages" + + - bash: | + ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + ${{ parameters.prefix }} python -m pip install dist/$SDIST + displayName: "Install from sdist" + + - script: | + ${{ parameters.prefix }} python -m pip install -U -r requirements.txt + displayName: "Install test requirements" + + - script: | + ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0 + ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html + displayName: "Install GPU requirements" + condition: eq(${{ parameters.gpu }}, true) + + - script: | + ${{ parameters.prefix }} python -m pytest --pyargs spacy + displayName: "Run CPU tests" + condition: eq(${{ parameters.gpu }}, false) + + - script: | + ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu + displayName: "Run GPU tests" + condition: eq(${{ parameters.gpu }}, true) + + - script: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + displayName: 'Test download CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
+ displayName: 'Test convert CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -m spacy init config -p ner -l ca ner.cfg + python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy + displayName: 'Test debug config CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + # will have errors due to sparse data, check for summary in output + python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary + displayName: 'Test debug data CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 + displayName: 'Test train CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + displayName: 'Test assemble CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + displayName: 'Test assemble CLI vectors warning' + condition: eq(variables['python_version'], '3.8') + + - script: | + python .github/validate_universe_json.py website/meta/universe.json + displayName: 'Test website/meta/universe.json' + condition: eq(variables['python_version'], '3.8') + + - script: | + ${{ parameters.prefix }} python -m pip install thinc-apple-ops + ${{ parameters.prefix }} python -m pytest --pyargs spacy + displayName: "Run CPU tests with thinc-apple-ops" + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) diff --git a/.github/contributors/0x2b3bfa0.md b/.github/contributors/0x2b3bfa0.md new file mode 100644 index 000000000..017aae52d --- /dev/null +++ b/.github/contributors/0x2b3bfa0.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Helio Machado | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-02-03 | +| GitHub username | 0x2b3bfa0 | +| Website (optional) | | diff --git a/.github/contributors/AyushExel.md b/.github/contributors/AyushExel.md new file mode 100644 index 000000000..281fd0cd0 --- /dev/null +++ b/.github/contributors/AyushExel.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ayush Chaurasia | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-03-12 | +| GitHub username | AyushExel | +| Website (optional) | | diff --git a/.github/contributors/Jette16.md b/.github/contributors/Jette16.md new file mode 100644 index 000000000..c064f1d4f --- /dev/null +++ b/.github/contributors/Jette16.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Henriette Behr | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 23.09.2021 | +| GitHub username | Jette16 | +| Website (optional) | | diff --git a/.github/contributors/KennethEnevoldsen.md b/.github/contributors/KennethEnevoldsen.md new file mode 100644 index 000000000..0bbb28d61 --- /dev/null +++ b/.github/contributors/KennethEnevoldsen.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------- | +| Name | Kenneth Enevoldsen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-07-13 | +| GitHub username | KennethEnevoldsen | +| Website (optional) | www.kennethenevoldsen.com | diff --git a/.github/contributors/SamEdwardes.md b/.github/contributors/SamEdwardes.md new file mode 100644 index 000000000..4e6453ac7 --- /dev/null +++ b/.github/contributors/SamEdwardes.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sam Edwardes | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-04-02 | +| GitHub username | SamEdwardes | +| Website (optional) | samedwardes.com | diff --git a/.github/contributors/ZeeD.md b/.github/contributors/ZeeD.md new file mode 100644 index 000000000..460f91e19 --- /dev/null +++ b/.github/contributors/ZeeD.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Vito De Tullio | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-06-01 | +| GitHub username | ZeeD | +| Website (optional) | | diff --git a/.github/contributors/armsp.md b/.github/contributors/armsp.md index 63d1367e4..45607d69c 100644 --- a/.github/contributors/armsp.md +++ b/.github/contributors/armsp.md @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shantam | +| Name | Shantam Raj | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 21/5/2018 | +| Date | 10/4/2021 | | GitHub username | armsp | -| Website (optional) | | +| Website (optional) |https://shantamraj.com| diff --git a/.github/contributors/bbieniek.md b/.github/contributors/bbieniek.md new file mode 100644 index 000000000..4050946aa --- /dev/null +++ b/.github/contributors/bbieniek.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Baltazar Bieniek | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021.08.19 | +| GitHub username | bbieniek | +| Website (optional) | https://baltazar.bieniek.org.pl/ | \ No newline at end of file diff --git a/.github/contributors/bodak.md b/.github/contributors/bodak.md new file mode 100644 index 000000000..f87224f81 --- /dev/null +++ b/.github/contributors/bodak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Kristian Boda | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 18.05.2021 | +| GitHub username | bodak | +| Website (optional) | | diff --git a/.github/contributors/broaddeep.md b/.github/contributors/broaddeep.md new file mode 100644 index 000000000..d6c4b3cf3 --- /dev/null +++ b/.github/contributors/broaddeep.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Dongjun Park | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-03-06 | +| GitHub username | broaddeep | +| Website (optional) | | diff --git a/.github/contributors/bsweileh.md b/.github/contributors/bsweileh.md new file mode 100644 index 000000000..13f78a4b7 --- /dev/null +++ b/.github/contributors/bsweileh.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Belal | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | March 13, 2021 | +| GitHub username | bsweileh | +| Website (optional) | | diff --git a/.github/contributors/connorbrinton.md b/.github/contributors/connorbrinton.md new file mode 100644 index 000000000..25d03b494 --- /dev/null +++ b/.github/contributors/connorbrinton.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Connor Brinton | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | July 20th, 2021 | +| GitHub username | connorbrinton | +| Website (optional) | | diff --git a/.github/contributors/dardoria.md b/.github/contributors/dardoria.md new file mode 100644 index 000000000..0c7202fca --- /dev/null +++ b/.github/contributors/dardoria.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Boian Tzonev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 18.02.2021 | +| GitHub username | dardoria | +| Website (optional) | | diff --git a/.github/contributors/dhruvrnaik.md b/.github/contributors/dhruvrnaik.md new file mode 100644 index 000000000..9639b6cba --- /dev/null +++ b/.github/contributors/dhruvrnaik.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Dhruv Naik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 26-01-2021 | +| GitHub username | dhruvrnaik | +| Website (optional) | | diff --git a/.github/contributors/ezorita.md b/.github/contributors/ezorita.md new file mode 100644 index 000000000..e5f3f5283 --- /dev/null +++ b/.github/contributors/ezorita.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Eduard Zorita | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 06/17/2021 | +| GitHub username | ezorita | +| Website (optional) | | diff --git a/.github/contributors/gtoffoli.md b/.github/contributors/gtoffoli.md new file mode 100644 index 000000000..5d5d712a2 --- /dev/null +++ b/.github/contributors/gtoffoli.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Giovanni Toffoli | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-05-12 | +| GitHub username | gtoffoli | +| Website (optional) | | diff --git a/.github/contributors/hlasse.md b/.github/contributors/hlasse.md new file mode 100644 index 000000000..b64b3c6a6 --- /dev/null +++ b/.github/contributors/hlasse.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------- | +| Name | Lasse Hansen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-08-11 | +| GitHub username | HLasse | +| Website (optional) | www.lassehansen.me | diff --git a/.github/contributors/jankrepl.md b/.github/contributors/jankrepl.md new file mode 100644 index 000000000..eda5a29b8 --- /dev/null +++ b/.github/contributors/jankrepl.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jan Krepl | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-03-09 | +| GitHub username | jankrepl | +| Website (optional) | | diff --git a/.github/contributors/jganseman.md b/.github/contributors/jganseman.md new file mode 100644 index 000000000..dc25bee1c --- /dev/null +++ b/.github/contributors/jganseman.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Joachim Ganseman | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 26/01/2021 | +| GitHub username | jganseman | +| Website (optional) | www.ganseman.be | diff --git a/.github/contributors/jklaise.md b/.github/contributors/jklaise.md new file mode 100644 index 000000000..66d77ee48 --- /dev/null +++ b/.github/contributors/jklaise.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Janis Klaise | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date |26/04/2021 | +| GitHub username |jklaise | +| Website (optional) |janisklaise.com | diff --git a/.github/contributors/jmyerston.md b/.github/contributors/jmyerston.md new file mode 100644 index 000000000..be5db5453 --- /dev/null +++ b/.github/contributors/jmyerston.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +| ----------------------------- | ----------------------------------- | +| Name | Jacobo Myerston | +| Company name (if applicable) | University of California, San Diego | +| Title or role (if applicable) | Academic | +| Date | 07/05/2021 | +| GitHub username | jmyerston | +| Website (optional) | diogenet.ucsd.edu | diff --git a/.github/contributors/julien-talkair.md b/.github/contributors/julien-talkair.md new file mode 100644 index 000000000..f8a1933b2 --- /dev/null +++ b/.github/contributors/julien-talkair.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Julien Rossi | +| Company name (if applicable) | TalkAir BV | +| Title or role (if applicable) | CTO, Partner | +| Date | June 28 2021 | +| GitHub username | julien-talkair | +| Website (optional) | | diff --git a/.github/contributors/juliensalinas.md b/.github/contributors/juliensalinas.md new file mode 100644 index 000000000..0062426ba --- /dev/null +++ b/.github/contributors/juliensalinas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +| ----------------------------- | ------------------- | +| Name | Julien Salinas | +| Company name (if applicable) | NLP Cloud | +| Title or role (if applicable) | Founder and CTO | +| Date | May 14th 2021 | +| GitHub username | juliensalinas | +| Website (optional) | https://nlpcloud.io | diff --git a/.github/contributors/jumasheff.md b/.github/contributors/jumasheff.md new file mode 100644 index 000000000..1ce6d2341 --- /dev/null +++ b/.github/contributors/jumasheff.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Murat Jumashev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 25.01.2021 | +| GitHub username | jumasheff | +| Website (optional) | | diff --git a/.github/contributors/mariosasko.md b/.github/contributors/mariosasko.md new file mode 100644 index 000000000..1f5acc934 --- /dev/null +++ b/.github/contributors/mariosasko.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mario Šaško | +| Company name (if applicable) | TakeLab FER | +| Title or role (if applicable) | R&D Intern | +| Date | 2021-07-12 | +| GitHub username | mariosasko | +| Website (optional) | | diff --git a/.github/contributors/meghanabhange.md b/.github/contributors/meghanabhange.md new file mode 100644 index 000000000..2aaa57d10 --- /dev/null +++ b/.github/contributors/meghanabhange.md @@ -0,0 +1,107 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Meghana Bhange | +| Company name (if applicable) | Verloop.io | +| Title or role (if applicable) | ML Engineer | +| Date | 2020-04-21 | +| GitHub username | meghanbhange | +| Website (optional) | https://meghana.blog | + diff --git a/.github/contributors/narayanacharya6.md b/.github/contributors/narayanacharya6.md new file mode 100644 index 000000000..e4bf7703f --- /dev/null +++ b/.github/contributors/narayanacharya6.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Narayan Acharya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 29 APR 2021 | +| GitHub username | narayanacharya6 | +| Website (optional) | narayanacharya.com | \ No newline at end of file diff --git a/.github/contributors/nsorros.md b/.github/contributors/nsorros.md new file mode 100644 index 000000000..a449c52e1 --- /dev/null +++ b/.github/contributors/nsorros.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Nick Sorros | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2/8/2021 | +| GitHub username | nsorros | +| Website (optional) | | diff --git a/.github/contributors/peter-exos.md b/.github/contributors/peter-exos.md new file mode 100644 index 000000000..e0ef1346e --- /dev/null +++ b/.github/contributors/peter-exos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Peter Baumann | +| Company name (if applicable) | Exos Financial | +| Title or role (if applicable) | data scientist | +| Date | Feb 1st, 2021 | +| GitHub username | peter-exos | +| Website (optional) | | diff --git a/.github/contributors/philipvollet.md b/.github/contributors/philipvollet.md new file mode 100644 index 000000000..0bf58a701 --- /dev/null +++ b/.github/contributors/philipvollet.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Philip Vollet | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.09.2021 | +| GitHub username | philipvollet | +| Website (optional) | | diff --git a/.github/contributors/plison.md b/.github/contributors/plison.md new file mode 100644 index 000000000..e98b096b4 --- /dev/null +++ b/.github/contributors/plison.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Pierre Lison | +| Company name (if applicable) | Norsk Regnesentral | +| Title or role (if applicable) | Senior Researcher | +| Date | 22.04.2021 | +| GitHub username | plison | +| Website (optional) | www.nr.no/~plison | diff --git a/.github/contributors/reneoctavio.md b/.github/contributors/reneoctavio.md new file mode 100644 index 000000000..c0a4d1a76 --- /dev/null +++ b/.github/contributors/reneoctavio.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, + object code, patch, tool, sample, graphic, specification, manual, + documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and + registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment + to any third party, you hereby grant to us a perpetual, irrevocable, + non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your + contribution. The rights that you grant to us under these terms are effective + on the date you first submitted a contribution to us, even if your submission + took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + - Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + - to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + - each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable + U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT + mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +| ----------------------------- | -------------------- | +| Name | Rene Octavio Q. 
Dias | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-02-03 | +| GitHub username | reneoctavio | +| Website (optional) | | diff --git a/.github/contributors/sevdimali.md b/.github/contributors/sevdimali.md new file mode 100644 index 000000000..6b96abdf8 --- /dev/null +++ b/.github/contributors/sevdimali.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sevdimali | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 10/4/2021 | +| GitHub username | sevdimali | +| Website (optional) | https://sevdimali.me | diff --git a/.github/contributors/shigapov.md b/.github/contributors/shigapov.md new file mode 100644 index 000000000..3c24c7982 --- /dev/null +++ b/.github/contributors/shigapov.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Renat Shigapov | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-09-09 | +| GitHub username | shigapov | +| Website (optional) | | diff --git a/.github/contributors/swfarnsworth.md b/.github/contributors/swfarnsworth.md new file mode 100644 index 000000000..c289e6658 --- /dev/null +++ b/.github/contributors/swfarnsworth.md @@ -0,0 +1,88 @@ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Steele Farnsworth | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13 August, 2021 | +| GitHub username | swfarnsworth | +| Website (optional) | | + diff --git a/.github/contributors/thomashacker.md b/.github/contributors/thomashacker.md new file mode 100644 index 000000000..d88727dc8 --- /dev/null +++ b/.github/contributors/thomashacker.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Edward Schmuhl | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 09.07.2021 | +| GitHub username | thomashacker | +| Website (optional) | | diff --git a/.github/contributors/tupui.md b/.github/contributors/tupui.md new file mode 100644 index 000000000..5f53a72f8 --- /dev/null +++ b/.github/contributors/tupui.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Pamphile Roy | +| Company name (if applicable) | N/A | +| Title or role (if applicable) | N/A | +| Date | January 29th, 2021 | +| GitHub username | tupui | +| Website (optional) | N/A | diff --git a/.github/contributors/xadrianzetx.md b/.github/contributors/xadrianzetx.md new file mode 100644 index 000000000..65603e9bc --- /dev/null +++ b/.github/contributors/xadrianzetx.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Adrian Zuber | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date |20-06-2021 | +| GitHub username |xadrianzetx | +| Website (optional) | | \ No newline at end of file diff --git a/.github/contributors/yohasebe.md b/.github/contributors/yohasebe.md new file mode 100644 index 000000000..c6f6167a3 --- /dev/null +++ b/.github/contributors/yohasebe.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Yoichiro Hasebe | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | July 4th, 2021 | +| GitHub username | yohasebe | +| Website (optional) | https://yohasebe.com | diff --git a/.github/lock.yml b/.github/lock.yml deleted file mode 100644 index 593e88397..000000000 --- a/.github/lock.yml +++ /dev/null @@ -1,19 +0,0 @@ -# Configuration for lock-threads - https://github.com/dessant/lock-threads - -# Number of days of inactivity before a closed issue or pull request is locked -daysUntilLock: 30 - -# Issues and pull requests with these labels will not be locked. Set to `[]` to disable -exemptLabels: [] - -# Label to add before locking, such as `outdated`. Set to `false` to disable -lockLabel: false - -# Comment to post before locking. Set to `false` to disable -lockComment: > - This thread has been automatically locked since there has not been - any recent activity after it was closed. Please open a new issue for - related bugs. - -# Limit to only `issues` or `pulls` -only: issues diff --git a/.github/validate_universe_json.py b/.github/validate_universe_json.py new file mode 100644 index 000000000..b96b7b347 --- /dev/null +++ b/.github/validate_universe_json.py @@ -0,0 +1,19 @@ +import json +import re +import sys +from pathlib import Path + + +def validate_json(document): + universe_file = Path(document) + with universe_file.open() as f: + universe_data = json.load(f) + for entry in universe_data["resources"]: + if "github" in entry: + assert not re.match( + r"^(http:)|^(https:)", entry["github"] + ), "Github field should be user/repo, not a url" + + +if __name__ == "__main__": + validate_json(str(sys.argv[1])) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml new file mode 100644 index 000000000..8d0282650 --- /dev/null +++ b/.github/workflows/autoblack.yml @@ -0,0 +1,44 @@ +# GitHub Action that uses Black to reformat all Python code and submits a PR +# in regular intervals. 
Inspired by: https://github.com/cclauss/autoblack + +name: autoblack +on: + workflow_dispatch: # allow manual trigger + schedule: + - cron: '0 8 * * 5' # every Friday at 8am UTC + +jobs: + autoblack: + if: github.repository_owner == 'explosion' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.head_ref }} + - uses: actions/setup-python@v2 + - run: pip install black + - name: Auto-format code if needed + run: black spacy + # We can't run black --check here because that returns a non-zero exit + # code and makes GitHub think the action failed + - name: Check for modified files + id: git-check + run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) + - name: Create Pull Request + if: steps.git-check.outputs.modified == 'true' + uses: peter-evans/create-pull-request@v3 + with: + title: Auto-format code with black + labels: meta + commit-message: Auto-format code with black + committer: GitHub + author: explosion-bot + body: _This PR is auto-generated._ + branch: autoblack + delete-branch: true + draft: false + - name: Check outputs + if: steps.git-check.outputs.modified == 'true' + run: | + echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" + echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml new file mode 100644 index 000000000..e29ce8fe8 --- /dev/null +++ b/.github/workflows/explosionbot.yml @@ -0,0 +1,27 @@ +name: Explosion Bot + +on: + issue_comment: + types: + - created + - edited + +jobs: + explosion-bot: + runs-on: ubuntu-18.04 + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: echo "$GITHUB_CONTEXT" + - uses: actions/checkout@v1 + - uses: actions/setup-python@v1 + - name: Install and run explosion-bot + run: | + pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot + python -m explosionbot + env: + INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} + INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} + ENABLED_COMMANDS: "test_gpu,test_slow" + ALLOWED_TEAMS: "spaCy" diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml new file mode 100644 index 000000000..c9833cdba --- /dev/null +++ b/.github/workflows/lock.yml @@ -0,0 +1,25 @@ +name: 'Lock Threads' + +on: + schedule: + - cron: '0 0 * * *' # check every day + workflow_dispatch: + +permissions: + issues: write + +concurrency: + group: lock + +jobs: + action: + runs-on: ubuntu-latest + steps: + - uses: dessant/lock-threads@v3 + with: + process-only: 'issues' + issue-inactive-days: '30' + issue-comment: > + This thread has been automatically locked since there + has not been any recent activity after it was closed. + Please open a new issue for related bugs.
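For reference, the new `.github/validate_universe_json.py` script added above enforces a single rule on universe entries: a resource's `github` field must be in `user/repo` form rather than a full URL. The snippet below is a minimal, self-contained sketch of that rule on two invented entries — the entry data and IDs are placeholders, not real universe resources; the actual script loads whichever universe JSON file is passed as its first command-line argument (e.g. `python .github/validate_universe_json.py <path-to-universe.json>`).

```python
import re

# Hypothetical universe-style entries, invented purely for illustration.
resources = [
    {"id": "good-plugin", "github": "example-user/good-plugin"},          # user/repo form
    {"id": "bad-plugin", "github": "https://github.com/example-user/x"},  # full URL form
]

for entry in resources:
    if "github" in entry:
        # Same rule the script asserts: the "github" field should be
        # "user/repo", not an http(s) URL.
        is_valid = not re.match(r"^(http:)|^(https:)", entry["github"])
        print(entry["id"], "OK" if is_valid else "invalid: use user/repo, not a URL")
```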
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..a7a12fd24 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +repos: +- repo: https://github.com/ambv/black + rev: 21.6b0 + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + args: + - "--config=setup.cfg" diff --git a/CITATION b/CITATION deleted file mode 100644 index bdaa90677..000000000 --- a/CITATION +++ /dev/null @@ -1,8 +0,0 @@ -@software{spacy, - author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane}, - title = {{spaCy: Industrial-strength Natural Language Processing in Python}}, - year = 2020, - publisher = {Zenodo}, - doi = {10.5281/zenodo.1212303}, - url = {https://doi.org/10.5281/zenodo.1212303} -} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..88c05b2a3 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,16 @@ +cff-version: 1.2.0 +preferred-citation: + type: article + message: "If you use spaCy, please cite it as below." + authors: + - family-names: "Honnibal" + given-names: "Matthew" + - family-names: "Montani" + given-names: "Ines" + - family-names: "Van Landeghem" + given-names: "Sofie" + - family-names: "Boyd" + given-names: "Adriane" + title: "spaCy: Industrial-strength Natural Language Processing in Python" + doi: "10.5281/zenodo.1212303" + year: 2020 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 45ce9af11..a4d321aa3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,9 +2,7 @@ # Contribute to spaCy -Thanks for your interest in contributing to spaCy 🎉 The project is maintained -by [@honnibal](https://github.com/honnibal) and [@ines](https://github.com/ines), -and we'll do our best to help you get started. This page will give you a quick +Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick overview of how things are organized and most importantly, how to get involved. ## Table of contents @@ -50,8 +48,7 @@ issue body. A few more tips: parts and don't just dump your entire script. This will make it easier for us to reproduce the error. -- **Getting info about your spaCy installation and environment:** If you're - using spaCy v1.7+, you can use the command line interface to print details and +- **Getting info about your spaCy installation and environment:** You can use the command line interface to print details and even format them as Markdown to copy-paste into GitHub issues: `python -m spacy info --markdown`. @@ -60,7 +57,7 @@ issue body. A few more tips: model is incompatible with your spaCy installation. In spaCy v2.0+, you can check this on the command line by running `python -m spacy validate`. -- **Sharing a model's output, like dependencies and entities:** spaCy v2.0+ +- **Sharing a model's output, like dependencies and entities:** spaCy comes with [built-in visualizers](https://spacy.io/usage/visualizers) that you can run from within your script or a Jupyter notebook. For some issues, it's helpful to **include a screenshot** of the visualization. You can simply drag and @@ -99,7 +96,7 @@ questions: changes to spaCy's built-in methods. In contrast, a library of word alignment functions could easily live as a separate package that depended on spaCy — there's little difference between writing `import word_aligner` and - `import spacy.word_aligner`. spaCy v2.0+ makes it easy to implement + `import spacy.word_aligner`. 
spaCy makes it easy to implement [custom pipeline components](https://spacy.io/usage/processing-pipelines#custom-components), and add your own attributes, properties and methods to the `Doc`, `Token` and `Span`. If you're looking to implement a new spaCy feature, starting with a @@ -109,8 +106,8 @@ questions: library later. - **Would the feature be easier to implement if it relied on "heavy" dependencies spaCy doesn't currently require?** - Python has a very rich ecosystem. Libraries like scikit-learn, SciPy, Gensim or - TensorFlow/Keras do lots of useful things — but we don't want to have them as + Python has a very rich ecosystem. Libraries like PyTorch, TensorFlow, scikit-learn, SciPy or Gensim + do lots of useful things — but we don't want to have them as default dependencies. If the feature requires functionality in one of these libraries, it's probably better to break it out into a different package. @@ -137,35 +134,12 @@ files, a compiler, [pip](https://pip.pypa.io/en/latest/installing/), [virtualenv](https://virtualenv.pypa.io/en/stable/) and [git](https://git-scm.com) installed. The compiler is usually the trickiest part. -``` -python -m pip install -U pip -git clone https://github.com/explosion/spaCy -cd spaCy - -python -m venv .env -source .env/bin/activate -export PYTHONPATH=`pwd` -pip install -r requirements.txt -python setup.py build_ext --inplace -``` - -If you've made changes to `.pyx` files, you need to recompile spaCy before you +If you've made changes to `.pyx` files, you need to **recompile spaCy** before you can test your changes by re-running `python setup.py build_ext --inplace`. Changes to `.py` files will be effective immediately. 📖 **For more details and instructions, see the documentation on [compiling spaCy from source](https://spacy.io/usage/#source) and the [quickstart widget](https://spacy.io/usage/#section-quickstart) to get the right commands for your platform and Python version.** -### Contributor agreement - -If you've made a contribution to spaCy, you should fill in the -[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that -your contribution can be used across the project. If you agree to be bound by -the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md) -and include it with your pull request, or submit it separately to -[`.github/contributors/`](/.github/contributors). The name of the file should be -your GitHub username, with the extension `.md`. For example, the user -example_user would create the file `.github/contributors/example_user.md`. - ### Fixing bugs When fixing a bug, first create an @@ -184,7 +158,7 @@ sure your test passes and reference the issue in your commit message. ## Code conventions Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/). -As of `v2.1.0`, spaCy uses [`black`](https://github.com/ambv/black) for code +spaCy uses [`black`](https://github.com/ambv/black) for code formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed. @@ -192,6 +166,14 @@ tools installed. **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** +### Pre-Commit Hooks + +After cloning the repo, after installing the packages from `requirements.txt`, enter the repo folder and run `pre-commit install`. 
+Each time a `git commit` is initiated, `black` and `flake8` will run automatically on the modified files only. + +In case of error, or when `black` modified a file, the modified file needs to be `git add` once again and a new +`git commit` has to be issued. + ### Code formatting [`black`](https://github.com/ambv/black) is an opinionated Python code @@ -216,8 +198,7 @@ list of available editor integrations. #### Disabling formatting There are a few cases where auto-formatting doesn't improve readability – for -example, in some of the language data files like the `tag_map.py`, or in -the tests that construct `Doc` objects from lists of words and other labels. +example, in some of the language data files or in the tests that construct `Doc` objects from lists of words and other labels. Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting for that particular code. Here's an example: @@ -281,6 +262,9 @@ except: # noqa: E722 ### Python conventions All Python code must be written **compatible with Python 3.6+**. + +#### I/O and handling paths + Code that interacts with the file-system should accept objects that follow the `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`. If the function is user-facing and takes a path as an argument, it should check @@ -290,14 +274,18 @@ accept **file-like objects**, as it makes the library IO-agnostic. Working on buffers makes the code more general, easier to test, and compatible with Python 3's asynchronous IO. +#### Composition vs. inheritance + Although spaCy uses a lot of classes, **inheritance is viewed with some suspicion** — it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. +#### Naming conventions + We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. A general policy is that instances of the -class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, -`Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are +class `Doc` should by default be called `doc`, `Token` → `token`, `Lexeme` → `lex`, +`Vocab` → `vocab` and `Language` → `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` — you should usually call this `text`. Two general code style preferences further help with naming. First, **lean away from introducing temporary variables**, as these @@ -414,14 +402,7 @@ all test files and test functions need to be prefixed with `test_`. When adding tests, make sure to use descriptive names, keep the code short and concise and only test for one behavior at a time. Try to `parametrize` test cases wherever possible, use our pre-defined fixtures for spaCy components and -avoid unnecessary imports. - -Extensive tests that take a long time should be marked with `@pytest.mark.slow`. -Tests that require the model to be loaded should be marked with -`@pytest.mark.models`. Loading the models is expensive and not necessary if -you're not actually testing the model performance. If all you need is a `Doc` -object with annotations like heads, POS tags or the dependency parse, you can -use the `Doc` constructor to construct it manually. +avoid unnecessary imports. Extensive tests that take a long time should be marked with `@pytest.mark.slow`. 
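As a concrete illustration of the test conventions described above — a descriptive `test_`-prefixed name, parametrized cases and the `slow` marker reserved for long-running tests — a minimal sketch could look like the following. The blank `English` pipeline and the example texts are placeholders rather than an actual test from the suite; in practice you would usually reach for the pre-defined fixtures in the tests' `conftest.py` instead of constructing a pipeline inline.

```python
import pytest
from spacy.lang.en import English


@pytest.mark.parametrize("text,expected_len", [("Hello world", 2), ("Hello, world!", 4)])
def test_en_tokenizer_token_count(text, expected_len):
    # A blank English pipeline only tokenizes, which is all this check needs.
    nlp = English()
    doc = nlp(text)
    assert len(doc) == expected_len


@pytest.mark.slow
def test_expensive_behaviour():
    # Anything that loads large artifacts or takes a long time gets the slow
    # marker so it can be excluded from quick test runs.
    ...
```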
📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).** @@ -438,7 +419,7 @@ simply click on the "Suggest edits" button at the bottom of a page. ## Publishing spaCy extensions and plugins We're very excited about all the new possibilities for **community extensions** -and plugins in spaCy v2.0, and we can't wait to see what you build with it! +and plugins in spaCy v3.0, and we can't wait to see what you build with it! - An extension or plugin should add substantial functionality, be **well-documented** and **open-source**. It should be available for users to download diff --git a/LICENSE b/LICENSE index 87b814ce4..86f501b92 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in index b4887cdb8..c1524d460 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,11 @@ recursive-include include *.h -recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja +recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml include LICENSE include README.md include pyproject.toml +include spacy/py.typed recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz recursive-include spacy/cli *.json *.yml recursive-include licenses * +recursive-exclude spacy *.cpp diff --git a/Makefile b/Makefile index ba2bdf786..4de628663 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash ifndef SPACY_EXTRAS -override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2 +override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2 endif ifndef PYVER diff --git a/README.md b/README.md index e84d799ae..61d5449a4 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,21 @@ # spaCy: Industrial-strength NLP -spaCy is a library for advanced Natural Language Processing in Python and +spaCy is a library for **advanced Natural Language Processing** in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products. spaCy comes with -[pretrained pipelines](https://spacy.io/models) and vectors, and -currently supports tokenization for **60+ languages**. It features -state-of-the-art speed, convolutional **neural network models** for tagging, -parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. -spaCy is commercial open-source software, released under the MIT license. +[pretrained pipelines](https://spacy.io/models) and +currently supports tokenization and training for **60+ languages**. It features +state-of-the-art speed and **neural network models** for tagging, +parsing, **named entity recognition**, **text classification** and more, +multi-task learning with pretrained **transformers** like BERT, as well as a +production-ready [**training system**](https://spacy.io/usage/training) and easy +model packaging, deployment and workflow management. spaCy is commercial +open-source software, released under the MIT license. 
-💫 **Version 3.0 (nightly) out now!** +💫 **Version 3.0 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -21,25 +24,27 @@ spaCy is commercial open-source software, released under the MIT license. [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/) [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy) [![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases) -[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/) -[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy) -[![Model downloads](https://img.shields.io/github/downloads/explosion/spacy-models/total?style=flat-square&label=model+downloads)](https://github.com/explosion/spacy-models/releases) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) +
+[![PyPi downloads](https://static.pepy.tech/personalized-badge/spacy?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/spacy/) +[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?label=conda%20downloads)](https://anaconda.org/conda-forge/spacy) [![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io) ## 📖 Documentation -| Documentation | | -| ------------------- | -------------------------------------------------------------- | -| [spaCy 101] | New to spaCy? Here's everything you need to know! | -| [Usage Guides] | How to use spaCy and its features. | -| [New in v3.0] | New features, backwards incompatibilities and migration guide. | -| [Project Templates] | End-to-end workflows you can clone, modify and run. | -| [API Reference] | The detailed reference for spaCy's API. | -| [Models] | Download statistical language models for spaCy. | -| [Universe] | Libraries, extensions, demos, books and courses. | -| [Changelog] | Changes and version history. | -| [Contribute] | How to contribute to the spaCy project and code base. | +| Documentation | | +| -------------------------- | -------------------------------------------------------------- | +| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | +| 📚 **[Usage Guides]** | How to use spaCy and its features. | +| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | +| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | +| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | +| 📦 **[Models]** | Download trained pipelines for spaCy. | +| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | +| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | +| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | +| 🛠 **[Changelog]** | Changes and version history. | +| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 [new in v3.0]: https://spacy.io/usage/v3 @@ -47,18 +52,20 @@ spaCy is commercial open-source software, released under the MIT license. [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models [universe]: https://spacy.io/universe +[videos]: https://www.youtube.com/c/ExplosionAI +[online course]: https://course.spacy.io [project templates]: https://github.com/explosion/projects [changelog]: https://spacy.io/usage#changelog [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md ## 💬 Where to ask questions -The spaCy project is maintained by [@honnibal](https://github.com/honnibal), -[@ines](https://github.com/ines), [@svlandeg](https://github.com/svlandeg) and -[@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't -be able to provide individual support via email. We also believe that help is -much more valuable if it's shared publicly, so that more people can benefit from -it. +The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**, +**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**, +**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**. +Please understand that we won't be able to provide individual support via email. 
+We also believe that help is much more valuable if it's shared publicly, so that +more people can benefit from it. | Type | Platforms | | ------------------------------- | --------------------------------------- | @@ -74,9 +81,9 @@ it. ## Features - Support for **60+ languages** -- **Trained pipelines** +- **Trained pipelines** for different languages and tasks - Multi-task learning with pretrained **transformers** like BERT -- Pretrained **word vectors** +- Support for pretrained **word vectors** and embeddings - State-of-the-art speed - Production-ready **training system** - Linguistically-motivated **tokenization** @@ -90,7 +97,7 @@ it. 📖 **For more details, see the [facts, figures and benchmarks](https://spacy.io/usage/facts-figures).** -## Install spaCy +## ⏳ Install spaCy For detailed installation instructions, see the [documentation](https://spacy.io/usage). @@ -105,8 +112,8 @@ For detailed installation instructions, see the ### pip -Using pip, spaCy releases are available as source packages and binary wheels (as -of `v2.0.13`). Before you install spaCy and its dependencies, make sure that +Using pip, spaCy releases are available as source packages and binary wheels. +Before you install spaCy and its dependencies, make sure that your `pip`, `setuptools` and `wheel` are up to date. ```bash @@ -114,13 +121,12 @@ pip install -U pip setuptools wheel pip install spacy ``` -To install additional data tables for lemmatization and normalization in -**spaCy v2.2+** you can run `pip install spacy[lookups]` or install +To install additional data tables for lemmatization and normalization you can +run `pip install spacy[lookups]` or install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) separately. The lookups package is needed to create blank models with -lemmatization data for v2.2+ plus normalization data for v2.3+, and to -lemmatize in languages that don't yet come with pretrained models and aren't -powered by third-party libraries. +lemmatization data, and to lemmatize in languages that don't yet come with +pretrained models and aren't powered by third-party libraries. When using pip it is generally recommended to install packages in a virtual environment to avoid modifying system state: @@ -134,17 +140,14 @@ pip install spacy ### conda -Thanks to our great community, we've finally re-added conda support. You can now -install spaCy via `conda-forge`: +You can also install spaCy from `conda` via the `conda-forge` channel. For the +feedstock including the build recipe and configuration, check out +[this repository](https://github.com/conda-forge/spacy-feedstock). ```bash conda install -c conda-forge spacy ``` -For the feedstock including the build recipe and configuration, check out -[this repository](https://github.com/conda-forge/spacy-feedstock). Improvements -and pull requests to the recipe and setup are always appreciated. - ### Updating spaCy Some updates to spaCy may require downloading new statistical models. If you're @@ -164,34 +167,37 @@ with the new version. 📖 **For details on upgrading from spaCy 2.x to spaCy 3.x, see the [migration guide](https://spacy.io/usage/v3#migrating).** -## Download models +## 📦 Download model packages Trained pipelines for spaCy can be installed as **Python packages**. This means that they're a component of your application, just like any other module. -Models can be installed using spaCy's `download` command, or manually by -pointing pip to a path or URL. 
+Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download) +command, or manually by pointing pip to a path or URL. -| Documentation | | -| ---------------------- | ---------------------------------------------------------------- | -| [Available Pipelines] | Detailed pipeline descriptions, accuracy figures and benchmarks. | -| [Models Documentation] | Detailed usage instructions. | +| Documentation | | +| -------------------------- | ---------------------------------------------------------------- | +| **[Available Pipelines]** | Detailed pipeline descriptions, accuracy figures and benchmarks. | +| **[Models Documentation]** | Detailed usage and installation instructions. | +| **[Training]** | How to train your own pipelines on your data. | [available pipelines]: https://spacy.io/models -[models documentation]: https://spacy.io/docs/usage/models +[models documentation]: https://spacy.io/usage/models +[training]: https://spacy.io/usage/training ```bash # Download best-matching version of specific model for your spaCy installation python -m spacy download en_core_web_sm -# pip install .tar.gz archive from path or URL -pip install /Users/you/en_core_web_sm-2.2.0.tar.gz -pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz +# pip install .tar.gz archive or .whl from path or URL +pip install /Users/you/en_core_web_sm-3.0.0.tar.gz +pip install /Users/you/en_core_web_sm-3.0.0-py3-none-any.whl +pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz ``` ### Loading and using models -To load a model, use `spacy.load()` with the model name or a -path to the model data directory. +To load a model, use [`spacy.load()`](https://spacy.io/api/top-level#spacy.load) +with the model name or a path to the model data directory. ```python import spacy @@ -213,7 +219,7 @@ doc = nlp("This is a sentence.") 📖 **For more info and examples, check out the [models documentation](https://spacy.io/docs/usage/models).** -## Compile from source +## ⚒ Compile from source The other way to install spaCy is to clone its [GitHub repository](https://github.com/explosion/spaCy) and build it from @@ -223,8 +229,19 @@ Python distribution including header files, a compiler, [pip](https://pip.pypa.io/en/latest/installing/), [virtualenv](https://virtualenv.pypa.io/en/latest/) and [git](https://git-scm.com) installed. The compiler part is the trickiest. How to -do that depends on your system. See notes on Ubuntu, OS X and Windows for -details. +do that depends on your system. + +| Platform | | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Ubuntu** | Install system-level dependencies via `apt-get`: `sudo apt-get install build-essential python-dev git` . | +| **Mac** | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled. | +| **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. 
| + +For more details +and instructions, see the documentation on +[compiling spaCy from source](https://spacy.io/usage#source) and the +[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right +commands for your platform and Python version. ```bash git clone https://github.com/explosion/spaCy @@ -236,64 +253,28 @@ source .env/bin/activate # make sure you are using the latest pip python -m pip install -U pip setuptools wheel -pip install . +pip install -r requirements.txt +pip install --no-build-isolation --editable . ``` To install with extras: ```bash -pip install .[lookups,cuda102] +pip install --no-build-isolation --editable .[lookups,cuda102] ``` -To install all dependencies required for development: - -```bash -pip install -r requirements.txt -``` - -Compared to regular install via pip, [requirements.txt](requirements.txt) -additionally installs developer dependencies such as Cython. For more details -and instructions, see the documentation on -[compiling spaCy from source](https://spacy.io/usage#source) and the -[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right -commands for your platform and Python version. - -### Ubuntu - -Install system-level dependencies via `apt-get`: - -```bash -sudo apt-get install build-essential python-dev git -``` - -### macOS / OS X - -Install a recent version of [XCode](https://developer.apple.com/xcode/), -including the so-called "Command Line Tools". macOS and OS X ship with Python -and git preinstalled. - -### Windows - -Install a version of the -[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) -or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that -matches the version that was used to compile your Python interpreter. - -## Run tests +## 🚦 Run tests spaCy comes with an [extensive test suite](spacy/tests). In order to run the tests, you'll usually want to clone the repository and build spaCy from source. This will also install the required development dependencies and test utilities -defined in the `requirements.txt`. +defined in the [`requirements.txt`](requirements.txt). Alternatively, you can run `pytest` on the tests from within the installed `spacy` package. Don't forget to also install the test utilities via spaCy's -`requirements.txt`: +[`requirements.txt`](requirements.txt): ```bash pip install -r requirements.txt python -m pytest --pyargs spacy ``` - -See [the documentation](https://spacy.io/usage#tests) for more details and -examples. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5455da360..4291b6e0a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -5,28 +5,31 @@ trigger: - "*" exclude: - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" paths: exclude: - "website/*" - "*.md" pr: - paths: + paths: exclude: - - "website/*" - "*.md" + - "website/docs/*" + - "website/src/*" jobs: # Perform basic checks for most important errors (syntax etc.) Uses the config # defined in .flake8 and overwrites the selected codes. 
- job: "Validate" pool: - vmImage: "ubuntu-16.04" + vmImage: "ubuntu-18.04" steps: - task: UsePythonVersion@0 inputs: versionSpec: "3.7" - script: | - pip install flake8==3.5.0 + pip install flake8==3.9.2 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: "flake8" @@ -36,77 +39,71 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-16.04" + imageName: "ubuntu-18.04" python.version: "3.6" -# Python36Windows: -# imageName: "vs2017-win2016" -# python.version: "3.6" -# Python36Mac: -# imageName: "macos-10.14" -# python.version: "3.6" -# Python37Linux: -# imageName: "ubuntu-16.04" -# python.version: "3.7" + # Python36Windows: + # imageName: "windows-2019" + # python.version: "3.6" + # Python36Mac: + # imageName: "macos-10.14" + # python.version: "3.6" + # Python37Linux: + # imageName: "ubuntu-18.04" + # python.version: "3.7" Python37Windows: - imageName: "vs2017-win2016" + imageName: "windows-2019" python.version: "3.7" -# Python37Mac: -# imageName: "macos-10.14" -# python.version: "3.7" -# Python38Linux: -# imageName: "ubuntu-16.04" -# python.version: "3.8" -# Python38Windows: -# imageName: "vs2017-win2016" -# python.version: "3.8" + # Python37Mac: + # imageName: "macos-10.14" + # python.version: "3.7" + # Python38Linux: + # imageName: "ubuntu-18.04" + # python.version: "3.8" + # Python38Windows: + # imageName: "windows-2019" + # python.version: "3.8" Python38Mac: imageName: "macos-10.14" python.version: "3.8" Python39Linux: - imageName: "ubuntu-16.04" - python.version: "3.9" - Python39Windows: - imageName: "vs2017-win2016" - python.version: "3.9" - Python39Mac: - imageName: "macos-10.14" + imageName: "ubuntu-18.04" python.version: "3.9" + # Python39Windows: + # imageName: "windows-2019" + # python.version: "3.9" + # Python39Mac: + # imageName: "macos-10.14" + # python.version: "3.9" + Python310Linux: + imageName: "ubuntu-20.04" + python.version: "3.10" + Python310Windows: + imageName: "windows-2019" + python.version: "3.10" + Python310Mac: + imageName: "macos-10.15" + python.version: "3.10" maxParallel: 4 pool: vmImage: $(imageName) - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: "$(python.version)" - architecture: "x64" + - template: .github/azure-steps.yml + parameters: + python_version: '$(python.version)' + architecture: 'x64' - - script: | - python -m pip install -U setuptools - pip install -r requirements.txt - displayName: "Install dependencies" - - - script: | - python setup.py build_ext --inplace - python setup.py sdist --formats=gztar - displayName: "Compile and build sdist" - - - task: DeleteFiles@1 - inputs: - contents: "spacy" - displayName: "Delete source directory" - - - script: | - pip freeze > installed.txt - pip uninstall -y -r installed.txt - displayName: "Uninstall all packages" - - - bash: | - SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - pip install dist/$SDIST - displayName: "Install from sdist" - - - script: | - pip install -r requirements.txt - python -m pytest --pyargs spacy - displayName: "Run tests" +# - job: "TestGPU" +# dependsOn: "Validate" +# strategy: +# matrix: +# Python38LinuxX64_GPU: +# python.version: '3.8' +# pool: +# name: "LinuxX64_GPU" +# steps: +# - template: .github/azure-steps.yml +# parameters: +# python_version: '$(python.version)' +# architecture: 'x64' +# gpu: true +# num_build_jobs: 24 diff --git a/build-constraints.txt b/build-constraints.txt index 23e660096..cf5fe3284 100644 --- 
a/build-constraints.txt +++ b/build-constraints.txt @@ -2,4 +2,5 @@ numpy==1.15.0; python_version<='3.7' numpy==1.17.3; python_version=='3.8' numpy==1.19.3; python_version=='3.9' -numpy; python_version>='3.10' +numpy==1.21.3; python_version=='3.10' +numpy; python_version>='3.11' diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..23ff59acd --- /dev/null +++ b/examples/README.md @@ -0,0 +1,130 @@ + + +# spaCy examples + +For spaCy v3 we've converted many of the [v2 example +scripts](https://github.com/explosion/spaCy/tree/v2.3.x/examples/) into +end-to-end [spacy projects](https://spacy.io/usage/projects) workflows. The +workflows include all the steps to go from data to packaged spaCy models. + +## 🪐 Pipeline component demos + +The simplest demos for training a single pipeline component are in the +[`pipelines`](https://github.com/explosion/projects/blob/v3/pipelines) category +including: + +- [`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo): + Train a named entity recognizer +- [`pipelines/textcat_demo`](https://github.com/explosion/projects/blob/v3/pipelines/textcat_demo): + Train a text classifier +- [`pipelines/parser_intent_demo`](https://github.com/explosion/projects/blob/v3/pipelines/parser_intent_demo): + Train a dependency parser for custom semantics + +## 🪐 Tutorials + +The [`tutorials`](https://github.com/explosion/projects/blob/v3/tutorials) +category includes examples that work through specific NLP use cases end-to-end: + +- [`tutorials/textcat_goemotions`](https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions): + Train a text classifier to categorize emotions in Reddit posts +- [`tutorials/nel_emerson`](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson): + Use an entity linker to disambiguate mentions of the same name + +Check out the [projects documentation](https://spacy.io/usage/projects) and +browse through the [available +projects](https://github.com/explosion/projects/)! + +## 🚀 Get started with a demo project + +The +[`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo) +project converts the spaCy v2 +[`train_ner.py`](https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/train_ner.py) +demo script into a spaCy v3 project. + +1. Clone the project: + + ```bash + python -m spacy project clone pipelines/ner_demo + ``` + +2. Install requirements and download any data assets: + + ```bash + cd ner_demo + python -m pip install -r requirements.txt + python -m spacy project assets + ``` + +3. 
Run the default workflow to convert, train and evaluate: + + ```bash + python -m spacy project run all + ``` + + Sample output: + + ```none + ℹ Running workflow 'all' + + ================================== convert ================================== + Running command: /home/user/venv/bin/python scripts/convert.py en assets/train.json corpus/train.spacy + Running command: /home/user/venv/bin/python scripts/convert.py en assets/dev.json corpus/dev.spacy + + =============================== create-config =============================== + Running command: /home/user/venv/bin/python -m spacy init config --lang en --pipeline ner configs/config.cfg --force + ℹ Generated config template specific for your use case + - Language: en + - Pipeline: ner + - Optimize for: efficiency + - Hardware: CPU + - Transformer: None + ✔ Auto-filled config with all values + ✔ Saved config + configs/config.cfg + You can now add your data and train your pipeline: + python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy + + =================================== train =================================== + Running command: /home/user/venv/bin/python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.max_steps 100 --gpu-id -1 + ℹ Using CPU + + =========================== Initializing pipeline =========================== + [2021-03-11 19:34:59,101] [INFO] Set up nlp object from config + [2021-03-11 19:34:59,109] [INFO] Pipeline: ['tok2vec', 'ner'] + [2021-03-11 19:34:59,113] [INFO] Created vocabulary + [2021-03-11 19:34:59,113] [INFO] Finished initializing nlp object + [2021-03-11 19:34:59,265] [INFO] Initialized pipeline components: ['tok2vec', 'ner'] + ✔ Initialized pipeline + + ============================= Training pipeline ============================= + ℹ Pipeline: ['tok2vec', 'ner'] + ℹ Initial learn rate: 0.001 + E # LOSS TOK2VEC LOSS NER ENTS_F ENTS_P ENTS_R SCORE + --- ------ ------------ -------- ------ ------ ------ ------ + 0 0 0.00 7.90 0.00 0.00 0.00 0.00 + 10 10 0.11 71.07 0.00 0.00 0.00 0.00 + 20 20 0.65 22.44 50.00 50.00 50.00 0.50 + 30 30 0.22 6.38 80.00 66.67 100.00 0.80 + 40 40 0.00 0.00 80.00 66.67 100.00 0.80 + 50 50 0.00 0.00 80.00 66.67 100.00 0.80 + 60 60 0.00 0.00 100.00 100.00 100.00 1.00 + 70 70 0.00 0.00 100.00 100.00 100.00 1.00 + 80 80 0.00 0.00 100.00 100.00 100.00 1.00 + 90 90 0.00 0.00 100.00 100.00 100.00 1.00 + 100 100 0.00 0.00 100.00 100.00 100.00 1.00 + ✔ Saved pipeline to output directory + training/model-last + ``` + +4. Package the model: + + ```bash + python -m spacy project run package + ``` + +5. Visualize the model's output with [Streamlit](https://streamlit.io): + + ```bash + python -m spacy project run visualize-model + ``` diff --git a/examples/training/README.md b/examples/training/README.md new file mode 100644 index 000000000..34689ceb6 --- /dev/null +++ b/examples/training/README.md @@ -0,0 +1,5 @@ + + +# spaCy examples + +See [examples/README.md](../README.md) diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md new file mode 100644 index 000000000..7a3f6996f --- /dev/null +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -0,0 +1,546 @@ +# Code Conventions + +For a general overview of code conventions for contributors, see the [section in the contributing guide](https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md#code-conventions). + +1. [Code compatibility](#code-compatibility) +2. 
[Auto-formatting](#auto-formatting) +3. [Linting](#linting) +4. [Documenting code](#documenting-code) +5. [Type hints](#type-hints) +6. [Structuring logic](#structuring-logic) +7. [Naming](#naming) +8. [Error handling](#error-handling) +9. [Writing tests](#writing-tests) + +## Code compatibility + +spaCy supports **Python 3.6** and above, so all code should be written compatible with 3.6. This means that there are certain new syntax features that we won't be able to use until we drop support for older Python versions. Some newer features provide backports that we can conditionally install for older versions, although we only want to do this if it's absolutely necessary. If we need to use conditional imports based on the Python version or other custom compatibility-specific helpers, those should live in `compat.py`. + +## Auto-formatting + +spaCy uses `black` for auto-formatting (which is also available as a pre-commit hook). It's recommended to configure your editor to perform this automatically, either triggered manually or whenever you save a file. We also have a GitHub action that regularly formats the code base and submits a PR if changes are available. Note that auto-formatting is currently only available for `.py` (Python) files, not for `.pyx` (Cython). + +As a rule of thumb, if the auto-formatting produces output that looks messy, it can often indicate that there's a better way to structure the code to make it more concise. + +```diff +- range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")( +- min_size=1, max_size=3 +- ) ++ suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1") ++ range_suggester = suggester_factory(min_size=1, max_size=3) +``` + +In some specific cases, e.g. in the tests, it can make sense to disable auto-formatting for a specific block. You can do this by wrapping the code in `# fmt: off` and `# fmt: on`: + +```diff ++ # fmt: off +text = "I look forward to using Thingamajig. I've been told it will make my life easier..." +deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "", + "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp", + "poss", "nsubj", "ccomp", "punct"] ++ # fmt: on +``` + +## Linting + +[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code style. It scans one or more files and outputs errors and warnings. This feedback can help you stick to general standards and conventions, and can be very useful for spotting potential mistakes and inconsistencies in your code. Code you write should be compatible with our flake8 rules and not cause any warnings. + +```bash +flake8 spacy +``` + +The most common problems surfaced by linting are: + +- **Trailing or missing whitespace.** This is related to formatting and should be fixed automatically by running `black`. +- **Unused imports.** Those should be removed if the imports aren't actually used. If they're required, e.g. to expose them so they can be imported from the given module, you can add a comment and `# noqa: F401` exception (see details below). +- **Unused variables.** This can often indicate bugs, e.g. a variable that's declared and not correctly passed on or returned. To prevent ambiguity here, your code shouldn't contain unused variables. If you're unpacking a list of tuples and end up with variables you don't need, you can call them `_` to indicate that they're unused. +- **Redefinition of function.** This can also indicate bugs, e.g. 
a copy-pasted function that you forgot to rename and that now replaces the original function. +- **Repeated dictionary keys.** This either indicates a bug or unnecessary duplication. +- **Comparison with `True`, `False`, `None`**. This is mostly a stylistic thing: when checking whether a value is `True`, `False` or `None`, you should be using `is` instead of `==`. For example, `if value is None`. + +### Ignoring linter rules for special cases + +To ignore a given line, you can add a comment like `# noqa: F401`, specifying the code of the error or warning we want to ignore. It's also possible to ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. In general, you should always **specify the code(s)** you want to ignore – otherwise, you may end up missing actual problems. + +```python +# The imported class isn't used in this file, but imported here, so it can be +# imported *from* here by another module. +from .submodule import SomeClass # noqa: F401 + +try: + do_something() +except: # noqa: E722 + # This bare except is justified, for some specific reason + do_something_else() +``` + +## Documenting code + +All functions and methods you write should be documented with a docstring inline. The docstring can contain a simple summary, and an overview of the arguments and their (simplified) types. Modern editors will show this information to users when they call the function or method in their code. + +If it's part of the public API and there's a documentation section available, we usually add the link as `DOCS:` at the end. This allows us to keep the docstrings simple and concise, while also providing additional information and examples if necessary. + +```python +def has_pipe(self, name: str) -> bool: + """Check if a component name is present in the pipeline. Equivalent to + `name in nlp.pipe_names`. + + name (str): Name of the component. + RETURNS (bool): Whether a component of the name exists in the pipeline. + + DOCS: https://spacy.io/api/language#has_pipe + """ + ... +``` + +We specifically chose this approach of maintaining the docstrings and API reference separately, instead of auto-generating the API docs from the docstrings like other packages do. We want to be able to provide extensive explanations and examples in the documentation and use our own custom markup for it that would otherwise clog up the docstrings. We also want to be able to update the documentation independently of the code base. It's slightly more work, but it's absolutely worth it in terms of user and developer experience. + +### Inline code comments + +We don't expect you to add inline comments for everything you're doing – this should be obvious from reading the code. If it's not, the first thing to check is whether your code can be improved to make it more explicit. That said, if your code includes complex logic or aspects that may be unintuitive at first glance (or even included a subtle bug that you ended up fixing), you should leave a quick comment that provides more context. + +```diff +token_index = indices[value] ++ # Index describes Token.i of last token but Span indices are inclusive +span = doc[prev_token_index:token_index + 1] +``` + +```diff ++ # To create the components we need to use the final interpolated config ++ # so all values are available (if component configs use variables). ++ # Later we replace the component config with the raw config again. 
+interpolated = filled.interpolate() if not filled.is_interpolated else filled +``` + +Don't be shy about including comments for tricky parts that _you_ found hard to implement or get right – those may come in handy for the next person working on this code, or even future you! + +If your change implements a fix to a specific issue, it can often be helpful to include the issue number in the comment, especially if it's a relatively straightforward adjustment: + +```diff ++ # Ensure object is a Span, not a Doc (#1234) +if isinstance(obj, Doc): + obj = obj[obj.start:obj.end] +``` + +### Including TODOs + +It's fine to include code comments that indicate future TODOs, using the `TODO:` prefix. Modern editors typically format this in a different color, so it's easy to spot. TODOs don't necessarily have to be things that are absolutely critical to fix right now – those should already be addressed in your pull request once it's ready for review. But they can include notes about potential future improvements. + +```diff ++ # TODO: this is currently pretty slow +dir_checksum = hashlib.md5() +for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): + dir_checksum.update(sub_file.read_bytes()) +``` + +If any of the TODOs you've added are important and should be fixed soon, you should add a task for this on Explosion's internal Ora board or an issue on the public issue tracker to make sure we don't forget to address it. + +## Type hints + +We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. + +If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return value. + +```diff +- def func(some_arg: dict) -> None: ++ def func(some_arg: Dict[str, Any]) -> None: + ... +``` + +```python +def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]: + def callback(arg1: str, arg2: int) -> List[str]: + ... + + return callback +``` + +For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type). + +```python +def build_tagger_model( + tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None +) -> Model[List[Doc], List[Floats2d]]: + ... +``` + +If you need to use a type hint that refers to something later declared in the same module, or the class that a method belongs to, you can use a string value instead: + +```python +class SomeClass: + def from_bytes(self, data: bytes) -> "SomeClass": + ...
+``` + +In some cases, you won't be able to import a class from a different module to use it as a type hint because it'd cause circular imports. For instance, `spacy/util.py` includes various helper functions that return an instance of `Language`, but we couldn't import it, because `spacy/language.py` imports `util` itself. In this case, we can provide `"Language"` as a string and make the import conditional on `typing.TYPE_CHECKING` so it only runs when the code is evaluated by a type checker: + +```python +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .language import Language + +def load_model(name: str) -> "Language": + ... +``` + +## Structuring logic + +### Positional and keyword arguments + +We generally try to avoid writing functions and methods with too many arguments, and use keyword-only arguments wherever possible. Python lets you define arguments as keyword-only by separating them with a `, *`. If you're writing functions with additional arguments that customize the behavior, you typically want to make those arguments keyword-only, so their names have to be provided explicitly. + +```diff +- def do_something(name: str, validate: bool = False): ++ def do_something(name: str, *, validate: bool = False): + ... + +- do_something("some_name", True) ++ do_something("some_name", validate=True) +``` + +This makes the function calls easier to read, because it's immediately clear what the additional values mean. It also makes it easier to extend arguments or change their order later on, because you don't end up with any function calls that depend on a specific positional order. + +### Avoid mutable default arguments + +A common Python gotcha is [mutable default arguments](https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments): if your argument defines a mutable default value like `[]` or `{}` and then goes and mutates it, the default value is created _once_ when the function is created and the same object is then mutated every time the function is called. This can be pretty unintuitive when you first encounter it. We therefore avoid writing logic that does this. + +If your arguments need to default to an empty list or dict, you can use the `SimpleFrozenList` and `SimpleFrozenDict` helpers provided by spaCy. They are simple frozen implementations that raise an error if they're being mutated to prevent bugs and logic that accidentally mutates default arguments. + +```diff +- def to_bytes(self, *, exclude: List[str] = []): ++ def to_bytes(self, *, exclude: List[str] = SimpleFrozenList()): + ... +``` + +```diff +def do_something(values: List[str] = SimpleFrozenList()): + if some_condition: +- values.append("foo") # raises an error ++ values = [*values, "foo"] + return values +``` + +### Don't use `try`/`except` for control flow + +We strongly discourage using `try`/`except` blocks for anything that's not third-party error handling or error handling that we otherwise have little control over. There's typically always a way to anticipate the _actual_ problem and **check for it explicitly**, which makes the code easier to follow and understand, and prevents bugs: + +```diff +- try: +- token = doc[i] +- except IndexError: +- token = doc[-1] + ++ if i < len(doc): ++ token = doc[i] ++ else: ++ token = doc[-1] +``` + +Even if you end up having to check for multiple conditions explicitly, this is still preferred over a catch-all `try`/`except`.
It can be very helpful to think about the exact scenarios you need to cover, and what could go wrong at each step, which often leads to better code and fewer bugs. `try/except` blocks can also easily mask _other_ bugs and problems that raise the same errors you're catching, which is obviously bad. + +If you have to use `try`/`except`, make sure to only include what's **absolutely necessary** in the `try` block and define the exception(s) explicitly. Otherwise, you may end up masking very different exceptions caused by other bugs. + +```diff +- try: +- value1 = get_some_value() +- value2 = get_some_other_value() +- score = external_library.compute_some_score(value1, value2) +- except: +- score = 0.0 + ++ value1 = get_some_value() ++ value2 = get_some_other_value() ++ try: ++ score = external_library.compute_some_score(value1, value2) ++ except ValueError: ++ score = 0.0 +``` + +### Avoid lambda functions + +`lambda` functions can be useful for defining simple anonymous functions in a single line, but they also introduce problems: for instance, they require [additional logic](https://stackoverflow.com/questions/25348532/can-python-pickle-lambda-functions) in order to be pickled and are pretty ugly to type-annotate. So we typically avoid them in the code base and only use them in the serialization handlers and within tests for simplicity. Instead of `lambda`s, check if your code can be refactored to not need them, or use helper functions instead. + +```diff +- split_string: Callable[[str], List[str]] = lambda value: [v.strip() for v in value.split(",")] + ++ def split_string(value: str) -> List[str]: ++ return [v.strip() for v in value.split(",")] +``` + +### Iteration and comprehensions + +We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions. + +```diff +- filtered = filter(lambda x: x in ["foo", "bar"], values) ++ filtered = (x for x in values if x in ["foo", "bar"]) +- filtered = list(filter(lambda x: x in ["foo", "bar"], values)) ++ filtered = [x for x in values if x in ["foo", "bar"]] + +- result = map(lambda x: { x: x in ["foo", "bar"]}, values) ++ result = ({x: x in ["foo", "bar"]} for x in values) +- result = list(map(lambda x: { x: x in ["foo", "bar"]}, values)) ++ result = [{x: x in ["foo", "bar"]} for x in values] +``` + +If your logic is more complex, it's often better to write a loop instead, even if it adds more lines of code in total. The result will be much easier to follow and understand. + +```diff +- result = [{"key": key, "scores": {f"{i}": score for i, score in enumerate(scores)}} for key, scores in values] + ++ result = [] ++ for key, scores in values: ++ scores_dict = {f"{i}": score for i, score in enumerate(scores)} ++ result.append({"key": key, "scores": scores_dict}) +``` + +### Composition vs. inheritance + +Although spaCy uses a lot of classes, **inheritance is viewed with some suspicion** — it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. Unless you're implementing a new data structure or pipeline component, you typically shouldn't have to use classes at all. + +### Don't use `print` + +The core library never `print`s anything. While we encourage using `print` statements for simple debugging (it's the most straightforward way of looking at what's happening), make sure to clean them up once you're ready to submit your pull request. 
If you want to output warnings or debugging information for users, use the respective dedicated mechanisms for this instead (see sections on warnings and logging for details). + +The only exceptions are the CLI functions, which pretty-print messages for the user, and methods that are explicitly intended for printing things, e.g. `Language.analyze_pipes` with `pretty=True` enabled. For this, we use our lightweight helper library [`wasabi`](https://github.com/ines/wasabi). + +## Naming + +Naming is hard and often a topic of long internal discussions. We don't expect you to come up with the perfect names for everything you write – finding the right names is often an iterative and collaborative process. That said, we do try to follow some basic conventions. + +Consistent with general Python conventions, we use `CamelCase` for class names including dataclasses, `snake_case` for methods, functions and variables, and `UPPER_SNAKE_CASE` for constants, typically defined at the top of a module. We also avoid using variable names that shadow the names of built-in functions, e.g. `input`, `help` or `list`. + +### Naming variables + +Variable names should always make it clear _what exactly_ the variable is and what it's used for. Instances of common classes should use the same consistent names. For example, you should avoid naming a text string (or anything else that's not a `Doc` object) `doc`. The most common class-to-variable mappings are: + +| Class | Variable | Example | +| ---------- | --------------------- | ------------------------------------------- | +| `Language` | `nlp` | `nlp = spacy.blank("en")` | +| `Doc` | `doc` | `doc = nlp("Some text")` | +| `Span` | `span`, `ent`, `sent` | `span = doc[1:4]`, `ent = doc.ents[0]` | +| `Token` | `token` | `token = doc[0]` | +| `Lexeme` | `lexeme`, `lex` | `lex = nlp.vocab["foo"]` | +| `Vocab` | `vocab` | `vocab = Vocab()` | +| `Example` | `example`, `eg` | `example = Example.from_dict(doc, gold)` | +| `Config` | `config`, `cfg` | `config = Config().from_disk("config.cfg")` | + +We try to avoid introducing too many temporary variables, as these clutter your namespace. It's okay to re-assign to an existing variable, but only if the value has the same type. + +```diff +ents = get_a_list_of_entities() +ents = [ent for ent in doc.ents if ent.label_ == "PERSON"] +- ents = {(ent.start, ent.end): ent.label_ for ent in ents} ++ ent_mappings = {(ent.start, ent.end): ent.label_ for ent in ents} +``` + +### Naming methods and functions + +Try choosing short and descriptive names wherever possible and imperative verbs for methods that do something, e.g. `disable_pipes`, `add_patterns` or `get_vector`. Private methods and functions that are not intended to be part of the user-facing API should be prefixed with an underscore `_`. It's often helpful to look at the existing classes for inspiration. + +Objects that can be serialized, e.g. data structures and pipeline components, should implement the same consistent methods for serialization. Those usually include at least `to_disk`, `from_disk`, `to_bytes` and `from_bytes`. Some objects can also implement more specific methods like `{to/from}_dict` or `{to/from}_str`. + +## Error handling + +We always encourage writing helpful and detailed custom error messages for everything we can anticipate going wrong, and including as much detail as possible. spaCy provides a directory of error messages in `errors.py` with unique codes for each message. 
This allows us to keep the code base more concise and avoids long and nested blocks of text throughout the code that disrupt the reading flow. The codes make it easy to find references to the same error in different places, and also help identify problems reported by users (since we can just search for the error code). + +Errors can be referenced via their code, e.g. `Errors.E123`. Messages can also include placeholders for values that can be populated by formatting the string with `.format()`. + +```python +class Errors: + E123 = "Something went wrong" + E456 = "Unexpected value: {value}" +``` + +```diff +if something_went_wrong: +- raise ValueError("Something went wrong!") ++ raise ValueError(Errors.E123) + +if not isinstance(value, int): +- raise ValueError(f"Unexpected value: {value}") ++ raise ValueError(Errors.E456.format(value=value)) +``` + +As a general rule of thumb, all error messages raised within the **core library** should be added to `Errors`. The only place where we write errors and messages as strings is `spacy.cli`, since these functions typically pretty-print and generate a lot of output that'd otherwise be very difficult to separate from the actual logic. + +### Re-raising exceptions + +If we anticipate possible errors in third-party code that we don't control, or our own code in a very different context, we typically try to provide custom and more specific error messages if possible. If we need to re-raise an exception within a `try`/`except` block, we can re-raise a custom exception. + +[Re-raising `from`](https://docs.python.org/3/tutorial/errors.html#exception-chaining) the original caught exception lets us chain the exceptions, so the user sees both the original error, as well as the custom message with a note "The above exception was the direct cause of the following exception". + +```diff +try: + run_third_party_code_that_might_fail() +except ValueError as e: ++ raise ValueError(Errors.E123) from e +``` + +In some cases, it makes sense to suppress the original exception, e.g. if we know what it is and know that it's not particularly helpful. In that case, we can raise `from None`. This prevents clogging up the user's terminal with multiple and irrelevant chained exceptions. + +```diff +try: + run_our_own_code_that_might_fail_confusingly() +except ValueError: ++ raise ValueError(Errors.E123) from None +``` + +### Avoid using naked `assert` + +During development, it can sometimes be helpful to add `assert` statements throughout your code to make sure that the values you're working with are what you expect. However, as you clean up your code, those should either be removed or replaced by more explicit error handling: + +```diff +- assert score >= 0.0 ++ if score < 0.0: ++ raise ValueError(Errors.E789.format(score=score)) +``` + +Otherwise, the user will get to see a naked `AssertionError` with no further explanation, which is very unhelpful. Instead of adding an error message to `assert`, it's always better to `raise` more explicit errors for specific conditions. If you're checking for something that _has to be right_ and would otherwise be a bug in spaCy, you can express this in the error message: + +```python +E161 = ("Found an internal inconsistency when predicting entity links. " + "This is likely a bug in spaCy, so feel free to open an issue: " + "https://github.com/explosion/spaCy/issues") +``` + +### Warnings + +Instead of raising an error, some parts of the code base can raise warnings to notify the user of a potential problem.
This is done using Python's `warnings.warn` and the messages defined in `Warnings` in `errors.py`. Whether or not warnings are shown can be controlled by the user, including custom filters for disabling specific warnings using a regular expression matching our internal codes, e.g. `W123`. + +```diff +- print("Warning: No examples provided for validation") ++ warnings.warn(Warnings.W123) +``` + +When adding warnings, make sure you're not calling `warnings.warn` repeatedly, e.g. in a loop, which will clog up the terminal output. Instead, you can collect the potential problems first and then raise a single warning. If the problem is critical, consider raising an error instead. + +```diff ++ n_empty = 0 +for spans in lots_of_annotations: + if len(spans) == 0: +- warnings.warn(Warnings.W456) ++ n_empty += 1 ++ warnings.warn(Warnings.W456.format(count=n_empty)) +``` + +### Logging + +Log statements can be added via spaCy's `logger`, which uses Python's native `logging` module under the hood. We generally only use logging for debugging information that **the user may choose to see** in debugging mode or that's **relevant during training** but not at runtime. + +```diff ++ logger.info("Set up nlp object from config") +config = nlp.config.interpolate() +``` + +`spacy train` and similar CLI commands will enable all log statements of level `INFO` by default (which is not the case at runtime). This allows outputting specific information within certain parts of the core library during training, without having it shown at runtime. `DEBUG`-level logs are only shown if the user enables `--verbose` logging during training. They can be used to provide more specific and potentially more verbose details, especially in areas that can indicate bugs or problems, or to surface more details about what spaCy does under the hood. You should only use logging statements if absolutely necessary and important. + +## Writing tests + +spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests for spaCy modules and classes live in their own directories of the same name and all test files should be prefixed with `test_`. Tests included in the core library only cover the code and do not depend on any trained pipelines. When implementing a new feature or fixing a bug, it's usually good to start by writing some tests that describe what _should_ happen. As you write your code, you can then keep running the relevant tests until all of them pass. + +### Test suite structure + +When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`. + +Regression tests are tests that refer to bugs reported in specific issues. They should live in the `regression` module and are named according to the issue number (e.g. `test_issue1234.py`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. Every once in a while, we go through the `regression` module and group tests together into larger files by issue number, in groups of 500 to 1000 numbers. This prevents us from ending up with too many individual files over time.
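+To illustrate, here is a minimal sketch of what such a regression test might look like (the issue number and the asserted behaviour below are made up for this example):
+
+```python
+from spacy.tokens import Doc
+from spacy.vocab import Vocab
+
+
+def test_issue1234():
+    # Construct the objects manually instead of relying on a trained pipeline
+    doc = Doc(Vocab(), words=["hello", "world"])
+    # Assert the behaviour that was reported as broken in the (hypothetical) issue
+    assert len(doc) == 2
+```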
+ +The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. + +### Constructing objects and state + +Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation. + +Tests should focus on exactly what they're testing and avoid dependencies on other unrelated library functionality wherever possible. If all your test needs is a `Doc` object with certain annotations set, you should always construct it manually: + +```python +def test_doc_creation_with_pos(): + doc = Doc(Vocab(), words=["hello", "world"], pos=["NOUN", "VERB"]) + assert doc[0].pos_ == "NOUN" + assert doc[1].pos_ == "VERB" +``` + +### Parametrizing tests + +If you need to run the same test function over different input examples, you usually want to parametrize the test cases instead of using a loop within your test. This lets you keep a better separation between test cases and test logic, and it'll result in more useful output because `pytest` will be able to tell you which exact test case failed. + +The `@pytest.mark.parametrize` decorator takes two arguments: a string defining one or more comma-separated arguments that should be passed to the test function and a list of corresponding test cases (or a list of tuples to provide multiple arguments). + +```python +@pytest.mark.parametrize("words", [["hello", "world"], ["this", "is", "a", "test"]]) +def test_doc_length(words): + doc = Doc(Vocab(), words=words) + assert len(doc) == len(words) +``` + +```python +@pytest.mark.parametrize("text,expected_len", [("hello world", 2), ("I can't!", 4)]) +def test_token_length(en_tokenizer, text, expected_len): # en_tokenizer is a fixture + doc = en_tokenizer(text) + assert len(doc) == expected_len +``` + +You can also stack `@pytest.mark.parametrize` decorators, although this is not recommended unless it's absolutely needed or required for the test. When stacking decorators, keep in mind that this will run the test with all possible combinations of the respective parametrized values, which is often not what you want and can slow down the test suite. + +### Handling failing tests + +`xfail` means that a test **should pass but currently fails**, i.e. is expected to fail. You can mark a test as currently xfailing by adding the `@pytest.mark.xfail` decorator. This should only be used for tests that don't yet work, not for logic that causes errors we raise on purpose (see the section on testing errors for this). It's often very helpful to implement tests for edge cases that we don't yet cover and mark them as `xfail`. You can also provide a `reason` keyword argument to the decorator with an explanation of why the test currently fails. + +```diff ++ @pytest.mark.xfail(reason="Issue #225 - not yet implemented") +def test_en_tokenizer_splits_em_dash_infix(en_tokenizer): + doc = en_tokenizer("Will this road take me to Puddleton?\u2014No.") + assert doc[8].text == "\u2014" +``` + +When you run the test suite, you may come across tests that are reported as `xpass`.
This means that they're marked as `xfail` but didn't actually fail. This is worth looking into: sometimes, it can mean that we have since fixed a bug that caused the test to previously fail, so we can remove the decorator. In other cases, especially when it comes to machine learning model implementations, it can also indicate that the **test is flaky**: it sometimes passes and sometimes fails. This can be caused by a bug, or by constraints being too narrowly defined. If a test shows different behavior depending on whether it's run in isolation or not, this can indicate that it reacts to global state set in a previous test, which is not ideal and should be avoided. + +### Writing slow tests + +If a test is useful but potentially quite slow, you can mark it with the `@pytest.mark.slow` decorator. This is a special marker we introduced and tests decorated with it only run if you run the test suite with `--slow`, but not as part of the main CI process. Before introducing a slow test, double-check that there isn't another and more efficient way to test for the behavior. You should also consider adding a simpler test with maybe only a subset of the test cases that can always run, so we at least have some coverage. + +### Skipping tests + +The `@pytest.mark.skip` decorator lets you skip tests entirely. You only want to do this for failing tests that may be slow to run or cause memory errors or segfaults, which would otherwise terminate the entire process and wouldn't be caught by `xfail`. We also sometimes use the `skip` decorator for old and outdated regression tests that we want to keep around but that don't apply anymore. When using the `skip` decorator, make sure to provide the `reason` keyword argument with a quick explanation of why you chose to skip this test. + +### Testing errors and warnings + +`pytest` lets you check whether a given error is raised by using the `pytest.raises` contextmanager. This is very useful when implementing custom error handling, so make sure you're not only testing for the correct behavior but also for errors resulting from incorrect inputs. If you're testing errors, you should always check for `pytest.raises` explicitly and not use `xfail`. + +```python +words = ["a", "b", "c", "d", "e"] +ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"] +with pytest.raises(ValueError): + Doc(Vocab(), words=words, ents=ents) +``` + +You can also use the `pytest.warns` contextmanager to check that a given warning type is raised. The first argument is the warning type or `None` (which will capture a list of warnings that you can `assert` is empty). + +```python +def test_phrase_matcher_validation(en_vocab): + doc1 = Doc(en_vocab, words=["Test"], deps=["ROOT"]) + doc2 = Doc(en_vocab, words=["Test"]) + matcher = PhraseMatcher(en_vocab, validate=True) + with pytest.warns(UserWarning): + # Warn about unnecessarily parsed document + matcher.add("TEST1", [doc1]) + with pytest.warns(None) as record: + matcher.add("TEST2", [doc2]) + assert not record.list +``` + +Keep in mind that your tests will fail if you're using the `pytest.warns` contextmanager with a given warning and the warning is _not_ shown. So you should only use it to check that spaCy handles and outputs warnings correctly.
If your test outputs a warning that's expected but not relevant to what you're testing, you can use the `@pytest.mark.filterwarnings` decorator and ignore specific warnings starting with a given code: + +```python +@pytest.mark.filterwarnings("ignore:\\[W036") +def test_matcher_empty(en_vocab): + matcher = Matcher(en_vocab) + matcher(Doc(en_vocab, words=["test"])) +``` + +### Testing trained pipelines + +Our regular test suite does not depend on any of the trained pipelines, since their outputs can vary and aren't generally required to test the library functionality. We test pipelines separately using the tests included in the [`spacy-models`](https://github.com/explosion/spacy-models) repository, which run whenever we train a new suite of models. The tests here mostly focus on making sure that the packages can be loaded and that the predictions seem reasonable, and they include checks for common bugs we encountered previously. If your test does not primarily focus on verifying a model's predictions, it should be part of the core library tests and construct the required objects manually, instead of being added to the models tests. + +Keep in mind that specific predictions may change, and we can't test for all incorrect predictions reported by users. Different models make different mistakes, so even a model that's significantly more accurate overall may end up making wrong predictions that it previously didn't. However, some surprising incorrect predictions may indicate deeper bugs that we definitely want to investigate. diff --git a/extra/DEVELOPER_DOCS/Language.md b/extra/DEVELOPER_DOCS/Language.md new file mode 100644 index 000000000..f4fc85095 --- /dev/null +++ b/extra/DEVELOPER_DOCS/Language.md @@ -0,0 +1,150 @@ +# Language + +> Reference: `spacy/language.py` + +1. [Constructing the `nlp` object from a config](#1-constructing-the-nlp-object-from-a-config) + - [A. Overview of `Language.from_config`](#1a-overview) + - [B. Component factories](#1b-how-pipeline-component-factories-work-in-the-config) + - [C. Sourcing a component](#1c-sourcing-a-pipeline-component) + - [D. Tracking components as they're modified](#1d-tracking-components-as-theyre-modified) + - [E. spaCy's config utility functions](#1e-spacys-config-utility-functions) +2. [Initialization](#initialization) + - [A. Initialization for training](#2a-initialization-for-training): `init_nlp` + - [B. Initializing the `nlp` object](#2b-initializing-the-nlp-object): `Language.initialize` + - [C. Initializing the vocab](#2c-initializing-the-vocab): `init_vocab` + +## 1. Constructing the `nlp` object from a config + +### 1A. Overview + +Most of the functions referenced in the config are regular functions with arbitrary arguments registered via the function registry. However, the pipeline components are a bit special: they not only receive arguments passed in via the config file, but also the current `nlp` object and the string `name` of the individual component instance (so a user can have multiple components created with the same factory, e.g. `ner_one` and `ner_two`). This name can then be used by the components to add to the losses and scores. This special requirement means that pipeline components can't just be resolved via the config the "normal" way: we need to retrieve the component functions manually and pass them their arguments, plus the `nlp` and `name`. + +The `Language.from_config` classmethod takes care of constructing the `nlp` object from a config. 
It's the single place where this happens and what `spacy.load` delegates to under the hood. Its main responsibilities are: + +- **Load and validate the config**, and optionally **auto-fill** all missing values that we either have defaults for in the config template or that registered function arguments define defaults for. This helps ensure backwards-compatibility, because we're able to add a new argument `foo: str = "bar"` to an existing function, without breaking configs that don't specify it. +- **Execute relevant callbacks** for pipeline creation, e.g. optional functions called before and after creation of the `nlp` object and pipeline. +- **Initialize language subclass and create tokenizer**. The `from_config` classmethod will always be called on a language subclass, e.g. `English`, not on `Language` directly. Initializing the subclass takes a callback to create the tokenizer. +- **Set up the pipeline components**. Components can either refer to a component factory or a `source`, i.e. an existing pipeline that's loaded and that the component is then copied from. We also need to ensure that we update the information about which components are disabled. +- **Manage listeners.** If sourced components "listen" to other components (`tok2vec`, `transformer`), we need to ensure that the references are valid. If the config specifies that listeners should be replaced by copies (e.g. to give the `ner` component its own `tok2vec` model instead of listening to the shared `tok2vec` component in the pipeline), we also need to take care of that. + +Note that we only resolve and load **selected sections** in `Language.from_config`, i.e. only the parts that are relevant at runtime, which is `[nlp]` and `[components]`. We don't want to be resolving anything related to training or initialization, since this would mean loading and constructing unnecessary functions, including functions that require information that isn't necessarily available at runtime, like `paths.train`. + +### 1B. How pipeline component factories work in the config + +As opposed to regular registered functions that refer to a registry and function name (e.g. `"@misc": "foo.v1"`), pipeline components follow a different format and refer to their component `factory` name. This corresponds to the name defined via the `@Language.component` or `@Language.factory` decorator. We need this decorator to define additional meta information for the components, like their default config and score weights. + +```ini +[components.my_component] +factory = "foo" +some_arg = "bar" +other_arg = ${paths.some_path} +``` + +This means that we need to create and resolve the `config["components"]` separately from the rest of the config. There are some important considerations and things we need to manage explicitly to avoid unexpected behavior: + +#### Variable interpolation + +When a config is resolved, references to variables are replaced, so that the functions receive the correct value instead of just the variable name. To interpolate a config, we need it in its entirety: we couldn't just interpolate a subsection that refers to variables defined in a different subsection. So we first interpolate the entire config. + +However, the `nlp.config` should include the original config with variables intact – otherwise, loading a pipeline and saving it to disk will destroy all logic implemented via variables and hard-code the values all over the place. 
This means that when we create the components, we need to keep two versions of the config: the interpolated config with the "real" values and the `raw_config` including the variable references. + +#### Factory registry + +Component factories are special and use the `@Language.factory` or `@Language.component` decorator to register themselves and their meta. When the decorator runs, it performs some basic validation, stores the meta information for the factory on the `Language` class (default config, scores etc.) and then adds the factory function to `registry.factories`. The `component` decorator can be used for registering simple functions that just take a `Doc` object and return it, so in that case we create the factory for the user automatically. + +There's one important detail to note about how factories are registered via entry points: A package that wants to expose spaCy components still needs to register them via the `@Language` decorators so we have the component meta information and can perform required checks. All we care about here is that the decorated function is **loaded and imported**. When it is, the `@Language` decorator takes care of everything, including actually registering the component factory. + +Normally, adding to the registry via an entry point will just add the function to the registry under the given name. But for `spacy_factories`, we don't actually want that: all we care about is that the function decorated with `@Language` is imported so the decorator runs. So we only exploit Python's entry point system to automatically import the function, and the `spacy_factories` entry point group actually adds to a **separate registry**, `registry._factories`, under the hood. Its only purpose is that the functions are imported. The decorator then runs, creates the factory if needed and adds it to the `registry.factories` registry. + +#### Language-specific factories + +spaCy supports registering factories on the `Language` base class, as well as language-specific subclasses like `English` or `German`. This allows providing different factories depending on the language, e.g. a different default lemmatizer. The `Language.get_factory_name` classmethod constructs the factory name as `{lang}.{name}` if a language is available (i.e. if it's a subclass) and falls back to `{name}` otherwise. So `@German.factory("foo")` will add a factory `de.foo` under the hood. If you call `nlp.add_pipe("foo")`, we first check if there's a factory for `{nlp.lang}.foo` and if not, we fall back to checking for a factory `foo`. + +#### Creating a pipeline component from a factory + +`Language.add_pipe` takes care of adding a pipeline component, given its factory name and its config. If no source pipeline to copy the component from is provided, it delegates to `Language.create_pipe`, which sets up the actual component function. + +- Validate the config and make sure that the factory was registered via the decorator and that we have meta for it. +- Update the component config with any defaults specified by the component's `default_config`, if available. This is done by merging the values we receive into the defaults. It ensures that you can still add a component without having to specify its _entire_ config including more complex settings like `model`. If no `model` is defined, we use the default. +- Check if we have a language-specific factory for the given `nlp.lang` and if not, fall back to the global factory. 
+- Construct the component config, consisting of whatever arguments were provided, plus the current `nlp` object and `name`, which are default expected arguments of all factories. We also add a reference to the `@factories` registry, so we can resolve the config via the registry, like any other config. With the added `nlp` and `name`, it should now include all expected arguments of the given function. +- Fill the config to make sure all unspecified defaults from the function arguments are added and update the `raw_config` (uninterpolated with variables intact) with that information, so the component config we store in `nlp.config` is up to date. We do this by adding the `raw_config` _into_ the filled config – otherwise, the references to variables would be overwritten. +- Resolve the config and create all functions it refers to (e.g. `model`). This gives us the actual component function that we can insert into the pipeline. + +### 1C. Sourcing a pipeline component + +```ini +[components.ner] +source = "en_core_web_sm" +``` + +spaCy also allows ["sourcing" a component](https://spacy.io/usage/processing-pipelines#sourced-components), which will copy it over from an existing pipeline. In this case, `Language.add_pipe` will delegate to `Language.create_pipe_from_source`. In order to copy a component effectively and validate it, the source pipeline first needs to be loaded. This is done in `Language.from_config`, so a source pipeline only has to be loaded once if multiple components source from it. Sourcing a component will perform the following checks and modifications: + +- For each sourced pipeline component loaded in `Language.from_config`, a hash of the vectors data from the source pipeline is stored in the pipeline meta so we're able to check whether the vectors match and warn if not (since different vectors that are used as features in components can lead to degraded performance). Because the vectors are not loaded at the point when components are sourced, the check is postponed to `init_vocab` as part of `Language.initialize`. +- If the sourced pipeline component is loaded through `Language.add_pipe(source=)`, the vectors are already loaded and can be compared directly. The check compares the shape and keys first and finally falls back to comparing the actual byte representation of the vectors (which is slower). +- Ensure that the component is available in the pipeline. +- Interpolate the entire config of the source pipeline so all variables are replaced and the component's config that's copied over doesn't include references to variables that are not available in the destination config. +- Add the source `vocab.strings` to the destination's `vocab.strings` so we don't end up with unavailable strings in the final pipeline (which would also include labels used by the sourced component). + +Note that there may be other incompatibilities that we're currently not checking for and that could cause a sourced component to not work in the destination pipeline. We're interested in adding more checks here but there'll always be a small number of edge cases we'll never be able to catch, including a sourced component depending on other pipeline state that's not available in the destination pipeline. + +### 1D. Tracking components as they're modified + +The `Language` class implements methods for removing, replacing or renaming pipeline components. 
Whenever we make these changes, we need to update the information stored on the `Language` object to ensure that it matches the current state of the pipeline. If a user just writes to `nlp.config` manually, we obviously can't ensure that the config matches the reality – but since we offer modification via the pipe methods, it's expected that spaCy keeps the config in sync under the hood. Otherwise, saving a modified pipeline to disk and loading it back wouldn't work. The internal attributes we need to keep in sync here are: + +| Attribute | Type | Description | | ------------------------ | ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `Language._components` | `List[Tuple[str, Callable]]` | All pipeline components as `(name, func)` tuples. This is used as the source of truth for `Language.pipeline`, `Language.pipe_names` and `Language.components`. | | `Language._pipe_meta` | `Dict[str, FactoryMeta]` | The meta information of a component's factory, keyed by component name. This can include multiple components referring to the same factory meta. | | `Language._pipe_configs` | `Dict[str, Config]` | The component's config, keyed by component name. | | `Language._disabled` | `Set[str]` | Names of components that are currently disabled. | | `Language._config` | `Config` | The underlying config. This is only internals and will be used as the basis for constructing the config in the `Language.config` property. | + +In addition to the actual component settings in `[components]`, the config also allows specifying component-specific arguments via the `[initialize.components]` block, which are passed to the component's `initialize` method during initialization if it's available. So we also need to keep this in sync in the underlying config. + +### 1E. spaCy's config utility functions + +When working with configs in spaCy, make sure to use the utility functions provided by spaCy if available, instead of calling the respective `Config` methods. The utilities take care of providing spaCy-specific error messages and ensure a consistent order of config sections by setting the `section_order` argument. This ensures that exported configs always have the same consistent format. + +- `util.load_config`: load a config from a file +- `util.load_config_from_str`: load a config from a string representation +- `util.copy_config`: deepcopy a config + +## 2. Initialization + +Initialization is a separate step of the [config lifecycle](https://spacy.io/usage/training#config-lifecycle) that's not performed at runtime. It's implemented via the `training.initialize.init_nlp` helper and calls into the `Language.initialize` method, which sets up the pipeline and component models before training. The `initialize` method takes a callback that returns a sample of examples, which is used to initialize the component models, add all required labels and perform shape inference if applicable. + +Components can also define custom initialization settings via the `[initialize.components]` block, e.g. if they require external data like lookup tables to be loaded in. All config settings defined here will be passed to the component's `initialize` method, if it implements one (see the sketch below). 
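As a rough sketch of how this looks from the component's side (the factory name `my_component` and the `data_path` setting are hypothetical, not part of spaCy), values from `[initialize.components.my_component]` simply arrive as keyword arguments on the component's `initialize` method:

```python
from typing import Callable, Iterable, Optional
from spacy.language import Language
from spacy.training import Example


@Language.factory("my_component")  # hypothetical factory name
def create_my_component(nlp: Language, name: str):
    return MyComponent()


class MyComponent:
    def __call__(self, doc):
        return doc

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        # filled in from [initialize.components.my_component] in the config
        data_path: Optional[str] = None,
    ) -> None:
        # Load external resources here, e.g. a lookup table from data_path,
        # and store them on the component so they can be serialized later.
        ...
```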
Components are expected to handle their own serialization after they're initialized so that any data or settings they require are saved with the pipeline and will be available from disk when the pipeline is loaded back at runtime. + +### 2A. Initialization for training + +The `init_nlp` function is called before training and returns an initialized `nlp` object that can be updated with the examples. It only needs the config and does the following: + +- Load and validate the config. In order to validate certain settings like the `seed`, we also interpolate the config to get the final value (because in theory, a user could provide this via a variable). +- Set up the GPU allocation, if required. +- Create the `nlp` object from the raw, uninterpolated config, which delegates to `Language.from_config`. Since this method may modify and auto-fill the config and pipeline component settings, we then use the interpolated version of `nlp.config` going forward, to ensure that what we're training with is up to date. +- Resolve the `[training]` block of the config and perform validation, e.g. to check that the corpora are available. +- Determine the components that should be frozen (not updated during training) or resumed (sourced components from a different pipeline that should be updated from the examples and not reset and re-initialized). To resume training, we can call the `nlp.resume_training` method. +- Initialize the `nlp` object via `nlp.initialize` and pass it a `get_examples` callback that returns the training corpus (used for shape inference, setting up labels etc.). If the training corpus is streamed, we only provide a small sample of the data, which can potentially be infinite. `nlp.initialize` will delegate to the components as well and pass the data sample forward. +- Check the listeners and warn about component dependencies, e.g. if a frozen component listens to a component that is retrained, or vice versa (which can degrade results). + +### 2B. Initializing the `nlp` object + +The `Language.initialize` method does the following: + +- **Resolve the config** defined in the `[initialize]` block separately (since everything else is already available in the loaded `nlp` object), based on the fully interpolated config. +- **Execute callbacks**, i.e. `before_init` and `after_init`, if they're defined. +- **Initialize the vocab**, including vocab data, lookup tables and vectors. +- **Initialize the tokenizer** if it implements an `initialize` method. This is not the case for the default tokenizers, but it allows custom tokenizers to depend on external data resources that are loaded in on initialization. +- **Initialize all pipeline components** if they implement an `initialize` method and pass them the `get_examples` callback, the current `nlp` object as well as additional initialization config settings provided in the component-specific block. +- **Initialize pretraining** if a `[pretraining]` block is available in the config. This allows loading pretrained tok2vec weights in `spacy pretrain`. +- **Register listeners** if token-to-vector embedding layers of a component model "listen" to a previous component (`tok2vec`, `transformer`) in the pipeline. +- **Create an optimizer** on the `Language` class, either by adding the optimizer passed as `sgd` to `initialize`, or by creating the optimizer defined in the config's training settings. + +### 2C. Initializing the vocab + +Vocab initialization is handled in the `training.initialize.init_vocab` helper. 
It takes the relevant loaded functions and values from the config and takes care of the following: + +- Add lookup tables defined in the config initialization, e.g. custom lemmatization tables. Those will be added to `nlp.vocab.lookups` from where they can be accessed by components. +- Add JSONL-formatted [vocabulary data](https://spacy.io/api/data-formats#vocab-jsonl) to pre-populate the lexical attributes. +- Load vectors into the pipeline. Vectors are defined as a name or path to a saved `nlp` object containing the vectors, e.g. `en_vectors_web_lg`. It's loaded and the vectors are ported over, while ensuring that all source strings are available in the destination strings. We also warn if there's a mismatch between sourced vectors, since this can lead to problems. diff --git a/extra/DEVELOPER_DOCS/Listeners.md b/extra/DEVELOPER_DOCS/Listeners.md new file mode 100644 index 000000000..3a71082e0 --- /dev/null +++ b/extra/DEVELOPER_DOCS/Listeners.md @@ -0,0 +1,220 @@ +# Listeners + +1. [Overview](#1-overview) +2. [Initialization](#2-initialization) + - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component) + - [B. Shape inference](#2b-shape-inference) +3. [Internal communication](#3-internal-communication) + - [A. During prediction](#3a-during-prediction) + - [B. During training](#3b-during-training) + - [C. Frozen components](#3c-frozen-components) +4. [Replacing listener with standalone](#4-replacing-listener-with-standalone) + +## 1. Overview + +Trainable spaCy components typically use some sort of `tok2vec` layer as part of the `model` definition. +This `tok2vec` layer produces embeddings and is either a standard `Tok2Vec` layer, or a Transformer-based one. +Both versions can be used either inline/standalone, which means that they are defined and used +by only one specific component (e.g. NER), or +[shared](https://spacy.io/usage/embeddings-transformers#embedding-layers), +in which case the embedding functionality becomes a separate component that can +feed embeddings to multiple components downstream, using a listener-pattern. + +| Type | Usage | Model Architecture | +| ------------- | ---------- | -------------------------------------------------------------------------------------------------- | +| `Tok2Vec` | standalone | [`spacy.Tok2Vec`](https://spacy.io/api/architectures#Tok2Vec) | +| `Tok2Vec` | listener | [`spacy.Tok2VecListener`](https://spacy.io/api/architectures#Tok2VecListener) | +| `Transformer` | standalone | [`spacy-transformers.Tok2VecTransformer`](https://spacy.io/api/architectures#Tok2VecTransformer) | +| `Transformer` | listener | [`spacy-transformers.TransformerListener`](https://spacy.io/api/architectures#TransformerListener) | + +Here we discuss the listener pattern and its implementation in code in more detail. + +## 2. Initialization + +### 2A. 
Linking listeners to the embedding component + +To allow sharing a `tok2vec` layer, a separate `tok2vec` component needs to be defined in the config: + +``` +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" +``` + +A listener can then be set up by making sure the correct `upstream` name is defined, referring to the +name of the `tok2vec` component (which equals the factory name by default), or `*` as a wildcard: + +``` +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +upstream = "tok2vec" +``` + +When an [`nlp`](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Language.md) object is +initialized or deserialized, it will make sure to link each `tok2vec` component to its listeners. This is +implemented in the method `nlp._link_components()` which loops over each +component in the pipeline and calls `find_listeners()` on a component if it's defined. +The [`tok2vec` component](https://github.com/explosion/spaCy/blob/master/spacy/pipeline/tok2vec.py)'s implementation +of this `find_listeners()` method will specifically identify sublayers of a model definition that are of type +`Tok2VecListener` with a matching upstream name and will then add that listener to the internal `self.listener_map`. + +If it's a Transformer-based pipeline, a +[`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py) +has a similar implementation but its `find_listeners()` function will specifically look for `TransformerListener` +sublayers of downstream components. + +### 2B. Shape inference + +Typically, the output dimension `nO` of a listener's model equals the `nO` (or `width`) of the upstream embedding layer. +For a standard `Tok2Vec`-based component, this is typically known up-front and defined as such in the config: + +``` +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +``` + +A `transformer` component however only knows its `nO` dimension after the HuggingFace transformer +is set with the function `model.attrs["set_transformer"]`, +[implemented](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py) +by `set_pytorch_transformer`. +This is why, upon linking of the transformer listeners, the `transformer` component also makes sure to set +the listener's output dimension correctly. + +This shape inference mechanism also needs to happen with resumed/frozen components, which means that for some CLI +commands (`assemble` and `train`), we need to call `nlp._link_components` even before initializing the `nlp` +object. To cover all use-cases and avoid negative side effects, the code base ensures that performing the +linking twice is not harmful. + +## 3. Internal communication + +The internal communication between a listener and its downstream components is organized by sending and +receiving information across the components - either directly or implicitly. +The details are different depending on whether the pipeline is currently training, or predicting. +Either way, the `tok2vec` or `transformer` component always needs to run before the listener. + +### 3A. During prediction + +When the `Tok2Vec` pipeline component is called, its `predict()` method is executed to produce the results, +which are then stored by `set_annotations()` in the `doc.tensor` field of the document(s). 
+Similarly, the `Transformer` component stores the produced embeddings +in `doc._.trf_data`. Next, the `forward` pass of a +[`Tok2VecListener`](https://github.com/explosion/spaCy/blob/master/spacy/pipeline/tok2vec.py) +or a +[`TransformerListener`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/listener.py) +accesses these fields on the `Doc` directly. Both listener implementations have a fallback mechanism for when these +properties were not set on the `Doc`: in that case an all-zero tensor is produced and returned. +We need this fallback mechanism to enable shape inference methods in Thinc, but the code +is slightly risky and at times might hide another bug - so it's a good spot to be aware of. + +### 3B. During training + +During training, the `update()` methods of the `Tok2Vec` & `Transformer` components don't necessarily set the +annotations on the `Doc` (though since 3.1 they can if they are part of the `annotating_components` list in the config). +Instead, we rely on a caching mechanism between the original embedding component and its listener. +Specifically, the produced embeddings are sent to the listeners by calling `listener.receive()` and uniquely +identifying the batch of documents with a `batch_id`. This `receive()` call also sends the appropriate `backprop` +call to ensure that gradients from the downstream component flow back to the trainable `Tok2Vec` or `Transformer` +network. + +We rely on the `nlp` object properly batching the data and sending each batch through the pipeline in sequence, +which means that only one such batch needs to be kept in memory for each listener. +When the downstream component runs and the listener should produce embeddings, it accesses the batch in memory, +runs the backpropagation, and returns the results and the gradients. + +There are two ways in which this mechanism can fail, both are detected by `verify_inputs()`: + +- `E953` if a different batch is in memory than the requested one - signaling some kind of out-of-sync state of the + training pipeline. +- `E954` if no batch is in memory at all - signaling that the pipeline is probably not set up correctly. + +#### Training with multiple listeners + +One `Tok2Vec` or `Transformer` component may be listened to by several downstream components, e.g. +a tagger and a parser could be sharing the same embeddings. In this case, we need to be careful about how we do +the backpropagation. When the `Tok2Vec` or `Transformer` sends out data to the listener with `receive()`, they will +send an `accumulate_gradient` function call to all listeners, except the last one. This function will keep track +of the gradients received so far. Only the final listener in the pipeline will get an actual `backprop` call that +will initiate the backpropagation of the `tok2vec` or `transformer` model with the accumulated gradients. + +### 3C. Frozen components + +The listener pattern can get particularly tricky in combination with frozen components. To detect components +with listeners that are not frozen consistently, `init_nlp()` (which is called by `spacy train`) goes through +the listeners and their upstream components and warns in two scenarios. + +#### The Tok2Vec or Transformer is frozen + +If the `Tok2Vec` or `Transformer` was already trained, +e.g. by [pretraining](https://spacy.io/usage/embeddings-transformers#pretraining), +it could be a valid use-case to freeze the embedding architecture and only train downstream components such +as a tagger or a parser. 
This used to be impossible before 3.1, but has become supported since then by putting the +embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components) +list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes. + +However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related +listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`. + +#### The upstream component is frozen + +If an upstream component is frozen but the underlying `Tok2Vec` or `Transformer` isn't, the performance of +the upstream component will be degraded after training. In this case, a `W087` warning is shown, explaining +how to use the `replace_listeners` functionality to prevent this problem. + +## 4. Replacing listener with standalone + +The [`replace_listeners`](https://spacy.io/api/language#replace_listeners) functionality changes the architecture +of a downstream component from using a listener pattern to a standalone `tok2vec` or `transformer` layer, +effectively making the downstream component independent of any other components in the pipeline. +It is implemented by `nlp.replace_listeners()` and typically executed by `nlp.from_config()`. +First, it fetches the original `Model` of the original component that creates the embeddings: + +``` +tok2vec = self.get_pipe(tok2vec_name) +tok2vec_model = tok2vec.model +``` + +Which is either a [`Tok2Vec` model](https://github.com/explosion/spaCy/blob/master/spacy/ml/models/tok2vec.py) or a +[`TransformerModel`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py). + +In the case of the `tok2vec`, this model can be copied as-is into the configuration and architecture of the +downstream component. However, for the `transformer`, this doesn't work. +The reason is that the `TransformerListener` architecture chains the listener with +[`trfs2arrays`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/trfs2arrays.py): + +``` +model = chain( + TransformerListener(upstream_name=upstream), + trfs2arrays(pooling, grad_factor), +) +``` + +but the standalone `Tok2VecTransformer` has an additional `split_trf_batch` chained in between the model +and `trfs2arrays`: + +``` +model = chain( + TransformerModel(name, get_spans, tokenizer_config), + split_trf_batch(), + trfs2arrays(pooling, grad_factor), +) +``` + +So you can't just take the model from the listener, and drop that into the component internally. You need to +adjust the model and the config. To facilitate this, `nlp.replace_listeners()` will check whether additional +[functions](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/_util.py) are +[defined](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py) +in `model.attrs`, and if so, it will essentially call these to make the appropriate changes: + +``` +replace_func = tok2vec_model.attrs["replace_listener_cfg"] +new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]) +... +new_model = tok2vec_model.attrs["replace_listener"](new_model) +``` + +The new config and model are then properly stored on the `nlp` object. +Note that this functionality (running the replacement for a transformer listener) was broken prior to +`spacy-transformers` 1.0.5. 
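For completeness, here's a minimal usage sketch of the replacement from user code (the pipeline and component names below are just examples; in a config, the same effect is typically achieved via the `replace_listeners` setting on a sourced component):

```python
import spacy

# A pipeline with a shared "tok2vec" component that the "ner" component listens to
nlp = spacy.load("en_core_web_sm")

# Give the ner component its own copy of the embedding layer. The last argument
# lists the config paths of the listener layers within the ner model.
nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])
```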
diff --git a/extra/DEVELOPER_DOCS/README.md b/extra/DEVELOPER_DOCS/README.md new file mode 100644 index 000000000..8ff505dc6 --- /dev/null +++ b/extra/DEVELOPER_DOCS/README.md @@ -0,0 +1,7 @@ + + +# Developer Documentation + +This directory includes additional documentation and explanations of spaCy's internals. It's mostly intended for the spaCy core development team and contributors interested in the more complex parts of the library. The documents generally focus on more abstract implementation details and how specific methods and algorithms work, and they assume knowledge of what's already available in the [usage documentation](https://spacy.io/usage) and [API reference](https://spacy.io/api). + +If you're looking to contribute to spaCy, make sure to check out the documentation and [contributing guide](https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md) first. diff --git a/extra/DEVELOPER_DOCS/StringStore-Vocab.md b/extra/DEVELOPER_DOCS/StringStore-Vocab.md new file mode 100644 index 000000000..866ba2aae --- /dev/null +++ b/extra/DEVELOPER_DOCS/StringStore-Vocab.md @@ -0,0 +1,216 @@ +# StringStore & Vocab + +> Reference: `spacy/strings.pyx` +> Reference: `spacy/vocab.pyx` + +## Overview + +spaCy represents most strings internally using a `uint64` in Cython which +corresponds to a hash. The magic required to make this largely transparent is +handled by the `StringStore`, and is integrated into the pipelines using the +`Vocab`, which also connects it to some other information. + +These are mostly internal details that average library users should never have +to think about. On the other hand, when developing a component it's normal to +interact with the Vocab for lexeme data or word vectors, and it's not unusual +to add labels to the `StringStore`. + +## StringStore + +### Overview + +The `StringStore` is a `cdef class` that looks a bit like a two-way dictionary, +though it is not a subclass of anything in particular. + +The main functionality of the `StringStore` is that `__getitem__` converts +hashes into strings or strings into hashes. + +The full details of the conversion are complicated. Normally you shouldn't have +to worry about them, but the first applicable case here is used to get the +return value: + +1. 0 and the empty string are special cased to each other +2. internal symbols use a lookup table (`SYMBOLS_BY_STR`) +3. normal strings or bytes are hashed +4. internal symbol IDs in `SYMBOLS_BY_INT` are handled +5. anything not yet handled is used as a hash to look up a string + +For the symbol enums, see [`symbols.pxd`](https://github.com/explosion/spaCy/blob/master/spacy/symbols.pxd). + +Almost all strings in spaCy are stored in the `StringStore`. This naturally +includes tokens, but also includes things like labels (not just NER/POS/dep, +but also categories etc.), lemmas, lowercase forms, word shapes, and so on. One +of the main results of this is that tokens can be represented by a compact C +struct ([`LexemeC`](https://spacy.io/api/cython-structs#lexemec)/[`TokenC`](https://github.com/explosion/spaCy/issues/4854)) that mostly consists of string hashes. This also means that converting +input for the models is straightforward, and there's not a token mapping step +like in many machine learning frameworks. Additionally, because the token IDs +in spaCy are based on hashes, they are consistent across environments or +models. + +One pattern you'll see a lot in spaCy APIs is that `something.value` returns an +`int` and `something.value_` returns a string. 
That's implemented using the +`StringStore`. Typically the `int` is stored in a C struct and the string is +generated via a property that calls into the `StringStore` with the `int`. + +Besides `__getitem__`, the `StringStore` has functions to return specifically a +string or specifically a hash, regardless of whether the input was a string or +hash to begin with, though these are only used occasionally. + +### Implementation Details: Hashes and Allocations + +Hashes are 64-bit and are computed using [murmurhash][] on UTF-8 bytes. There is no +mechanism for detecting and avoiding collisions. To date there has never been a +reproducible collision or user report about any related issues. + +[murmurhash]: https://github.com/explosion/murmurhash + +The empty string is not hashed, it's just converted to/from 0. + +A small number of strings use indices into a lookup table (so low integers) +rather than hashes. This is mostly Universal Dependencies labels or other +strings considered "core" in spaCy. This was critical in v1, which hadn't +introduced hashing yet. Since v2 it's important for items in `spacy.attrs`, +especially lexeme flags, but is otherwise only maintained for backwards +compatibility. + +You can call `strings["mystring"]` with a string the `StringStore` has never seen +before and it will return a hash. But in order to do the reverse operation, you +need to call `strings.add("mystring")` first. Without a call to `add` the +string will not be interned. + +Example: + +``` +from spacy.strings import StringStore + +ss = StringStore() +hashval = ss["spacy"] # 10639093010105930009 +try: + # this won't work + ss[hashval] +except KeyError: + print(f"key {hashval} unknown in the StringStore.") + +ss.add("spacy") +assert ss[hashval] == "spacy" # it works now + +# There is no `.keys` property, but you can iterate over keys +# The empty string will never be in the list of keys +for key in ss: + print(key) +``` + +In normal use nothing is ever removed from the `StringStore`. In theory this +means that if you do something like iterate through all hex values of a certain +length you can have explosive memory usage. In practice this has never been an +issue. (Note that this is also different from using `sys.intern` to intern +Python strings, which does not guarantee they won't be garbage collected later.) + +Strings are stored in the `StringStore` in a peculiar way: each string uses a +union that is either an eight-byte `char[]` or a `char*`. Short strings are +stored directly in the `char[]`, while longer strings are stored in allocated +memory and prefixed with their length. This is a strategy to reduce indirection +and memory fragmentation. See `decode_Utf8Str` and `_allocate` in +`strings.pyx` for the implementation. + +### When to Use the StringStore? + +While you can ignore the `StringStore` in many cases, there are situations where +you should make use of it to avoid errors. + +Any time you introduce a string that may be set on a `Doc` field that has a hash, +you should add the string to the `StringStore`. This mainly happens when adding +labels in components, but there are some other cases: + +- syntax iterators, mainly `get_noun_chunks` +- external data used in components, like the `KnowledgeBase` in the `entity_linker` +- labels used in tests + +## Vocab + +The `Vocab` is a core component of a `Language` pipeline. Its main function is +to manage `Lexeme`s, which are structs that contain information about a token +that depends only on its surface form, without context. 
`Lexeme`s store much of +the data associated with `Token`s. As a side effect of this the `Vocab` also +manages the `StringStore` for a pipeline and a grab-bag of other data. + +These are things stored in the vocab: + +- `Lexeme`s +- `StringStore` +- `Morphology`: manages info used in `MorphAnalysis` objects +- `vectors`: basically a dict for word vectors +- `lookups`: language specific data like lemmas +- `writing_system`: language specific metadata +- `get_noun_chunks`: a syntax iterator +- lex attribute getters: functions like `is_punct`, set in language defaults +- `cfg`: **not** the pipeline config, this is mostly unused +- `_unused_object`: Formerly an unused object, kept around until v4 for compatability + +Some of these, like the Morphology and Vectors, are complex enough that they +need their own explanations. Here we'll just look at Vocab-specific items. + +### Lexemes + +A `Lexeme` is a type that mainly wraps a `LexemeC`, a struct consisting of ints +that identify various context-free token attributes. Lexemes are the core data +of the `Vocab`, and can be accessed using `__getitem__` on the `Vocab`. The memory +for storing `LexemeC` objects is managed by a pool that belongs to the `Vocab`. + +Note that `__getitem__` on the `Vocab` works much like the `StringStore`, in +that it accepts a hash or id, with one important difference: if you do a lookup +using a string, that value is added to the `StringStore` automatically. + +The attributes stored in a `LexemeC` are: + +- orth (the raw text) +- lower +- norm +- shape +- prefix +- suffix + +Most of these are straightforward. All of them can be customized, and (except +`orth`) probably should be since the defaults are based on English, but in +practice this is rarely done at present. + +### Lookups + +This is basically a dict of dicts, implemented using a `Table` for each +sub-dict, that stores lemmas and other language-specific lookup data. + +A `Table` is a subclass of `OrderedDict` used for string-to-string data. It uses +Bloom filters to speed up misses and has some extra serialization features. +Tables are not used outside of the lookups. + +### Lex Attribute Getters + +Lexical Attribute Getters like `is_punct` are defined on a per-language basis, +much like lookups, but take the form of functions rather than string-to-string +dicts, so they're stored separately. + +### Writing System + +This is a dict with three attributes: + +- `direction`: ltr or rtl (default ltr) +- `has_case`: bool (default `True`) +- `has_letters`: bool (default `True`, `False` only for CJK for now) + +Currently these are not used much - the main use is that `direction` is used in +visualizers, though `rtl` doesn't quite work (see +[#4854](https://github.com/explosion/spaCy/issues/4854)). In the future they +could be used when choosing hyperparameters for subwords, controlling word +shape generation, and similar tasks. + +### Other Vocab Members + +The Vocab is kind of the default place to store things from `Language.defaults` +that don't belong to the Tokenizer. The following properties are in the Vocab +just because they don't have anywhere else to go. 
+ +- `get_noun_chunks` +- `cfg`: This is a dict that just stores `oov_prob` (hardcoded to `-20`) +- `_unused_object`: Leftover C member, should be removed in next major version + + diff --git a/extra/example_data/ner_example_data/README.md b/extra/example_data/ner_example_data/README.md index af70694f5..3c6a4a86b 100644 --- a/extra/example_data/ner_example_data/README.md +++ b/extra/example_data/ner_example_data/README.md @@ -1,7 +1,25 @@ ## Examples of NER/IOB data that can be converted with `spacy convert` -spacy JSON training files were generated with: +To convert an IOB file to `.spacy` ([`DocBin`](https://spacy.io/api/docbin)) +for spaCy v3: +```bash +python -m spacy convert -c iob -s -n 10 -b en_core_web_sm file.iob . ``` + +See all the `spacy convert` options: https://spacy.io/api/cli#convert + +--- + +The spaCy v2 JSON training files were generated using **spaCy v2** with: + +```bash python -m spacy convert -c iob -s -n 10 -b en file.iob ``` + +To convert an existing JSON training file to `.spacy` for spaCy v3, convert +with **spaCy v3**: + +```bash +python -m spacy convert file.json . +``` diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt index 3702ad131..d58da9c4a 100644 --- a/licenses/3rd_party_licenses.txt +++ b/licenses/3rd_party_licenses.txt @@ -43,8 +43,8 @@ scikit-learn * Files: scorer.py -The following implementation of roc_auc_score() is adapted from -scikit-learn, which is distributed under the following license: +The implementation of roc_auc_score() is adapted from scikit-learn, which is +distributed under the following license: New BSD License @@ -77,3 +77,53 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +pyvi +---- + +* Files: lang/vi/__init__.py + +The MIT License (MIT) +Copyright (c) 2016 Viet-Trung Tran + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +importlib_metadata +------------------ + +* Files: util.py + +The implementation of packages_distributions() is adapted from +importlib_metadata, which is distributed under the following license: + +Copyright 2017-2019 Jason R. Coombs, Barry Warsaw + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/netlify.toml b/netlify.toml index deebe4283..ddcd0ca6c 100644 --- a/netlify.toml +++ b/netlify.toml @@ -2,9 +2,8 @@ redirects = [ # Netlify {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true }, # Subdomain for branches - {from = "https://nightly.spacy.io/*", to="https://nightly-spacy-io.spacy.io/:splat", force = true, status = 200}, - # TODO: update this with the v2 branch build once v3 is live (status = 200) - {from = "https://v2.spacy.io/*", to="https://spacy.io/:splat", force = true}, + {from = "https://nightly.spacy.io/*", to="https://spacy.io/:splat", force = true}, + {from = "https://v2.spacy.io/*", to="https://v2-spacy-io.spacy.io/:splat", force = true, status = 200}, # Old subdomains {from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true}, diff --git a/pyproject.toml b/pyproject.toml index 3113cf6c5..f81484d43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [build-system] requires = [ "setuptools", - "cython>=0.25", + "cython>=0.25,<3.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0,<8.1.0", + "thinc>=8.0.12,<8.1.0", "blis>=0.4.0,<0.8.0", "pathy", "numpy>=1.15.0", diff --git a/requirements.txt b/requirements.txt index 72f6b001f..36cf5c58e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,35 @@ # Our libraries -spacy-legacy>=3.0.0.dev0,<3.1.0 +spacy-legacy>=3.0.8,<3.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0,<8.1.0 +thinc>=8.0.12,<8.1.0 blis>=0.4.0,<0.8.0 -ml_datasets==0.2.0a0 +ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.8.0,<1.1.0 -srsly>=2.3.0,<3.0.0 -catalogue>=2.0.1,<2.1.0 -typer>=0.3.0,<0.4.0 -pathy +wasabi>=0.8.1,<1.1.0 +srsly>=2.4.1,<3.0.0 +catalogue>=2.0.6,<2.1.0 +typer>=0.3.0,<0.5.0 +pathy>=0.3.5 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.1,<1.8.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 # Official Python utilities setuptools packaging>=20.0 -importlib_metadata>=0.20; python_version < "3.8" -typing_extensions>=3.7.4; python_version < "3.8" +typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" # Development dependencies -cython>=0.25 -pytest>=4.6.5 +pre-commit>=2.13.0 +cython>=0.25,<3.0 +pytest>=5.2.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 -flake8>=3.5.0,<3.6.0 -hypothesis +flake8>=3.8.0,<3.10.0 +hypothesis>=3.27.0,<7.0.0 +mypy>=0.910 +types-dataclasses>=0.1.3; python_version < "3.7" +types-mock>=0.1.1 +types-requests diff --git a/setup.cfg b/setup.cfg index b753763f4..dc31228e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,76 +21,87 @@ classifiers = Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 Topic :: Scientific/Engineering +project_urls = + Release notes = https://github.com/explosion/spaCy/releases + Source = https://github.com/explosion/spaCy [options] zip_safe = false include_package_data = true python_requires = >=3.6 setup_requires = - 
cython>=0.25 + cython>=0.25,<3.0 numpy>=1.15.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0,<8.1.0 + thinc>=8.0.12,<8.1.0 install_requires = # Our libraries - spacy-legacy>=3.0.0.dev0,<3.1.0 + spacy-legacy>=3.0.8,<3.1.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0,<8.1.0 + thinc>=8.0.12,<8.1.0 blis>=0.4.0,<0.8.0 - wasabi>=0.8.0,<1.1.0 - srsly>=2.3.0,<3.0.0 - catalogue>=2.0.1,<2.1.0 - typer>=0.3.0,<0.4.0 - pathy + wasabi>=0.8.1,<1.1.0 + srsly>=2.4.1,<3.0.0 + catalogue>=2.0.6,<2.1.0 + typer>=0.3.0,<0.5.0 + pathy>=0.3.5 # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.7.1,<1.8.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 # Official Python utilities setuptools packaging>=20.0 - importlib_metadata>=0.20; python_version < "3.8" - typing_extensions>=3.7.4; python_version < "3.8" + typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8" [options.entry_points] console_scripts = - spacy = spacy.cli:app + spacy = spacy.cli:setup_cli [options.extras_require] lookups = - spacy_lookups_data>=1.0.0rc0,<1.1.0 + spacy_lookups_data>=1.0.2,<1.1.0 transformers = - spacy_transformers>=1.0.0rc0,<1.1.0 + spacy_transformers>=1.0.1,<1.2.0 ray = spacy_ray>=0.1.0,<1.0.0 cuda = - cupy>=5.0.0b4,<9.0.0 + cupy>=5.0.0b4,<10.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<9.0.0 + cupy-cuda80>=5.0.0b4,<10.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<9.0.0 + cupy-cuda90>=5.0.0b4,<10.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<9.0.0 + cupy-cuda91>=5.0.0b4,<10.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<9.0.0 + cupy-cuda92>=5.0.0b4,<10.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<9.0.0 + cupy-cuda100>=5.0.0b4,<10.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<9.0.0 + cupy-cuda101>=5.0.0b4,<10.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<9.0.0 + cupy-cuda102>=5.0.0b4,<10.0.0 cuda110 = - cupy-cuda110>=5.0.0b4,<9.0.0 + cupy-cuda110>=5.0.0b4,<10.0.0 cuda111 = - cupy-cuda111>=5.0.0b4,<9.0.0 + cupy-cuda111>=5.0.0b4,<10.0.0 +cuda112 = + cupy-cuda112>=5.0.0b4,<10.0.0 +cuda113 = + cupy-cuda113>=5.0.0b4,<10.0.0 +cuda114 = + cupy-cuda114>=5.0.0b4,<10.0.0 +apple = + thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.4.9 @@ -107,7 +118,7 @@ universal = false formats = gztar [flake8] -ignore = E203, E266, E501, E731, W503, E741 +ignore = E203, E266, E501, E731, W503, E741, F541 max-line-length = 80 select = B,C,E,F,W,T4,B9 exclude = @@ -118,9 +129,11 @@ exclude = [tool:pytest] markers = - slow + slow: mark a test as slow + issue: reference specific issue [mypy] ignore_missing_imports = True no_implicit_optional = True plugins = pydantic.mypy, thinc.mypy +allow_redefinition = True diff --git a/setup.py b/setup.py index 3904593dc..dcfa98cfa 100755 --- a/setup.py +++ b/setup.py @@ -208,12 +208,11 @@ def setup_package(): ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES) setup( - name="spacy-nightly", + name="spacy", packages=PACKAGES, version=about["__version__"], ext_modules=ext_modules, cmdclass={"build_ext": build_ext_subclass}, - include_dirs=include_dirs, package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, ) diff --git a/spacy/__init__.py b/spacy/__init__.py index aac1db289..ca47edc94 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,10 +1,11 @@ from typing import Union, Iterable, Dict, Any from pathlib import Path -import warnings import sys -warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa 
-warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa +# set library-specific custom warning handling before doing anything else +from .errors import setup_default_warnings + +setup_default_warnings() # noqa: E402 # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 @@ -28,6 +29,8 @@ if sys.maxunicode == 65535: def load( name: Union[str, Path], + *, + vocab: Union[Vocab, bool] = True, disable: Iterable[str] = util.SimpleFrozenList(), exclude: Iterable[str] = util.SimpleFrozenList(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), @@ -35,6 +38,7 @@ def load( """Load a spaCy model from an installed package or a local path. name (str): Package name or model path. + vocab (Vocab): A Vocab object. If True, a vocab is created. disable (Iterable[str]): Names of pipeline components to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. @@ -44,7 +48,9 @@ def load( keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. """ - return util.load_model(name, disable=disable, exclude=exclude, config=config) + return util.load_model( + name, vocab=vocab, disable=disable, exclude=exclude, config=config + ) def blank( @@ -52,7 +58,7 @@ def blank( *, vocab: Union[Vocab, bool] = True, config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), - meta: Dict[str, Any] = util.SimpleFrozenDict() + meta: Dict[str, Any] = util.SimpleFrozenDict(), ) -> Language: """Create a blank nlp object for a given language code. @@ -65,4 +71,4 @@ def blank( LangClass = util.get_lang_class(name) # We should accept both dot notation and nested dict here for consistency config = util.dot_to_dict(config) - return LangClass.from_config(config, meta=meta) + return LangClass.from_config(config, vocab=vocab, meta=meta) diff --git a/spacy/about.py b/spacy/about.py index 5eaf3c224..e6846f8d4 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off -__title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev21" +__title__ = "spacy" +__version__ = "3.1.4" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index b15db7599..9122de17b 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -74,7 +74,6 @@ IDS = { "SUFFIX": SUFFIX, "LENGTH": LENGTH, - "CLUSTER": CLUSTER, "LEMMA": LEMMA, "POS": POS, "TAG": TAG, @@ -85,9 +84,7 @@ IDS = { "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, - "SENT_END": SENT_END, "SPACY": SPACY, - "PROB": PROB, "LANG": LANG, "MORPH": MORPH, "IDX": IDX diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 7368bcef3..fd8da262e 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -9,6 +9,7 @@ from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 from .train import train_cli # noqa: F401 +from .assemble import assemble_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .debug_config import debug_config # noqa: F401 @@ -29,9 +30,9 @@ from .project.document import project_document # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) def link(*args, **kwargs): - """As of spaCy 
v3.0, symlinks like "en" are deprecated. You can load trained + """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained pipeline packages using their full names or from a directory path.""" msg.warn( - "As of spaCy v3.0, model symlinks are deprecated. You can load trained " + "As of spaCy v3.0, model symlinks are not supported anymore. You can load trained " "pipeline packages using their full names or from a directory path." ) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 4012737cf..fb680d888 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,8 +1,9 @@ -from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING +from typing import Dict, Any, Union, List, Optional, Tuple, Iterable +from typing import TYPE_CHECKING, overload import sys import shutil from pathlib import Path -from wasabi import msg +from wasabi import msg, Printer import srsly import hashlib import typer @@ -11,25 +12,30 @@ from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager from thinc.api import Config, ConfigValidationError, require_gpu +from thinc.util import has_cupy, gpu_is_available from configparser import InterpolationError import os +from ..compat import Literal from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import is_compatible_version, ENV_VARS +from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS from .. import about if TYPE_CHECKING: from pathy import Pathy # noqa: F401 +SDIST_SUFFIX = ".tar.gz" +WHEEL_SUFFIX = "-py3-none-any.whl" + PROJECT_FILE = "project.yml" PROJECT_LOCK = "project.lock" COMMAND = "python -m spacy" NAME = "spacy" HELP = """spaCy Command-line Interface -DOCS: https://nightly.spacy.io/api/cli +DOCS: https://spacy.io/api/cli """ PROJECT_HELP = f"""Command-line interface for spaCy projects and templates. You'd typically start by cloning a project template to a local directory and @@ -108,26 +114,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]: value = "true" else: value = args.pop(0) - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? - try: - result[opt] = srsly.json_loads(value) - except ValueError: - result[opt] = str(value) + result[opt] = _parse_override(value) else: msg.fail(f"{err}: name should start with --", exits=1) return result -def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: +def _parse_override(value: Any) -> Any: + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? + try: + return srsly.json_loads(value) + except ValueError: + return str(value) + + +def load_project_config( + path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() +) -> Dict[str, Any]: """Load the project.yml file from a directory and validate it. 
Also make sure that all directories defined in the config exist. path (Path): The path to the project directory. interpolate (bool): Whether to substitute project variables. + overrides (Dict[str, Any]): Optional config overrides. RETURNS (Dict[str, Any]): The loaded project.yml. """ config_path = path / PROJECT_FILE @@ -151,20 +164,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: if not dir_path.exists(): dir_path.mkdir(parents=True) if interpolate: - err = "project.yml validation error" + err = f"{PROJECT_FILE} validation error" with show_validation_error(title=err, hint_fill=False): - config = substitute_project_variables(config) + config = substitute_project_variables(config, overrides) return config -def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): - key = "vars" +def substitute_project_variables( + config: Dict[str, Any], + overrides: Dict[str, Any] = SimpleFrozenDict(), + key: str = "vars", + env_key: str = "env", +) -> Dict[str, Any]: + """Interpolate variables in the project file using the config system. + + config (Dict[str, Any]): The project config. + overrides (Dict[str, Any]): Optional config overrides. + key (str): Key containing variables in project config. + env_key (str): Key containing environment variable mapping in project config. + RETURNS (Dict[str, Any]): The interpolated project config. + """ config.setdefault(key, {}) - config[key].update(overrides) + config.setdefault(env_key, {}) + # Substitute references to env vars with their values + for config_var, env_var in config[env_key].items(): + config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) # Need to put variables in the top scope again so we can have a top-level # section "project" (otherwise, a list of commands in the top scope wouldn't) # be allowed by Thinc's config system - cfg = Config({"project": config, key: config[key]}) + cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) + cfg = Config().from_str(cfg.to_str(), overrides=overrides) interpolated = cfg.interpolate() return dict(interpolated["project"]) @@ -233,15 +262,16 @@ def get_checksum(path: Union[Path, str]) -> str: RETURNS (str): The checksum. """ path = Path(path) + if not (path.is_file() or path.is_dir()): + msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) if path.is_file(): return hashlib.md5(Path(path).read_bytes()).hexdigest() - if path.is_dir(): + else: # TODO: this is currently pretty slow dir_checksum = hashlib.md5() for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): dir_checksum.update(sub_file.read_bytes()) return dir_checksum.hexdigest() - msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) @contextmanager @@ -369,7 +399,15 @@ def git_checkout( cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories - shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) + try: + source_path = tmp_dir / Path(subpath) + if not is_subpath_of(tmp_dir, source_path): + err = f"'{subpath}' is a path outside of the cloned repository." + msg.fail(err, repo, exits=1) + shutil.copytree(str(source_path), str(dest)) + except FileNotFoundError: + err = f"Can't clone {subpath}. 
Make sure the directory exists in the repo (branch '{branch}')" + msg.fail(err, repo, exits=1) def git_sparse_checkout(repo, subpath, dest, branch): @@ -414,8 +452,14 @@ def git_sparse_checkout(repo, subpath, dest, branch): # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" run_command(cmd, capture=True) - # We need Path(name) to make sure we also support subdirectories - shutil.move(str(tmp_dir / Path(subpath)), str(dest)) + + # Get a subdirectory of the cloned path, if appropriate + source_path = tmp_dir / Path(subpath) + if not is_subpath_of(tmp_dir, source_path): + err = f"'{subpath}' is a path outside of the cloned repository." + msg.fail(err, repo, exits=1) + + shutil.move(str(source_path), str(dest)) def get_git_version( @@ -427,12 +471,15 @@ def get_git_version( RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. """ - ret = run_command("git --version", capture=True) + try: + ret = run_command("git --version", capture=True) + except: + raise RuntimeError(error) stdout = ret.stdout.strip() if not stdout or not stdout.startswith("git version"): - return (0, 0) + return 0, 0 version = stdout[11:].strip().split(".") - return (int(version[0]), int(version[1])) + return int(version[0]), int(version[1]) def _http_to_git(repo: str) -> str: @@ -446,6 +493,29 @@ def _http_to_git(repo: str) -> str: return repo +def is_subpath_of(parent, child): + """ + Check whether `child` is a path contained within `parent`. + """ + # Based on https://stackoverflow.com/a/37095733 . + + # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so + # we can stop using crusty old os.path functions. + parent_realpath = os.path.realpath(parent) + child_realpath = os.path.realpath(child) + return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath + + +@overload +def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: + ... + + +@overload +def string_to_list(value: str, intify: Literal[True]) -> List[int]: + ... + + def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]: """Parse a comma-separated string to a list and account for various formatting options. Mostly used to handle CLI arguments that take a list of @@ -456,7 +526,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in RETURNS (Union[List[str], List[int]]): A list of strings or ints. 
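The `is_subpath_of()` helper above is what `git_checkout` and `git_sparse_checkout` now use to reject `subpath` values that point outside the cloned repository. A standalone sketch of the same `realpath` + `commonpath` check, with hypothetical paths:

```python
import os

def is_subpath_of(parent: str, child: str) -> bool:
    # Resolve symlinks and "..", then compare the common prefix with the parent.
    parent_realpath = os.path.realpath(parent)
    child_realpath = os.path.realpath(child)
    return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath

print(is_subpath_of("/tmp/clone", "/tmp/clone/pipelines/ner"))   # True
print(is_subpath_of("/tmp/clone", "/tmp/clone/../etc/passwd"))   # False: escapes the clone
```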
""" if not value: - return [] + return [] # type: ignore[return-value] if value.startswith("[") and value.endswith("]"): value = value[1:-1] result = [] @@ -468,15 +538,21 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in p = p[1:-1] p = p.strip() if intify: - p = int(p) + p = int(p) # type: ignore[assignment] result.append(p) return result -def setup_gpu(use_gpu: int) -> None: +def setup_gpu(use_gpu: int, silent=None) -> None: """Configure the GPU and log info.""" + if silent is None: + local_msg = Printer() + else: + local_msg = Printer(no_print=silent, pretty=not silent) if use_gpu >= 0: - msg.info(f"Using GPU: {use_gpu}") + local_msg.info(f"Using GPU: {use_gpu}") require_gpu(use_gpu) else: - msg.info("Using CPU") + local_msg.info("Using CPU") + if has_cupy and gpu_is_available(): + local_msg.info("To switch to GPU 0, use the option: --gpu-id 0") diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py new file mode 100644 index 000000000..1cfa290a3 --- /dev/null +++ b/spacy/cli/assemble.py @@ -0,0 +1,57 @@ +from typing import Optional +from pathlib import Path +from wasabi import msg +import typer +import logging + +from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error +from ._util import import_code +from .. import util +from ..util import get_sourced_components, load_model_from_config + + +@app.command( + "assemble", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def assemble_cli( + # fmt: off + ctx: typer.Context, # This is only used to read additional arguments + config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), + output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + # fmt: on +): + """ + Assemble a spaCy pipeline from a config file. The config file includes + all settings for initializing the pipeline. To override settings in the + config, e.g. settings that point to local paths or that you want to + experiment with, you can override them as command line options. The + --code argument lets you pass in a Python file that can be used to + register custom functions that are referenced in the config. 
+ + DOCS: https://spacy.io/api/cli#assemble + """ + util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + # Make sure all files and paths exists if they are needed + if not config_path or (str(config_path) != "-" and not config_path.exists()): + msg.fail("Config file not found", config_path, exits=1) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides, interpolate=False) + msg.divider("Initializing pipeline") + nlp = load_model_from_config(config, auto_fill=True) + config = config.interpolate() + sourced = get_sourced_components(config) + # Make sure that listeners are defined before initializing further + nlp._link_components() + with nlp.select_pipes(disable=[*sourced]): + nlp.initialize() + msg.good("Initialized pipeline") + msg.divider("Serializing to disk") + if output_path is not None and not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory: {output_path}") + nlp.to_disk(output_path) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 2e52bfa41..04eb7078f 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,4 +1,4 @@ -from typing import Optional, Any, List, Union +from typing import Callable, Iterable, Mapping, Optional, Any, List, Union from enum import Enum from pathlib import Path from wasabi import Printer @@ -9,7 +9,7 @@ import itertools from ._util import app, Arg, Opt from ..training import docs_to_json -from ..tokens import DocBin +from ..tokens import Doc, DocBin from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs from ..training.converters import conllu_to_docs @@ -19,10 +19,10 @@ from ..training.converters import conllu_to_docs # entry to this dict with the file extension mapped to the converter function # imported from /converters. 
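Per the comment above, a converter is just a callable keyed by file extension that yields `Doc` objects. A hypothetical entry could look like this (the `tsv` format and function below are made up for illustration):

```python
from typing import Iterable
from spacy.tokens import Doc
from spacy.vocab import Vocab

def tsv_to_docs(input_data: str, **kwargs) -> Iterable[Doc]:
    # Treat each non-empty line as one pre-tokenized document.
    vocab = Vocab()
    for line in input_data.splitlines():
        if line.strip():
            yield Doc(vocab, words=line.split("\t"))

# Registering it would then be a one-liner: CONVERTERS["tsv"] = tsv_to_docs
```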
-CONVERTERS = { +CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = { "conllubio": conllu_to_docs, "conllu": conllu_to_docs, - "conll": conllu_to_docs, + "conll": conll_ner_to_docs, "ner": conll_ner_to_docs, "iob": iob_to_docs, "json": json_to_docs, @@ -64,21 +64,18 @@ def convert_cli( is written to stdout, so you can pipe them forward to a JSON file: $ spacy convert some_file.conllu --file-type json > some_file.json - DOCS: https://nightly.spacy.io/api/cli#convert + DOCS: https://spacy.io/api/cli#convert """ - if isinstance(file_type, FileTypes): - # We get an instance of the FileTypes from the CLI so we need its string value - file_type = file_type.value input_path = Path(input_path) - output_dir = "-" if output_dir == Path("-") else output_dir + output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir silent = output_dir == "-" msg = Printer(no_print=silent) - verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map) + verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map) converter = _get_converter(msg, converter, input_path) convert( input_path, output_dir, - file_type=file_type, + file_type=file_type.value, n_sents=n_sents, seg_sents=seg_sents, model=model, @@ -94,7 +91,7 @@ def convert_cli( def convert( - input_path: Union[str, Path], + input_path: Path, output_dir: Union[str, Path], *, file_type: str = "json", @@ -108,14 +105,16 @@ def convert( lang: Optional[str] = None, concatenate: bool = False, silent: bool = True, - msg: Optional[Printer], + msg: Optional[Printer] = None, ) -> None: + input_path = Path(input_path) if not msg: msg = Printer(no_print=silent) ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] - for input_loc in walk_directory(Path(input_path), converter): - input_data = input_loc.open("r", encoding="utf-8").read() + for input_loc in walk_directory(input_path, converter): + with input_loc.open("r", encoding="utf-8") as infile: + input_data = infile.read() # Use converter function to convert data func = CONVERTERS[converter] docs = func( @@ -140,7 +139,7 @@ def convert( else: db = DocBin(docs=docs, store_user_data=True) len_docs = len(db) - data = db.to_bytes() + data = db.to_bytes() # type: ignore[assignment] if output_dir == "-": _print_docs_to_stdout(data, file_type) else: @@ -219,13 +218,12 @@ def walk_directory(path: Path, converter: str) -> List[Path]: def verify_cli_args( msg: Printer, - input_path: Union[str, Path], + input_path: Path, output_dir: Union[str, Path], - file_type: FileTypes, + file_type: str, converter: str, ner_map: Optional[Path], ): - input_path = Path(input_path) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": msg.fail( f"Can't write .{file_type} data to stdout. 
Please specify an output directory.", @@ -243,13 +241,13 @@ def verify_cli_args( msg.fail("No input files in directory", input_path, exits=1) file_types = list(set([loc.suffix[1:] for loc in input_locs])) if converter == "auto" and len(file_types) >= 2: - file_types = ",".join(file_types) - msg.fail("All input files must be same type", file_types, exits=1) + file_types_str = ",".join(file_types) + msg.fail("All input files must be same type", file_types_str, exits=1) if converter != "auto" and converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) -def _get_converter(msg, converter, input_path): +def _get_converter(msg, converter, input_path: Path): if input_path.is_dir(): input_path = walk_directory(input_path, converter)[0] if converter == "auto": @@ -268,6 +266,6 @@ def _get_converter(msg, converter, input_path): msg.warn( "Can't automatically detect NER format. " "Conversion may not succeed. " - "See https://nightly.spacy.io/api/cli#convert" + "See https://spacy.io/api/cli#convert" ) return converter diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 5f9759c8c..56ee12336 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -20,7 +20,7 @@ def debug_config_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") # fmt: on @@ -34,7 +34,7 @@ def debug_config_cli( as command line options. For instance, --training.batch_size 128 overrides the value of "batch_size" in the block "[training]". - DOCS: https://nightly.spacy.io/api/cli#debug-config + DOCS: https://spacy.io/api/cli#debug-config """ overrides = parse_config_overrides(ctx.args) import_code(code_path) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c04647fde..3143e2c62 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,4 +1,5 @@ -from typing import List, Sequence, Dict, Any, Tuple, Optional +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union +from typing import cast, overload from pathlib import Path from collections import Counter import sys @@ -13,8 +14,11 @@ from ..training.initialize import get_sourced_components from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals.nonproj import DELIMITER +from ..pipeline import Morphologizer +from ..morphology import Morphology from ..language import Language from ..util import registry, resolve_dot_names +from ..compat import Literal from .. 
import util @@ -39,7 +43,7 @@ def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), @@ -50,7 +54,7 @@ def debug_data_cli( useful stats, and can help you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. - DOCS: https://nightly.spacy.io/api/cli#debug-data + DOCS: https://spacy.io/api/cli#debug-data """ if ctx.command.name == "debug-data": msg.warn( @@ -99,13 +103,14 @@ def debug_data( # Create the gold corpus to be able to better analyze data dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + + nlp.initialize(lambda: train_corpus(nlp)) + msg.good("Pipeline can be initialized with data") + train_dataset = list(train_corpus(nlp)) dev_dataset = list(dev_corpus(nlp)) msg.good("Corpus is loadable") - nlp.initialize(lambda: train_dataset) - msg.good("Pipeline can be initialized with data") - # Create all gold data here to avoid iterating over the train_dataset constantly gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True) gold_train_unpreprocessed_data = _compile_gold( @@ -171,8 +176,9 @@ def debug_data( ) n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) msg.warn( - "{} words in training data without vectors ({:0.2f}%)".format( - n_missing_vectors, n_missing_vectors / gold_train_data["n_words"] + "{} words in training data without vectors ({:.0f}%)".format( + n_missing_vectors, + 100 * (n_missing_vectors / gold_train_data["n_words"]), ), ) msg.text( @@ -194,44 +200,39 @@ def debug_data( ) label_counts = gold_train_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") - new_labels = [l for l in labels if l not in model_labels] - existing_labels = [l for l in labels if l in model_labels] has_low_data_warning = False has_no_neg_warning = False has_ws_ents_error = False - has_punct_ents_warning = False msg.divider("Named Entity Recognition") - msg.info( - f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)" - ) + msg.info(f"{len(model_labels)} label(s)") missing_values = label_counts["-"] msg.text(f"{missing_values} missing value(s) (tokens with '-' label)") - for label in new_labels: + for label in labels: if len(label) == 0: - msg.fail("Empty label found in new labels") - if new_labels: - labels_with_counts = [ - (label, count) - for label, count in label_counts.most_common() - if label != "-" - ] - labels_with_counts = _format_labels(labels_with_counts, counts=True) - msg.text(f"New: {labels_with_counts}", show=verbose) - if existing_labels: - msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose) + msg.fail("Empty label found in train data") + labels_with_counts = [ + (label, count) + for label, count in label_counts.most_common() + if label != "-" + ] 
+ labels_with_counts = _format_labels(labels_with_counts, counts=True) + msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose) + missing_labels = model_labels - labels + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." + ) if gold_train_data["ws_ents"]: msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans") has_ws_ents_error = True - if gold_train_data["punct_ents"]: - msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation") - has_punct_ents_warning = True - - for label in new_labels: + for label in labels: if label_counts[label] <= NEW_LABEL_THRESHOLD: msg.warn( - f"Low number of examples for new label '{label}' ({label_counts[label]})" + f"Low number of examples for label '{label}' ({label_counts[label]})" ) has_low_data_warning = True @@ -247,8 +248,6 @@ def debug_data( msg.good("Examples without occurrences available for all labels") if not has_ws_ents_error: msg.good("No entities consisting of or starting/ending with whitespace") - if not has_punct_ents_warning: - msg.good("No entities consisting of or starting/ending with punctuation") if has_low_data_warning: msg.text( @@ -265,55 +264,99 @@ def debug_data( ) if has_ws_ents_error: msg.text( - "As of spaCy v2.1.0, entity spans consisting of or starting/ending " + "Entity spans consisting of or starting/ending " "with whitespace characters are considered invalid." ) - if has_punct_ents_warning: - msg.text( - "Entity spans consisting of or starting/ending " - "with punctuation can not be trained with a noise level > 0." - ) - if "textcat" in factory_names: - msg.divider("Text Classification") - labels = [label for label in gold_train_data["cats"]] - model_labels = _get_labels_from_model(nlp, "textcat") - new_labels = [l for l in labels if l not in model_labels] - existing_labels = [l for l in labels if l in model_labels] - msg.info( - f"Text Classification: {len(new_labels)} new label(s), " - f"{len(existing_labels)} existing label(s)" - ) - if new_labels: - labels_with_counts = _format_labels( - gold_train_data["cats"].most_common(), counts=True + msg.divider("Text Classification (Exclusive Classes)") + labels = _get_labels_from_model(nlp, "textcat") + msg.info(f"Text Classification: {len(labels)} label(s)") + msg.text(f"Labels: {_format_labels(labels)}", show=verbose) + missing_labels = labels - set(gold_train_data["cats"]) + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." ) - msg.text(f"New: {labels_with_counts}", show=verbose) - if existing_labels: - msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose) if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): - msg.fail( - f"The train and dev labels are not the same. " + msg.warn( + "Potential train/dev mismatch: the train and dev labels are " + "not the same. " f"Train labels: {_format_labels(gold_train_data['cats'])}. " f"Dev labels: {_format_labels(gold_dev_data['cats'])}." ) - if gold_train_data["n_cats_multilabel"] > 0: - msg.info( - "The train data contains instances without " - "mutually-exclusive classes. Use '--textcat-multilabel' " - "when training." + if len(labels) < 2: + msg.fail( + "The model does not have enough labels. 
'textcat' requires at " + "least two labels due to mutually-exclusive classes, e.g. " + "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary " + "classification task." ) + if ( + gold_train_data["n_cats_bad_values"] > 0 + or gold_dev_data["n_cats_bad_values"] > 0 + ): + msg.fail( + "Unsupported values for cats: the supported values are " + "1.0/True and 0.0/False." + ) + if gold_train_data["n_cats_multilabel"] > 0: + # Note: you should never get here because you run into E895 on + # initialization first. + msg.fail( + "The train data contains instances without mutually-exclusive " + "classes. Use the component 'textcat_multilabel' instead of " + "'textcat'." + ) + if gold_dev_data["n_cats_multilabel"] > 0: + msg.fail( + "The dev data contains instances without mutually-exclusive " + "classes. Use the component 'textcat_multilabel' instead of " + "'textcat'." + ) + + if "textcat_multilabel" in factory_names: + msg.divider("Text Classification (Multilabel)") + labels = _get_labels_from_model(nlp, "textcat_multilabel") + msg.info(f"Text Classification: {len(labels)} label(s)") + msg.text(f"Labels: {_format_labels(labels)}", show=verbose) + missing_labels = labels - set(gold_train_data["cats"]) + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." + ) + if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): + msg.warn( + "Potential train/dev mismatch: the train and dev labels are " + "not the same. " + f"Train labels: {_format_labels(gold_train_data['cats'])}. " + f"Dev labels: {_format_labels(gold_dev_data['cats'])}." + ) + if ( + gold_train_data["n_cats_bad_values"] > 0 + or gold_dev_data["n_cats_bad_values"] > 0 + ): + msg.fail( + "Unsupported values for cats: the supported values are " + "1.0/True and 0.0/False." + ) + if gold_train_data["n_cats_multilabel"] > 0: if gold_dev_data["n_cats_multilabel"] == 0: msg.warn( "Potential train/dev mismatch: the train data contains " "instances without mutually-exclusive classes while the " - "dev data does not." + "dev data contains only instances with mutually-exclusive " + "classes." ) else: - msg.info( + msg.warn( "The train data contains only instances with " - "mutually-exclusive classes." + "mutually-exclusive classes. You can potentially use the " + "component 'textcat' instead of 'textcat_multilabel'." ) if gold_dev_data["n_cats_multilabel"] > 0: msg.fail( @@ -324,14 +367,40 @@ def debug_data( if "tagger" in factory_names: msg.divider("Part-of-speech Tagging") - labels = [label for label in gold_train_data["tags"]] - # TODO: does this need to be updated? - msg.info(f"{len(labels)} label(s) in data") + label_list = [label for label in gold_train_data["tags"]] + model_labels = _get_labels_from_model(nlp, "tagger") + msg.info(f"{len(label_list)} label(s) in train data") + labels = set(label_list) + missing_labels = model_labels - labels + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." 
+ ) labels_with_counts = _format_labels( gold_train_data["tags"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) + if "morphologizer" in factory_names: + msg.divider("Morphologizer (POS+Morph)") + label_list = [label for label in gold_train_data["morphs"]] + model_labels = _get_labels_from_model(nlp, "morphologizer") + msg.info(f"{len(label_list)} label(s) in train data") + labels = set(label_list) + missing_labels = model_labels - labels + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." + ) + labels_with_counts = _format_labels( + gold_train_data["morphs"].most_common(), counts=True + ) + msg.text(labels_with_counts, show=verbose) + if "parser" in factory_names: has_low_data_warning = False msg.divider("Dependency Parsing") @@ -487,15 +556,15 @@ def _compile_gold( nlp: Language, make_proj: bool, ) -> Dict[str, Any]: - data = { + data: Dict[str, Any] = { "ner": Counter(), "cats": Counter(), "tags": Counter(), + "morphs": Counter(), "deps": Counter(), "words": Counter(), "roots": Counter(), "ws_ents": 0, - "punct_ents": 0, "n_words": 0, "n_misaligned_words": 0, "words_missing_vectors": Counter(), @@ -503,6 +572,7 @@ def _compile_gold( "n_nonproj": 0, "n_cycles": 0, "n_cats_multilabel": 0, + "n_cats_bad_values": 0, "texts": set(), } for eg in examples: @@ -529,28 +599,43 @@ def _compile_gold( if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ - ".", - "'", - "!", - "?", - ",", - ]: - # punctuation entity: could be replaced by whitespace when training with noise, - # so add a warning to alert the user to this unexpected side effect. 
- data["punct_ents"] += 1 if label.startswith(("B-", "U-")): combined_label = label.split("-")[1] data["ner"][combined_label] += 1 elif label == "-": data["ner"]["-"] += 1 - if "textcat" in factory_names: + if "textcat" in factory_names or "textcat_multilabel" in factory_names: data["cats"].update(gold.cats) - if list(gold.cats.values()).count(1.0) != 1: + if any(val not in (0, 1) for val in gold.cats.values()): + data["n_cats_bad_values"] += 1 + if list(gold.cats.values()).count(1) != 1: data["n_cats_multilabel"] += 1 if "tagger" in factory_names: tags = eg.get_aligned("TAG", as_string=True) data["tags"].update([x for x in tags if x is not None]) + if "morphologizer" in factory_names: + pos_tags = eg.get_aligned("POS", as_string=True) + morphs = eg.get_aligned("MORPH", as_string=True) + for pos, morph in zip(pos_tags, morphs): + # POS may align (same value for multiple tokens) when morph + # doesn't, so if either is misaligned (None), treat the + # annotation as missing so that truths doesn't end up with an + # unknown morph+POS combination + if pos is None or morph is None: + pass + # If both are unset, the annotation is missing (empty morph + # converted from int is "_" rather than "") + elif pos == "" and morph == "": + pass + # Otherwise, generate the combined label + else: + label_dict = Morphology.feats_to_dict(morph) + if pos: + label_dict[Morphologizer.POS_FEAT] = pos + label = eg.reference.vocab.strings[ + eg.reference.vocab.morphology.add(label_dict) + ] + data["morphs"].update([label]) if "parser" in factory_names: aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj) data["deps"].update([x for x in aligned_deps if x is not None]) @@ -565,10 +650,28 @@ def _compile_gold( return data -def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str: +@overload +def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str: + ... + + +@overload +def _format_labels( + labels: Iterable[Tuple[str, int]], + counts: Literal[True], +) -> str: + ... 
+ + +def _format_labels( + labels: Union[Iterable[str], Iterable[Tuple[str, int]]], + counts: bool = False, +) -> str: if counts: - return ", ".join([f"'{l}' ({c})" for l, c in labels]) - return ", ".join([f"'{l}'" for l in labels]) + return ", ".join( + [f"'{l}' ({c})" for l, c in cast(Iterable[Tuple[str, int]], labels)] + ) + return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)]) def _get_examples_without_label(data: Sequence[Example], label: str) -> int: @@ -584,8 +687,8 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int: return count -def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]: +def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]: if pipe_name not in nlp.pipe_names: return set() pipe = nlp.get_pipe(pipe_name) - return pipe.labels + return set(pipe.labels) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 2e4d9456f..190094d81 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -1,10 +1,11 @@ -from typing import Dict, Any, Optional, Iterable +from typing import Dict, Any, Optional from pathlib import Path +import itertools from spacy.training import Example from spacy.util import resolve_dot_names from wasabi import msg -from thinc.api import fix_random_seed, set_dropout_rate, Adam +from thinc.api import fix_random_seed, set_dropout_rate from thinc.api import Model, data_validation, set_gpu_allocator import typer @@ -40,7 +41,7 @@ def debug_model_cli( Analyze a Thinc model implementation. Includes checks for internal structure and activations during training. - DOCS: https://nightly.spacy.io/api/cli#debug-model + DOCS: https://spacy.io/api/cli#debug-model """ setup_gpu(use_gpu) layers = string_to_list(layers, intify=True) @@ -73,23 +74,24 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) pipe = nlp.get_pipe(component) - if not hasattr(pipe, "model"): - msg.fail( - f"The component '{component}' does not specify an object that holds a Model.", - exits=1, - ) - model = pipe.model - debug_model(config, T, nlp, model, print_settings=print_settings) + + debug_model(config, T, nlp, pipe, print_settings=print_settings) def debug_model( config, resolved_train_config, nlp, - model: Model, + pipe, *, print_settings: Optional[Dict[str, Any]] = None, ): + if not hasattr(pipe, "model"): + msg.fail( + f"The component '{pipe}' does not specify an object that holds a Model.", + exits=1, + ) + model = pipe.model if not isinstance(model, Model): msg.fail( f"Requires a Thinc Model to be analysed, but found {type(model)} instead.", @@ -105,8 +107,6 @@ def debug_model( _print_model(model, print_settings) # STEP 1: Initializing the model and printing again - X = _get_docs() - # The output vector might differ from the official type of the output layer with data_validation(False): try: dot_names = [resolved_train_config["train_corpus"]] @@ -114,15 +114,17 @@ def debug_model( (train_corpus,) = resolve_dot_names(config, dot_names) nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") + examples = list(itertools.islice(train_corpus(nlp), 5)) except ValueError: try: _set_output_dim(nO=7, model=model) with show_validation_error(): - nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) + examples = [Example.from_dict(x, {}) for x in _get_docs()] + nlp.initialize(lambda: examples) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( - "Could not initialize the model: you'll 
have to provide a valid train_corpus argument in the config file.", + "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.", exits=1, ) @@ -131,28 +133,26 @@ def debug_model( _print_model(model, print_settings) # STEP 2: Updating the model and printing again - optimizer = Adam(0.001) set_dropout_rate(model, 0.2) - # ugly hack to deal with Tok2Vec listeners - tok2vec = None - if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": - tok2vec = nlp.get_pipe("tok2vec") - goldY = None + # ugly hack to deal with Tok2Vec/Transformer listeners + upstream_component = None + if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name: + upstream_component = nlp.get_pipe("tok2vec") + if ( + model.has_ref("tok2vec") + and "transformer-listener" in model.get_ref("tok2vec").name + ): + upstream_component = nlp.get_pipe("transformer") for e in range(3): - if tok2vec: - tok2vec.update([Example.from_dict(x, {}) for x in X]) - Y, get_dX = model.begin_update(X) - if goldY is None: - goldY = _simulate_gold(Y) - dY = get_gradient(goldY, Y, model.ops) - get_dX(dY) - model.finish_update(optimizer) + if upstream_component: + upstream_component.update(examples) + pipe.update(examples) if print_settings.get("print_after_training"): msg.divider(f"STEP 2 - after training") _print_model(model, print_settings) # STEP 3: the final prediction - prediction = model.predict(X) + prediction = model.predict([ex.predicted for ex in examples]) if print_settings.get("print_prediction"): msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) @@ -160,19 +160,6 @@ def debug_model( msg.good(f"Succesfully ended analysis - model looks good.") -def get_gradient(goldY, Y, ops): - return ops.asarray(Y) - ops.asarray(goldY) - - -def _simulate_gold(element, counter=1): - if isinstance(element, Iterable): - for i in range(len(element)): - element[i] = _simulate_gold(element[i], counter + i) - return element - else: - return 1 / counter - - def _sentences(): return [ "Apple is looking at buying U.K. startup for $1 billion", @@ -209,11 +196,7 @@ def _print_model(model, print_settings): if dimensions: for name in node.dim_names: - if node.has_dim(name): - msg.info(f" - dim {name}: {node.get_dim(name)}") - else: - msg.info(f" - dim {name}: {node.has_dim(name)}") - + msg.info(f" - dim {name}: {node.maybe_get_dim(name)}") if parameters: for name in node.param_names: if node.has_param(name): diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 5fcac63c0..4ea9a8f0e 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -4,9 +4,9 @@ import sys from wasabi import msg import typer -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX from .. import about -from ..util import is_package, get_base_version, run_command +from ..util import is_package, get_minor_version, run_command from ..errors import OLD_MODEL_SHORTCUTS @@ -19,6 +19,7 @@ def download_cli( ctx: typer.Context, model: str = Arg(..., help="Name of pipeline package to download"), direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"), + sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel") # fmt: on ): """ @@ -28,13 +29,13 @@ def download_cli( additional arguments provided to this command will be passed to `pip install` on package installation. 
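With the new `--sdist` flag, `download()` only swaps the archive suffix inside the URL template; everything else stays the same. For illustration (the package name and version below are examples, not values from this diff):

```python
SDIST_SUFFIX = ".tar.gz"
WHEEL_SUFFIX = "-py3-none-any.whl"
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"

for sdist in (False, True):
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    print(dl_tpl.format(m="en_core_web_sm", v="3.1.0", s=suffix))
# en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl#egg=en_core_web_sm==3.1.0
# en_core_web_sm-3.1.0/en_core_web_sm-3.1.0.tar.gz#egg=en_core_web_sm==3.1.0
```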
- DOCS: https://nightly.spacy.io/api/cli#download + DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES: https://spacy.io/models """ - download(model, direct, *ctx.args) + download(model, direct, sdist, *ctx.args) -def download(model: str, direct: bool = False, *pip_args) -> None: +def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None: if ( not (is_package("spacy") or is_package("spacy-nightly")) and "--no-deps" not in pip_args @@ -48,23 +49,24 @@ def download(model: str, direct: bool = False, *pip_args) -> None: "dependencies, you'll have to install them manually." ) pip_args = pip_args + ("--no-deps",) - dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}" + suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX + dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}" if direct: components = model.split("-") model_name = "".join(components[:-1]) version = components[-1] - download_model(dl_tpl.format(m=model_name, v=version), pip_args) + download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args) else: model_name = model if model in OLD_MODEL_SHORTCUTS: msg.warn( - f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please" + f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please " f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead." ) model_name = OLD_MODEL_SHORTCUTS[model] compatibility = get_compatibility() version = get_version(model_name, compatibility) - download_model(dl_tpl.format(m=model_name, v=version), pip_args) + download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args) msg.good( "Download and installation successful", f"You can now load the package via spacy.load('{model_name}')", @@ -72,7 +74,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None: def get_compatibility() -> dict: - version = get_base_version(about.__version__) + version = get_minor_version(about.__version__) r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( @@ -80,7 +82,7 @@ def get_compatibility() -> dict: f"Couldn't fetch compatibility table. Please find a package for your spaCy " f"installation (v{about.__version__}), and download it manually. " f"For more details, see the documentation: " - f"https://nightly.spacy.io/usage/models", + f"https://spacy.io/usage/models", exits=1, ) comp_table = r.json() @@ -103,8 +105,6 @@ def download_model( filename: str, user_pip_args: Optional[Sequence[str]] = None ) -> None: download_url = about.__download_url__ + "/" + filename - pip_args = ["--no-cache-dir"] - if user_pip_args: - pip_args.extend(user_pip_args) + pip_args = list(user_pip_args) if user_pip_args is not None else [] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] run_command(cmd) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index b14cf2876..0d08d2c5e 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Any, Union from wasabi import Printer from pathlib import Path import re @@ -36,7 +36,7 @@ def evaluate_cli( dependency parses in a HTML file, set as output directory as the displacy_path argument. 
- DOCS: https://nightly.spacy.io/api/cli#evaluate + DOCS: https://spacy.io/api/cli#evaluate """ import_code(code_path) evaluate( @@ -60,10 +60,11 @@ def evaluate( displacy_path: Optional[Path] = None, displacy_limit: int = 25, silent: bool = True, -) -> Scorer: + spans_key: str = "sc", +) -> Dict[str, Any]: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() - setup_gpu(use_gpu) + setup_gpu(use_gpu, silent=silent) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) @@ -90,6 +91,9 @@ def evaluate( "SENT P": "sents_p", "SENT R": "sents_r", "SENT F": "sents_f", + "SPAN P": f"spans_{spans_key}_p", + "SPAN R": f"spans_{spans_key}_r", + "SPAN F": f"spans_{spans_key}_f", "SPEED": "speed", } results = {} @@ -108,31 +112,11 @@ def evaluate( data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") - - if "morph_per_feat" in scores: - if scores["morph_per_feat"]: - print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") - data["morph_per_feat"] = scores["morph_per_feat"] - if "dep_las_per_type" in scores: - if scores["dep_las_per_type"]: - print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") - data["dep_las_per_type"] = scores["dep_las_per_type"] - if "ents_per_type" in scores: - if scores["ents_per_type"]: - print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") - data["ents_per_type"] = scores["ents_per_type"] - if "cats_f_per_type" in scores: - if scores["cats_f_per_type"]: - print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") - data["cats_f_per_type"] = scores["cats_f_per_type"] - if "cats_auc_per_type" in scores: - if scores["cats_auc_per_type"]: - print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) - data["cats_auc_per_type"] = scores["cats_auc_per_type"] + data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] - docs = [ex.predicted for ex in dev_dataset] + docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) render_deps = "parser" in factory_names render_ents = "ner" in factory_names render_parses( @@ -151,6 +135,43 @@ def evaluate( return data +def handle_scores_per_type( + scores: Dict[str, Any], + data: Dict[str, Any] = {}, + *, + spans_key: str = "sc", + silent: bool = False, +) -> Dict[str, Any]: + msg = Printer(no_print=silent, pretty=not silent) + if "morph_per_feat" in scores: + if scores["morph_per_feat"]: + print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") + data["morph_per_feat"] = scores["morph_per_feat"] + if "dep_las_per_type" in scores: + if scores["dep_las_per_type"]: + print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") + data["dep_las_per_type"] = scores["dep_las_per_type"] + if "ents_per_type" in scores: + if scores["ents_per_type"]: + print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") + data["ents_per_type"] = scores["ents_per_type"] + if f"spans_{spans_key}_per_type" in scores: + if scores[f"spans_{spans_key}_per_type"]: + print_prf_per_type( + msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type" + ) + data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"] + if "cats_f_per_type" in scores: + if scores["cats_f_per_type"]: + print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") + data["cats_f_per_type"] = scores["cats_f_per_type"] + if 
"cats_auc_per_type" in scores: + if scores["cats_auc_per_type"]: + print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) + data["cats_auc_per_type"] = scores["cats_auc_per_type"] + return scores + + def render_parses( docs: List[Doc], output_path: Path, @@ -175,10 +196,13 @@ def render_parses( def print_prf_per_type( msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str ) -> None: - data = [ - (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") - for k, v in scores.items() - ] + data = [] + for key, value in scores.items(): + row = [key] + for k in ("p", "r", "f"): + v = value[k] + row.append(f"{v * 100:.2f}" if isinstance(v, (int, float)) else v) + data.append(row) msg.table( data, header=("", "P", "R", "F"), @@ -191,7 +215,10 @@ def print_textcats_auc_per_cat( msg: Printer, scores: Dict[str, Dict[str, float]] ) -> None: msg.table( - [(k, f"{v:.2f}") for k, v in scores.items()], + [ + (k, f"{v:.2f}" if isinstance(v, (float, int)) else v) + for k, v in scores.items() + ], header=("", "ROC AUC"), aligns=("l", "r"), title="Textcat ROC AUC (per label)", diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 12b5d6c17..e6a1cb616 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -15,7 +15,7 @@ def info_cli( model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"), markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"), - exclude: Optional[str] = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"), + exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"), # fmt: on ): """ @@ -23,7 +23,7 @@ def info_cli( print its meta information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. - DOCS: https://nightly.spacy.io/api/cli#info + DOCS: https://spacy.io/api/cli#info """ exclude = string_to_list(exclude) info(model, markdown=markdown, silent=silent, exclude=exclude) @@ -34,7 +34,7 @@ def info( *, markdown: bool = False, silent: bool = True, - exclude: List[str] = None, + exclude: Optional[List[str]] = None, ) -> Union[str, dict]: msg = Printer(no_print=silent, pretty=not silent) if not exclude: @@ -61,7 +61,7 @@ def info( return raw_data -def info_spacy() -> Dict[str, any]: +def info_spacy() -> Dict[str, Any]: """Generate info about the current spaCy intallation. RETURNS (dict): The spaCy info. @@ -105,12 +105,15 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]: def get_markdown( - data: Dict[str, Any], title: Optional[str] = None, exclude: List[str] = None + data: Dict[str, Any], + title: Optional[str] = None, + exclude: Optional[List[str]] = None, ) -> str: """Get data in GitHub-flavoured Markdown format for issues etc. - data (dict or list of tuples): Label/value pairs. - title (str / None): Title, will be rendered as headline 2. + data (Dict[str, Any]): Label/value pairs. + title (str): Optional title, will be rendered as headline 2. + exclude (List[str]): Names of keys to exclude. RETURNS (str): The Markdown string. """ md = MarkdownRenderer() diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index e862454f7..530b38eb3 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -10,7 +10,8 @@ from jinja2 import Template from .. 
import util from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema -from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list +from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND +from ._util import string_to_list, import_code ROOT = Path(__file__).parent / "templates" @@ -27,8 +28,8 @@ class Optimizations(str, Enum): def init_config_cli( # fmt: off output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True), - lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), - pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), + lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), + pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), @@ -41,10 +42,8 @@ def init_config_cli( optimal settings for your use case. This includes the choice of architecture, pretrained weights and related hyperparameters. - DOCS: https://nightly.spacy.io/api/cli#init-config + DOCS: https://spacy.io/api/cli#init-config """ - if isinstance(optimize, Optimizations): # instance of enum from the CLI - optimize = optimize.value pipeline = string_to_list(pipeline) is_stdout = str(output_file) == "-" if not is_stdout and output_file.exists() and not force_overwrite: @@ -56,7 +55,7 @@ def init_config_cli( config = init_config( lang=lang, pipeline=pipeline, - optimize=optimize, + optimize=optimize.value, gpu=gpu, pretraining=pretraining, silent=is_stdout, @@ -70,7 +69,8 @@ def init_fill_config_cli( base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), - diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes") + diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), + code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), # fmt: on ): """ @@ -78,10 +78,11 @@ def init_fill_config_cli( from the default config and will create all objects, check the registered functions for their default values and update the base config. 
This command can be used with a config generated via the training quickstart widget: - https://nightly.spacy.io/usage/training#quickstart + https://spacy.io/usage/training#quickstart - DOCS: https://nightly.spacy.io/api/cli#init-fill-config + DOCS: https://spacy.io/api/cli#init-fill-config """ + import_code(code_path) fill_config(output_file, base_path, pretraining=pretraining, diff=diff) @@ -103,6 +104,10 @@ def fill_config( # config result is a valid config nlp = util.load_model_from_config(nlp.config) filled = nlp.config + # If we have sourced components in the base config, those will have been + # replaced with their actual config after loading, so we have to re-add them + sourced = util.get_sourced_components(config) + filled["components"].update(sourced) if pretraining: validate_config_for_pretrain(filled, msg) pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) @@ -140,7 +145,8 @@ def init_config( template = Template(f.read()) # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] - reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict() + defaults = RECOMMENDATIONS["__default__"] + reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() variables = { "lang": lang, "components": pipeline, @@ -167,7 +173,9 @@ def init_config( "Pipeline": ", ".join(pipeline), "Optimize for": optimize, "Hardware": variables["hardware"].upper(), - "Transformer": template_vars.transformer.get("name", False), + "Transformer": template_vars.transformer.get("name") # type: ignore[attr-defined] + if template_vars.use_transformer # type: ignore[attr-defined] + else None, } msg.info("Generated config template specific for your use case") for label, value in use_case.items(): diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 2cdf4aa00..2a920cdda 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -108,10 +108,14 @@ def init_labels_cli( config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): nlp = init_nlp(config, use_gpu=use_gpu) + _init_labels(nlp, output_path) + + +def _init_labels(nlp, output_path): for name, component in nlp.pipeline: if getattr(component, "label_data", None) is not None: output_file = output_path / f"{name}.json" srsly.write_json(output_file, component.label_data) - msg.good(f"Saving {name} labels to {output_file}") + msg.good(f"Saving label data for component '{name}' to {output_file}") else: - msg.info(f"No labels found for {name}") + msg.info(f"No label data found for component '{name}'") diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 04b2f1c9e..e76343dc3 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,11 +1,13 @@ -from typing import Optional, Union, Any, Dict, List +from typing import Optional, Union, Any, Dict, List, Tuple, cast import shutil from pathlib import Path -from wasabi import Printer, get_raw_input +from wasabi import Printer, MarkdownRenderer, get_raw_input +from thinc.api import Config +from collections import defaultdict import srsly import sys -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX from ..schemas import validate, ModelMetaSchema from .. import util from .. 
import about @@ -16,12 +18,12 @@ def package_cli( # fmt: off input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - code_paths: Optional[str] = Opt(None, "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), + code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), - create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), - no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"), + build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"), # fmt: on ): @@ -38,13 +40,10 @@ def package_cli( registered functions like pipeline components), they are copied into the package and imported in the __init__.py. - DOCS: https://nightly.spacy.io/api/cli#package + DOCS: https://spacy.io/api/cli#package """ - code_paths = ( - [Path(p.strip()) for p in code_paths.split(",")] - if code_paths is not None - else [] - ) + create_sdist, create_wheel = get_build_formats(string_to_list(build)) + code_paths = [Path(p.strip()) for p in string_to_list(code_paths)] package( input_dir, output_dir, @@ -53,7 +52,8 @@ def package_cli( name=name, version=version, create_meta=create_meta, - create_sdist=not no_sdist, + create_sdist=create_sdist, + create_wheel=create_wheel, force=force, silent=False, ) @@ -68,6 +68,7 @@ def package( version: Optional[str] = None, create_meta: bool = False, create_sdist: bool = True, + create_wheel: bool = False, force: bool = False, silent: bool = True, ) -> None: @@ -75,10 +76,16 @@ def package( input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) + if create_wheel and not has_wheel(): + err = "Generating a binary .whl file requires wheel to be installed" + msg.fail(err, "pip install wheel", exits=1) if not input_path or not input_path.exists(): msg.fail("Can't locate pipeline data", input_path, exits=1) if not output_path or not output_path.exists(): msg.fail("Output directory not found", output_path, exits=1) + if create_sdist or create_wheel: + opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""] + msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}") for code_path in code_paths: if not code_path.exists(): msg.fail("Can't find code file", code_path, exits=1) @@ -94,6 +101,12 @@ def package( msg.fail("Can't load pipeline meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) meta = get_meta(input_dir, meta) + if meta["requirements"]: + msg.good( + f"Including {len(meta['requirements'])} package requirement(s) from " + f"meta and config", + ", 
".join(meta["requirements"]), + ) if name is not None: meta["name"] = name if version is not None: @@ -107,7 +120,9 @@ def package( msg.fail("Invalid pipeline meta.json") print("\n".join(errors)) sys.exit(1) - model_name = meta["lang"] + "_" + meta["name"] + model_name = meta["name"] + if not model_name.startswith(meta["lang"] + "_"): + model_name = f"{meta['lang']}_{model_name}" model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v package_path = main_path / model_name @@ -123,9 +138,18 @@ def package( ) Path.mkdir(package_path, parents=True) shutil.copytree(str(input_dir), str(package_path / model_name_v)) - license_path = package_path / model_name_v / "LICENSE" - if license_path.exists(): - shutil.move(str(license_path), str(main_path)) + for file_name in FILENAMES_DOCS: + file_path = package_path / model_name_v / file_name + if file_path.exists(): + shutil.copy(str(file_path), str(main_path)) + readme_path = main_path / "README.md" + if not readme_path.exists(): + readme = generate_readme(meta) + create_file(readme_path, readme) + create_file(package_path / model_name_v / "README.md", readme) + msg.good("Generated README.md from meta.json") + else: + msg.info("Using existing README.md from pipeline directory") imports = [] for code_path in code_paths: imports.append(code_path.stem) @@ -141,8 +165,83 @@ def package( if create_sdist: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "sdist"], capture=False) - zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" + zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}" msg.good(f"Successfully created zipped Python package", zip_file) + if create_wheel: + with util.working_dir(main_path): + util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) + wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}" + msg.good(f"Successfully created binary wheel", wheel) + + +def has_wheel() -> bool: + try: + import wheel # noqa: F401 + + return True + except ImportError: + return False + + +def get_third_party_dependencies( + config: Config, exclude: List[str] = util.SimpleFrozenList() +) -> List[str]: + """If the config includes references to registered functions that are + provided by third-party packages (spacy-transformers, other libraries), we + want to include them in meta["requirements"] so that the package specifies + them as dependencies and the user won't have to do it manually. + + We do this by: + - traversing the config to check for registered function (@ keys) + - looking up the functions and getting their module + - looking up the module version and generating an appropriate version range + + config (Config): The pipeline config. + exclude (list): List of packages to exclude (e.g. that already exist in meta). + RETURNS (list): The versioned requirements. 
+ """ + own_packages = ("spacy", "spacy-legacy", "spacy-nightly", "thinc", "srsly") + distributions = util.packages_distributions() + funcs = defaultdict(set) + # We only want to look at runtime-relevant sections, not [training] or [initialize] + for section in ("nlp", "components"): + for path, value in util.walk_dict(config[section]): + if path[-1].startswith("@"): # collect all function references by registry + funcs[path[-1][1:]].add(value) + for component in config.get("components", {}).values(): + if "factory" in component: + funcs["factories"].add(component["factory"]) + modules = set() + for reg_name, func_names in funcs.items(): + for func_name in func_names: + func_info = util.registry.find(reg_name, func_name) + module_name = func_info.get("module") # type: ignore[attr-defined] + if module_name: # the code is part of a module, not a --code file + modules.add(func_info["module"].split(".")[0]) # type: ignore[index] + dependencies = [] + for module_name in modules: + if module_name in distributions: + dist = distributions.get(module_name) + if dist: + pkg = dist[0] + if pkg in own_packages or pkg in exclude: + continue + version = util.get_package_version(pkg) + version_range = util.get_minor_version_range(version) # type: ignore[arg-type] + dependencies.append(f"{pkg}{version_range}") + return dependencies + + +def get_build_formats(formats: List[str]) -> Tuple[bool, bool]: + supported = ["sdist", "wheel", "none"] + for form in formats: + if form not in supported: + msg = Printer() + err = f"Unknown build format: {form}. Supported: {', '.join(supported)}" + msg.fail(err, exits=1) + if not formats or "none" in formats: + return (False, False) + return ("sdist" in formats, "wheel" in formats) def create_file(file_path: Path, contents: str) -> None: @@ -153,7 +252,7 @@ def create_file(file_path: Path, contents: str) -> None: def get_meta( model_path: Union[str, Path], existing_meta: Dict[str, Any] ) -> Dict[str, Any]: - meta = { + meta: Dict[str, Any] = { "lang": "en", "name": "pipeline", "version": "0.0.0", @@ -163,9 +262,10 @@ def get_meta( "url": "", "license": "MIT", } - meta.update(existing_meta) nlp = util.load_model_from_path(Path(model_path)) - meta["spacy_version"] = util.get_model_version_range(about.__version__) + meta.update(nlp.meta) + meta.update(existing_meta) + meta["spacy_version"] = util.get_minor_version_range(about.__version__) meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), @@ -174,6 +274,11 @@ def get_meta( } if about.__title__ != "spacy": meta["parent_package"] = about.__title__ + meta.setdefault("requirements", []) + # Update the requirements with all third-party packages in the config + existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]] + reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs) + meta["requirements"].extend(reqs) return meta @@ -200,6 +305,113 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any] return meta +def generate_readme(meta: Dict[str, Any]) -> str: + """ + Generate a Markdown-formatted README text from a model meta.json. Used + within the GitHub release notes and as content for README.md file added + to model packages. 
+ """ + md = MarkdownRenderer() + lang = meta["lang"] + name = f"{lang}_{meta['name']}" + version = meta["version"] + pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])]) + components = ", ".join([md.code(p) for p in meta.get("components", [])]) + vecs = meta.get("vectors", {}) + vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)" + author = meta.get("author") or "n/a" + notes = meta.get("notes", "") + license_name = meta.get("license") + sources = _format_sources(meta.get("sources")) + description = meta.get("description") + label_scheme = _format_label_scheme(cast(Dict[str, Any], meta.get("labels"))) + accuracy = _format_accuracy(cast(Dict[str, Any], meta.get("performance"))) + table_data = [ + (md.bold("Name"), md.code(name)), + (md.bold("Version"), md.code(version)), + (md.bold("spaCy"), md.code(meta["spacy_version"])), + (md.bold("Default Pipeline"), pipeline), + (md.bold("Components"), components), + (md.bold("Vectors"), vectors), + (md.bold("Sources"), sources or "n/a"), + (md.bold("License"), md.code(license_name) if license_name else "n/a"), + (md.bold("Author"), md.link(author, meta["url"]) if "url" in meta else author), + ] + # Put together Markdown body + if description: + md.add(description) + md.add(md.table(table_data, ["Feature", "Description"])) + if label_scheme: + md.add(md.title(3, "Label Scheme")) + md.add(label_scheme) + if accuracy: + md.add(md.title(3, "Accuracy")) + md.add(accuracy) + if notes: + md.add(notes) + return md.text + + +def _format_sources(data: Any) -> str: + if not data or not isinstance(data, list): + return "n/a" + sources = [] + for source in data: + if not isinstance(source, dict): + source = {"name": source} + name = source.get("name") + if not name: + continue + url = source.get("url") + author = source.get("author") + result = name if not url else "[{}]({})".format(name, url) + if author: + result += " ({})".format(author) + sources.append(result) + return "
".join(sources) + + +def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str: + if not data: + return "" + md = MarkdownRenderer() + scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))] + scores = [ + (md.code(acc.upper()), f"{score*100:.2f}") + for acc, score in scalars + if acc not in exclude + ] + md.add(md.table(scores, ["Type", "Score"])) + return md.text + + +def _format_label_scheme(data: Dict[str, Any]) -> str: + if not data: + return "" + md = MarkdownRenderer() + n_labels = 0 + n_pipes = 0 + label_data = [] + for pipe, labels in data.items(): + if not labels: + continue + col1 = md.bold(md.code(pipe)) + col2 = ", ".join( + [md.code(label.replace("|", "\\|")) for label in labels] + ) # noqa: W605 + label_data.append((col1, col2)) + n_labels += len(labels) + n_pipes += 1 + if not label_data: + return "" + label_info = f"View label scheme ({n_labels} labels for {n_pipes} components)" + md.add("
") + md.add(f"{label_info}") + md.add(md.table(label_data, ["Component", "Labels"])) + md.add("
") + return md.text + + TEMPLATE_SETUP = """ #!/usr/bin/env python import io @@ -214,6 +426,13 @@ def load_meta(fp): return json.load(f) +def load_readme(fp): + if path.exists(fp): + with io.open(fp, encoding='utf8') as f: + return f.read() + return "" + + def list_files(data_dir): output = [] for root, _, filenames in walk(data_dir): @@ -239,6 +458,8 @@ def setup_package(): root = path.abspath(path.dirname(__file__)) meta_path = path.join(root, 'meta.json') meta = load_meta(meta_path) + readme_path = path.join(root, 'README.md') + readme = load_readme(readme_path) model_name = str(meta['lang'] + '_' + meta['name']) model_dir = path.join(model_name, model_name + '-' + meta['version']) @@ -248,6 +469,7 @@ def setup_package(): setup( name=model_name, description=meta.get('description'), + long_description=readme, author=meta.get('author'), author_email=meta.get('email'), url=meta.get('url'), @@ -263,12 +485,14 @@ def setup_package(): if __name__ == '__main__': setup_package() -""".strip() +""".lstrip() TEMPLATE_MANIFEST = """ include meta.json include LICENSE +include LICENSES_SOURCES +include README.md """.strip() @@ -283,4 +507,7 @@ __version__ = get_model_meta(Path(__file__).parent)['version'] def load(**overrides): return load_model_from_init_py(__file__, **overrides) -""".strip() +""".lstrip() + + +FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"] diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 15a1e92a5..fe3ce0dad 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -44,7 +44,7 @@ def pretrain_cli( all settings are the same between pretraining and training. Ideally, this is done by using the same config file for both commands. - DOCS: https://nightly.spacy.io/api/cli#pretrain + DOCS: https://spacy.io/api/cli#pretrain """ config_overrides = parse_config_overrides(ctx.args) import_code(code_path) @@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume): "then the new directory will be created for you.", ) if resume_path is not None: + if resume_path.is_dir(): + # This is necessary because Windows gives a Permission Denied when we + # try to open the directory later, which is confusing. See #7878 + msg.fail( + "--resume-path should be a weights file, but {resume_path} is a directory.", + exits=True, + ) model_name = re.search(r"model\d+\.bin", str(resume_path)) if not model_name and not epoch_resume: msg.fail( diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 43226730d..3c282c73d 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -30,9 +30,9 @@ def profile_cli( It can either be provided as a JSONL file, or be read from sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. - DOCS: https://nightly.spacy.io/api/cli#debug-profile + DOCS: https://spacy.io/api/cli#debug-profile """ - if ctx.parent.command.name == NAME: # called as top-level command + if ctx.parent.command.name == NAME: # type: ignore[union-attr] # called as top-level command msg.warn( "The profile command is now available via the 'debug profile' " "subcommand. 
You can run python -m spacy debug --help for an " @@ -42,9 +42,9 @@ def profile_cli( def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None: - if inputs is not None: - inputs = _read_inputs(inputs, msg) + texts = _read_inputs(inputs, msg) + texts = list(itertools.islice(texts, n_texts)) if inputs is None: try: import ml_datasets @@ -56,16 +56,13 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> exits=1, ) - n_inputs = 25000 - with msg.loading("Loading IMDB dataset via Thinc..."): - imdb_train, _ = ml_datasets.imdb() - inputs, _ = zip(*imdb_train) - msg.info(f"Loaded IMDB dataset and using {n_inputs} examples") - inputs = inputs[:n_inputs] + with msg.loading("Loading IMDB dataset via ml_datasets..."): + imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0) + texts, _ = zip(*imdb_train) + msg.info(f"Loaded IMDB dataset and using {n_texts} examples") with msg.loading(f"Loading pipeline '{model}'..."): nlp = load_model(model) msg.good(f"Loaded pipeline '{model}'") - texts = list(itertools.islice(inputs, n_texts)) cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") msg.divider("Profile stats") @@ -87,7 +84,7 @@ def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]: if not input_path.exists() or not input_path.is_file(): msg.fail("Not a valid input data file", loc, exits=1) msg.info(f"Using data from {input_path.parts[-1]}") - file_ = input_path.open() + file_ = input_path.open() # type: ignore[assignment] for line in file_: data = srsly.json_loads(line) text = data["text"] diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 58f59a3f9..b5057e401 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -1,18 +1,24 @@ -from typing import Optional +from typing import Any, Dict, Optional from pathlib import Path from wasabi import msg import re import shutil import requests +import typer from ...util import ensure_path, working_dir from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config from .._util import get_checksum, download_file, git_checkout, get_git_version +from .._util import SimpleFrozenDict, parse_config_overrides -@project_cli.command("assets") +@project_cli.command( + "assets", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def project_assets_cli( # fmt: off + ctx: typer.Context, # This is only used to read additional arguments project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.") # fmt: on @@ -22,18 +28,24 @@ def project_assets_cli( provided in the project.yml, the file is only downloaded if no local file with the same checksum exists. 
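For the profile.py hunk above: inputs are now read lazily and capped with `itertools.islice`, and the IMDB fallback is only used when no file is given. A rough sketch of the JSONL path, assuming an `inputs.jsonl` file with a `text` field (path and field name are assumptions for illustration):

```python
import itertools
import srsly

def iter_texts(path: str):
    # one JSON object per line, each with a "text" key, as _read_inputs expects
    for line in open(path, encoding="utf8"):
        yield srsly.json_loads(line)["text"]

texts = list(itertools.islice(iter_texts("inputs.jsonl"), 10_000))
```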
- DOCS: https://nightly.spacy.io/api/cli#project-assets + DOCS: https://spacy.io/api/cli#project-assets """ - project_assets(project_dir, sparse_checkout=sparse_checkout) + overrides = parse_config_overrides(ctx.args) + project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout) -def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None: +def project_assets( + project_dir: Path, + *, + overrides: Dict[str, Any] = SimpleFrozenDict(), + sparse_checkout: bool = False, +) -> None: """Fetch assets for a project using DVC if possible. project_dir (Path): Path to project directory. """ project_path = ensure_path(project_dir) - config = load_project_config(project_path) + config = load_project_config(project_path, overrides=overrides) assets = config.get("assets", {}) if not assets: msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) @@ -59,6 +71,15 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None: shutil.rmtree(dest) else: dest.unlink() + if "repo" not in asset["git"] or asset["git"]["repo"] is None: + msg.fail( + "A git asset must include 'repo', the repository address.", exits=1 + ) + if "path" not in asset["git"] or asset["git"]["path"] is None: + msg.fail( + "A git asset must include 'path' - use \"\" to get the entire repository.", + exits=1, + ) git_checkout( asset["git"]["repo"], asset["git"]["path"], @@ -112,7 +133,6 @@ def fetch_asset( # If there's already a file, check for checksum if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") - return dest_path # We might as well support the user here and create parent directories in # case the asset dir isn't listed as a dir to create in the project.yml if not dest_path.parent.exists(): @@ -129,7 +149,6 @@ def fetch_asset( msg.good(f"Copied local asset {dest}") else: msg.fail(f"Download failed: {dest}", e) - return if checksum and checksum != get_checksum(dest_path): msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index 851fc444a..360ee3428 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -9,14 +9,18 @@ from ...util import ensure_path from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE from .._util import git_checkout, get_git_version +DEFAULT_REPO = about.__projects__ +DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ +DEFAULT_BRANCH = "master" + @project_cli.command("clone") def project_clone_cli( # fmt: off name: str = Arg(..., help="The name of the template to clone"), dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), - repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"), - branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from"), + repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), + branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"), sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") # fmt: on ): @@ -25,10 +29,13 @@ def project_clone_cli( defaults to the official spaCy template repo, but can be customized (including using a private repo). 
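`project assets` above now accepts the same `--vars`-style overrides as `project run`, forwarded to `load_project_config`. A hedged sketch of the programmatic equivalent, assuming a `project.yml` in the current directory that defines a `version` variable under `vars`:

```python
from pathlib import Path
from spacy.cli.project.assets import project_assets

# roughly the CLI equivalent of: python -m spacy project assets . --vars.version 1.0
project_assets(Path("."), overrides={"vars.version": "1.0"}, sparse_checkout=False)
```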
- DOCS: https://nightly.spacy.io/api/cli#project-clone + DOCS: https://spacy.io/api/cli#project-clone """ if dest is None: dest = Path.cwd() / Path(name).parts[-1] + if branch is None: + # If it's a user repo, we want to default to other branch + branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) @@ -73,9 +80,9 @@ def check_clone(name: str, dest: Path, repo: str) -> None: repo (str): URL of the repo to clone from. """ git_err = ( - f"Cloning spaCy project templates requires Git and the 'git' command. ", + f"Cloning spaCy project templates requires Git and the 'git' command. " f"To clone a project without Git, copy the files from the '{name}' " - f"directory in the {repo} to {dest} manually.", + f"directory in the {repo} to {dest} manually." ) get_git_version(error=git_err) if not dest: diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py index 811b7c746..1ba43a958 100644 --- a/spacy/cli/project/document.py +++ b/spacy/cli/project/document.py @@ -5,10 +5,7 @@ from ...util import working_dir from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config -DOCS_URL = "https://nightly.spacy.io" -INTRO = f"""> ⚠️ This project template uses the new [**spaCy v3.0**]({DOCS_URL}), which -> is currently available as a nightly pre-release. You can install it from pip as `spacy-nightly`: -> `pip install spacy-nightly`. Make sure to use a fresh virtual environment.""" +DOCS_URL = "https://spacy.io" INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the project, as well as the available commands and workflows. For details, see the [spaCy projects documentation]({DOCS_URL}/usage/projects).""" @@ -44,7 +41,7 @@ def project_document_cli( auto-generated section and only the auto-generated docs will be replaced when you re-run the command. - DOCS: https://nightly.spacy.io/api/cli#project-document + DOCS: https://spacy.io/api/cli#project-document """ project_document(project_dir, output_file, no_emoji=no_emoji) @@ -59,7 +56,6 @@ def project_document( title = config.get("title") description = config.get("description") md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐")) - md.add(INTRO) if description: md.add(description) md.add(md.title(2, PROJECT_FILE, "📋")) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py index 6eedc9c20..83dc5efbf 100644 --- a/spacy/cli/project/dvc.py +++ b/spacy/cli/project/dvc.py @@ -34,7 +34,7 @@ def project_update_dvc_cli( workflow is used. The DVC config will only be updated if the project.yml changed. - DOCS: https://nightly.spacy.io/api/cli#project-dvc + DOCS: https://spacy.io/api/cli#project-dvc """ project_update_dvc(project_dir, workflow, verbose=verbose, force=force) @@ -143,8 +143,8 @@ def run_dvc_commands( easier to pass flags like --quiet that depend on a variable or command-line setting while avoiding lots of nested conditionals. 
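The `check_clone` hunk above is a genuine bug fix: with the trailing commas, Python built a two-element tuple instead of one concatenated error message, so `get_git_version` received a tuple where a string was intended. For illustration (messages shortened):

```python
broken = (
    "Cloning spaCy project templates requires Git and the 'git' command. ",
    "To clone a project without Git, copy the files manually.",
)
fixed = (
    "Cloning spaCy project templates requires Git and the 'git' command. "
    "To clone a project without Git, copy the files manually."
)
assert isinstance(broken, tuple) and isinstance(fixed, str)
```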
""" - for command in commands: - command = split_command(command) + for c in commands: + command = split_command(c) dvc_command = ["dvc", *command] # Add the flags if they are set to True for flag, is_active in flags.items(): diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 26676d5b3..6e3cde88c 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -2,7 +2,7 @@ from pathlib import Path from wasabi import msg from .remote_storage import RemoteStorage from .remote_storage import get_command_hash -from .._util import project_cli, Arg +from .._util import project_cli, Arg, logger from .._util import load_project_config from .run import update_lockfile @@ -19,7 +19,7 @@ def project_pull_cli( A storage can be anything that the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. - DOCS: https://nightly.spacy.io/api/cli#project-pull + DOCS: https://spacy.io/api/cli#project-pull """ for url, output_path in project_pull(project_dir, remote): if url is not None: @@ -39,11 +39,15 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # in the list. while commands: for i, cmd in enumerate(list(commands)): + logger.debug(f"CMD: {cmd['name']}.") deps = [project_dir / dep for dep in cmd.get("deps", [])] if all(dep.exists() for dep in deps): cmd_hash = get_command_hash("", "", deps, cmd["script"]) for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) + logger.debug( + f"URL: {url} for {output_path} with command hash {cmd_hash}" + ) yield url, output_path out_locs = [project_dir / out for out in cmd.get("outputs", [])] @@ -53,6 +57,8 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # we iterate over the loop again. commands.pop(i) break + else: + logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") else: # If we didn't break the for loop, break the while loop. break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py index 26495412d..bc779e9cd 100644 --- a/spacy/cli/project/push.py +++ b/spacy/cli/project/push.py @@ -3,7 +3,7 @@ from wasabi import msg from .remote_storage import RemoteStorage from .remote_storage import get_content_hash, get_command_hash from .._util import load_project_config -from .._util import project_cli, Arg +from .._util import project_cli, Arg, logger @project_cli.command("push") @@ -18,7 +18,7 @@ def project_push_cli( the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. - DOCS: https://nightly.spacy.io/api/cli#project-push + DOCS: https://spacy.io/api/cli#project-push """ for output_path, url in project_push(project_dir, remote): if url is None: @@ -37,12 +37,15 @@ def project_push(project_dir: Path, remote: str): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) for cmd in config.get("commands", []): + logger.debug(f"CMD: cmd['name']") deps = [project_dir / dep for dep in cmd.get("deps", [])] if any(not dep.exists() for dep in deps): + logger.debug(f"Dependency missing. 
Skipping {cmd['name']} outputs") continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) + logger.debug(f"CMD_HASH: {cmd_hash}") for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -51,6 +54,9 @@ def project_push(project_dir: Path, remote: str): command_hash=cmd_hash, content_hash=get_content_hash(output_loc), ) + logger.debug( + f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + ) yield output_path, url diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index 6056458e2..336a4bcb3 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -41,7 +41,7 @@ class RemoteStorage: raise IOError(f"Cannot push {loc}: does not exist.") url = self.make_url(path, command_hash, content_hash) if url.exists(): - return None + return url tmp: Path with make_tempdir() as tmp: tar_loc = tmp / self.encode_name(str(path)) @@ -131,8 +131,10 @@ def get_command_hash( currently installed packages, whatever environment variables have been marked as relevant, and the command. """ - check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) - spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__) + if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION): + spacy_v = GIT_VERSION + else: + spacy_v = str(get_minor_version(about.__version__) or "") dep_checksums = [get_checksum(dep) for dep in sorted(deps)] hashes = [spacy_v, site_hash, env_hash] + dep_checksums hashes.extend(cmd) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 1a9b447ea..734803bc4 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -1,21 +1,26 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable from pathlib import Path from wasabi import msg +from wasabi.util import locale_escape import sys import srsly +import typer from ... import about from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS -from ...util import check_bool_env_var +from ...util import check_bool_env_var, SimpleFrozenDict from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash -from .._util import get_checksum, project_cli, Arg, Opt, COMMAND +from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides -@project_cli.command("run") +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} +) def project_run_cli( # fmt: off + ctx: typer.Context, # This is only used to read additional arguments subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), @@ -28,16 +33,23 @@ def project_run_cli( commands define dependencies and/or outputs, they will only be re-run if state has changed. 
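In the remote_storage.py hunk above, `get_command_hash` now coerces the minor version to a string explicitly (the helper is typed as returning an optional value). Conceptually the hash folds version info, dependency checksums and the command script into one digest, roughly like this simplified sketch (the real implementation also includes site and environment hashes):

```python
import hashlib
from typing import List

def command_hash(version: str, dep_checksums: List[str], script: List[str]) -> str:
    # anything that should invalidate cached outputs goes into the digest
    parts = [version, *sorted(dep_checksums), *script]
    return hashlib.md5("".join(parts).encode("utf8")).hexdigest()

print(command_hash("3.1", ["d41d8cd9"], ["python scripts/train.py"]))
```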
- DOCS: https://nightly.spacy.io/api/cli#project-run + DOCS: https://spacy.io/api/cli#project-run """ if show_help or not subcommand: print_run_help(project_dir, subcommand) else: - project_run(project_dir, subcommand, force=force, dry=dry) + overrides = parse_config_overrides(ctx.args) + project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry) def project_run( - project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False + project_dir: Path, + subcommand: str, + *, + overrides: Dict[str, Any] = SimpleFrozenDict(), + force: bool = False, + dry: bool = False, + capture: bool = False, ) -> None: """Run a named script defined in the project.yml. If the script is part of the default pipeline (defined in the "run" section), DVC is used to @@ -46,17 +58,30 @@ def project_run( project_dir (Path): Path to project directory. subcommand (str): Name of command to run. + overrides (Dict[str, Any]): Optional config overrides. force (bool): Force re-running, even if nothing changed. dry (bool): Perform a dry run and don't execute commands. + capture (bool): Whether to capture the output and errors of individual commands. + If False, the stdout and stderr will not be redirected, and if there's an error, + sys.exit will be called with the return code. You should use capture=False + when you want to turn over execution to the command, and capture=True + when you want to run the command more like a function. """ - config = load_project_config(project_dir) + config = load_project_config(project_dir, overrides=overrides) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} workflows = config.get("workflows", {}) - validate_subcommand(commands.keys(), workflows.keys(), subcommand) + validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) if subcommand in workflows: msg.info(f"Running workflow '{subcommand}'") for cmd in workflows[subcommand]: - project_run(project_dir, cmd, force=force, dry=dry) + project_run( + project_dir, + cmd, + overrides=overrides, + force=force, + dry=dry, + capture=capture, + ) else: cmd = commands[subcommand] for dep in cmd.get("deps", []): @@ -72,7 +97,7 @@ def project_run( if not rerun and not force: msg.info(f"Skipping '{cmd['name']}': nothing changed") else: - run_commands(cmd["script"], dry=dry) + run_commands(cmd["script"], dry=dry, capture=capture) if not dry: update_lockfile(current_dir, cmd) @@ -91,7 +116,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: workflows = config.get("workflows", {}) project_loc = "" if is_cwd(project_dir) else project_dir if subcommand: - validate_subcommand(commands.keys(), workflows.keys(), subcommand) + validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") if subcommand in commands: help_text = commands[subcommand].get("help") @@ -111,7 +136,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: print("") title = config.get("title") if title: - print(f"{title}\n") + print(f"{locale_escape(title)}\n") if config_commands: print(f"Available commands in {PROJECT_FILE}") print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") @@ -126,15 +151,21 @@ def run_commands( commands: Iterable[str] = SimpleFrozenList(), silent: bool = False, dry: bool = False, + capture: bool = False, ) -> None: """Run a sequence of commands in a subprocess, in order. commands (List[str]): The string commands. 
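`project_run` above gains `overrides` and `capture` keyword arguments and threads both through recursive workflow calls, which makes it usable as a plain function. A minimal sketch, assuming a project directory with a command or workflow named `preprocess` (names are placeholders):

```python
from pathlib import Path
from spacy.cli.project.run import project_run

project_run(
    Path("."),                     # directory containing project.yml
    "preprocess",                  # hypothetical command or workflow name
    overrides={"vars.gpu_id": 0},  # same dotted keys as --vars.gpu_id on the CLI
    capture=True,                  # capture output/errors instead of handing over the console
)
```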
silent (bool): Don't print the commands. dry (bool): Perform a dry run and don't execut anything. + capture (bool): Whether to capture the output and errors of individual commands. + If False, the stdout and stderr will not be redirected, and if there's an error, + sys.exit will be called with the return code. You should use capture=False + when you want to turn over execution to the command, and capture=True + when you want to run the command more like a function. """ - for command in commands: - command = split_command(command) + for c in commands: + command = split_command(c) # Not sure if this is needed or a good idea. Motivation: users may often # use commands in their config that reference "python" and we want to # make sure that it's always executing the same Python that spaCy is @@ -149,7 +180,7 @@ def run_commands( if not silent: print(f"Running command: {join_command(command)}") if not dry: - run_command(command, capture=False) + run_command(command, capture=capture) def validate_subcommand( @@ -190,6 +221,9 @@ def check_rerun( strict_version (bool): RETURNS (bool): Whether to re-run the command. """ + # Always rerun if no-skip is set + if command.get("no_skip", False): + return True lock_path = project_dir / PROJECT_LOCK if not lock_path.exists(): # We don't have a lockfile, run command return True @@ -260,7 +294,7 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any] } -def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]: +def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]: """Generate the file information for a list of paths (dependencies, outputs). Includes the file path and the file's checksum. diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index ff190804c..8eaef86b3 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -1,7 +1,7 @@ {# This is a template for training configs used for the quickstart widget in the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. 
#} -{%- set use_transformer = (transformer_data and hardware != "cpu") -%} +{%- set use_transformer = hardware != "cpu" -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} [paths] train = null @@ -16,9 +16,13 @@ gpu_allocator = null [nlp] lang = "{{ lang }}" +{%- set no_tok2vec = components|length == 1 and (("textcat" in components or "textcat_multilabel" in components) and optimize == "efficiency")-%} +{%- if not no_tok2vec and ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or "textcat" in components or "textcat_multilabel" in components) -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %} +{%- else -%} +{%- set full_pipeline = components %} +{%- endif %} pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }} -tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"} batch_size = {{ 128 if hardware == "gpu" else 1000 }} [components] @@ -29,7 +33,7 @@ batch_size = {{ 128 if hardware == "gpu" else 1000 }} factory = "transformer" [components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" +@architectures = "spacy-transformers.TransformerModel.v3" name = "{{ transformer["name"] }}" tokenizer_config = {"use_fast": true} @@ -146,14 +150,14 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" [components.textcat.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -177,14 +181,14 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" [components.textcat_multilabel.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -193,12 +197,7 @@ no_output_layer = false {# NON-TRANSFORMER PIPELINE #} {% else -%} - -{%- if hardware == "gpu" -%} -# There are no recommended transformer weights available for language '{{ lang }}' -# yet, so the pipeline described here is not transformer-based. 
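The template's pipeline construction is now driven by two flags: `use_transformer` (any non-CPU hardware) and `no_tok2vec` (a single efficiency-optimized textcat component needs no shared embedding layer). Expressed in Python rather than Jinja, the logic is roughly:

```python
from typing import List

LISTENERS = {"tagger", "morphologizer", "parser", "ner", "entity_linker",
             "textcat", "textcat_multilabel"}

def full_pipeline(components: List[str], optimize: str, hardware: str) -> List[str]:
    use_transformer = hardware != "cpu"
    no_tok2vec = (
        len(components) == 1
        and ("textcat" in components or "textcat_multilabel" in components)
        and optimize == "efficiency"
    )
    if not no_tok2vec and LISTENERS & set(components):
        return ["transformer" if use_transformer else "tok2vec"] + components
    return list(components)

print(full_pipeline(["textcat"], "efficiency", "cpu"))  # ['textcat']
print(full_pipeline(["ner"], "accuracy", "gpu"))        # ['transformer', 'ner']
```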
-{%- endif %} - +{% if not no_tok2vec-%} [components.tok2vec] factory = "tok2vec" @@ -206,7 +205,7 @@ factory = "tok2vec" @architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = ${components.tok2vec.model.encode.width} {% if has_letters -%} attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] @@ -223,6 +222,7 @@ width = {{ 96 if optimize == "efficiency" else 256 }} depth = {{ 4 if optimize == "efficiency" else 8 }} window_size = 1 maxout_pieces = 3 +{% endif -%} {% if "morphologizer" in components %} [components.morphologizer] @@ -314,14 +314,14 @@ nO = null width = ${components.tok2vec.model.encode.width} [components.textcat.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -342,14 +342,14 @@ nO = null width = ${components.tok2vec.model.encode.width} [components.textcat_multilabel.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -370,7 +370,7 @@ factory = "{{ pipe }}" [corpora.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 2000 }} +max_length = 0 [corpora.dev] @readers = "spacy.Corpus.v1" @@ -416,7 +416,7 @@ compound = 1.001 [initialize] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} -vectors = null +vectors = ${paths.vectors} {% else -%} vectors = "{{ word_vectors }}" {% endif -%} diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index 47b3abbf6..a7bf9b74a 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -1,6 +1,15 @@ # Recommended settings and available resources for each language, if available. # Not all languages have recommended word vectors or transformers and for some, # the recommended transformer for efficiency and accuracy may be the same. 
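Together with the `__default__` block added just below, the lookup in `init_config` (`RECOMMENDATIONS.get(lang, defaults)`) means languages without a curated entry now fall back to multilingual BERT instead of getting no transformer suggestion. Roughly:

```python
# values taken from the recommendations file; trimmed to the relevant keys
RECOMMENDATIONS = {
    "__default__": {"transformer": {"efficiency": {"name": "bert-base-multilingual-uncased"}}},
    "sv": {"transformer": {"efficiency": {"name": "KB/bert-base-swedish-cased"}}},
}

def recommended_transformer(lang: str) -> str:
    reco = RECOMMENDATIONS.get(lang, RECOMMENDATIONS["__default__"])
    return reco["transformer"]["efficiency"]["name"]

print(recommended_transformer("sv"))  # KB/bert-base-swedish-cased
print(recommended_transformer("xx"))  # bert-base-multilingual-uncased
```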
+__default__: + word_vectors: null + transformer: + efficiency: + name: bert-base-multilingual-uncased + size_factor: 3 + accuracy: + name: bert-base-multilingual-uncased + size_factor: 3 ar: word_vectors: null transformer: @@ -10,9 +19,33 @@ ar: accuracy: name: asafaya/bert-base-arabic size_factor: 3 +bg: + word_vectors: null + transformer: + efficiency: + name: iarfmoose/roberta-base-bulgarian + size_factor: 3 + accuracy: + name: iarfmoose/roberta-base-bulgarian + size_factor: 3 +bn: + word_vectors: null + transformer: + efficiency: + name: sagorsarker/bangla-bert-base + size_factor: 3 + accuracy: + name: sagorsarker/bangla-bert-base + size_factor: 3 da: word_vectors: da_core_news_lg - transformer: null + transformer: + efficiency: + name: Maltehb/danish-bert-botxo + size_factor: 3 + accuracy: + name: Maltehb/danish-bert-botxo + size_factor: 3 de: word_vectors: de_core_news_lg transformer: @@ -49,6 +82,15 @@ es: accuracy: name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 +eu: + word_vectors: null + transformer: + efficiency: + name: mrm8488/RoBasquERTa + size_factor: 3 + accuracy: + name: mrm8488/RoBasquERTa + size_factor: 3 fi: word_vectors: null transformer: @@ -67,26 +109,59 @@ fr: accuracy: name: camembert-base size_factor: 3 +hi: + word_vectors: null + transformer: + efficiency: + name: ai4bharat/indic-bert + size_factor: 3 + accuracy: + name: ai4bharat/indic-bert + size_factor: 3 +id: + word_vectors: null + transformer: + efficiency: + name: indolem/indobert-base-uncased + size_factor: 3 + accuracy: + name: indolem/indobert-base-uncased + size_factor: 3 it: word_vectors: it_core_news_lg - transformers: null + transformer: null ja: word_vectors: ja_core_news_lg - transformers: null + transformer: null lt: word_vectors: lt_core_news_lg - transformers: null + transformer: null +mk: + word_vectors: null + transformer: + efficiency: + name: anon-submission-mk/bert-base-macedonian-cased + size_factor: 3 + accuracy: + name: anon-submission-mk/bert-base-macedonian-cased + size_factor: 3 nb: word_vectors: nb_core_news_lg - transformers: null + transformer: + efficiency: + name: NbAiLab/nb-bert-base + size_factor: 3 + accuracy: + name: NbAiLab/nb-bert-base + size_factor: 3 nl: word_vectors: nl_core_news_lg transformer: efficiency: - name: pdelobelle/robbert-v2-dutch-base + name: GroNLP/bert-base-dutch-cased size_factor: 3 accuracy: - name: pdelobelle/robbert-v2-dutch-base + name: GroNLP/bert-base-dutch-cased size_factor: 3 pl: word_vectors: pl_core_news_lg @@ -108,7 +183,22 @@ pt: size_factor: 3 ro: word_vectors: ro_core_news_lg - transformers: null + transformer: + efficiency: + name: dumitrescustefan/bert-base-romanian-cased-v1 + size_factor: 3 + accuracy: + name: dumitrescustefan/bert-base-romanian-cased-v1 + size_factor: 3 +si: + word_vectors: null + transformer: + efficiency: + name: setu4993/LaBSE + size_factor: 3 + accuracy: + name: setu4993/LaBSE + size_factor: 3 sv: word_vectors: null transformer: @@ -118,6 +208,33 @@ sv: accuracy: name: KB/bert-base-swedish-cased size_factor: 3 +ta: + word_vectors: null + transformer: + efficiency: + name: ai4bharat/indic-bert + size_factor: 3 + accuracy: + name: ai4bharat/indic-bert + size_factor: 3 +te: + word_vectors: null + transformer: + efficiency: + name: kuppuluri/telugu_bertu + size_factor: 3 + accuracy: + name: kuppuluri/telugu_bertu + size_factor: 3 +th: + word_vectors: null + transformer: + efficiency: + name: monsoon-nlp/bert-base-thai + size_factor: 3 + accuracy: + name: monsoon-nlp/bert-base-thai + size_factor: 3 tr: 
word_vectors: null transformer: @@ -127,6 +244,24 @@ tr: accuracy: name: dbmdz/bert-base-turkish-cased size_factor: 3 +uk: + word_vectors: null + transformer: + efficiency: + name: youscan/ukr-roberta-base + size_factor: 3 + accuracy: + name: youscan/ukr-roberta-base + size_factor: 3 +ur: + word_vectors: null + transformer: + efficiency: + name: urduhack/roberta-urdu-small + size_factor: 3 + accuracy: + name: urduhack/roberta-urdu-small + size_factor: 3 zh: word_vectors: zh_core_web_lg transformer: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 0120ec08a..cc22cbba6 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict, Any, Union from pathlib import Path from wasabi import msg import typer @@ -7,7 +7,7 @@ import sys from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, setup_gpu -from ..training.loop import train +from ..training.loop import train as train_nlp from ..training.initialize import init_nlp from .. import util @@ -28,7 +28,7 @@ def train_cli( """ Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from other formats, use the `spacy convert` command. The - config file includes all settings and hyperparameters used during traing. + config file includes all settings and hyperparameters used during training. To override settings in the config, e.g. settings that point to local paths or that you want to experiment with, you can override them as command line options. For instance, --training.batch_size 128 overrides @@ -37,17 +37,33 @@ def train_cli( used to register custom functions and architectures that can then be referenced in the config. - DOCS: https://nightly.spacy.io/api/cli#train + DOCS: https://spacy.io/api/cli#train """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) + + +def train( + config_path: Union[str, Path], + output_path: Optional[Union[str, Path]] = None, + *, + use_gpu: int = -1, + overrides: Dict[str, Any] = util.SimpleFrozenDict(), +): + config_path = util.ensure_path(config_path) + output_path = util.ensure_path(output_path) # Make sure all files and paths exists if they are needed if not config_path or (str(config_path) != "-" and not config_path.exists()): msg.fail("Config file not found", config_path, exits=1) - if output_path is not None and not output_path.exists(): - output_path.mkdir(parents=True) - msg.good(f"Created output directory: {output_path}") - overrides = parse_config_overrides(ctx.args) - import_code(code_path) + if not output_path: + msg.info("No output directory provided") + else: + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory: {output_path}") + msg.info(f"Saving to output directory: {output_path}") setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides, interpolate=False) @@ -56,4 +72,4 @@ def train_cli( nlp = init_nlp(config, use_gpu=use_gpu) msg.good("Initialized pipeline") msg.divider("Training pipeline") - train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) + train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 9a75ed6f3..a918e9a39 100644 --- 
a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -3,10 +3,11 @@ from pathlib import Path import sys import requests from wasabi import msg, Printer +import warnings from ._util import app from .. import about -from ..util import get_package_version, get_installed_models, get_base_version +from ..util import get_package_version, get_installed_models, get_minor_version from ..util import get_package_path, get_model_meta, is_compatible_version @@ -17,14 +18,14 @@ def validate_cli(): if the installed packages are compatible and shows upgrade instructions if available. Should be run after `pip install -U spacy`. - DOCS: https://nightly.spacy.io/api/cli#validate + DOCS: https://spacy.io/api/cli#validate """ validate() def validate() -> None: model_pkgs, compat = get_model_pkgs() - spacy_version = get_base_version(about.__version__) + spacy_version = get_minor_version(about.__version__) current_compat = compat.get(spacy_version, {}) if not current_compat: msg.warn(f"No compatible packages found for v{spacy_version} of spaCy") @@ -44,8 +45,8 @@ def validate() -> None: comp = msg.text("", color="green", icon="good", no_print=True) version = msg.text(data["version"], color="green", no_print=True) else: - version = msg.text(data["version"], color="red", no_print=True) - comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" + version = msg.text(data["version"], color="yellow", no_print=True) + comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}" rows.append((data["name"], data["spacy"], version, comp)) msg.table(rows, header=header) else: @@ -78,7 +79,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]: msg.good("Loaded compatibility table") compat = r.json()["spacy"] all_models = set() - installed_models = get_installed_models() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W09[45]") + installed_models = get_installed_models() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): @@ -92,9 +95,11 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]: spacy_version = about.__version__ else: model_path = get_package_path(package) - model_meta = get_model_meta(model_path) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W09[45]") + model_meta = get_model_meta(model_path) spacy_version = model_meta.get("spacy_version", "n/a") - is_compat = is_compatible_version(about.__version__, spacy_version) + is_compat = is_compatible_version(about.__version__, spacy_version) # type: ignore[assignment] pkgs[pkg_name] = { "name": package, "version": version, diff --git a/spacy/compat.py b/spacy/compat.py index 6eca18b80..89132735d 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -5,12 +5,12 @@ from thinc.util import copy_array try: import cPickle as pickle except ImportError: - import pickle + import pickle # type: ignore[no-redef] try: import copy_reg except ImportError: - import copyreg as copy_reg + import copyreg as copy_reg # type: ignore[no-redef] try: from cupy.cuda.stream import Stream as CudaStream @@ -22,10 +22,18 @@ try: except ImportError: cupy = None +if sys.version_info[:2] >= (3, 8): # Python 3.8+ + from typing import Literal, Protocol, runtime_checkable +else: + from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401 + +# Important note: The importlib_metadata "backport" includes functionality +# that's not part of the built-in importlib.metadata. 
We should treat this +# import like the built-in and only use what's available there. try: # Python 3.8+ - from typing import Literal + import importlib.metadata as importlib_metadata except ImportError: - from typing_extensions import Literal # noqa: F401 + from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401 from thinc.api import Optimizer # noqa: F401 diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 0f7226083..ceb7357fc 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -68,8 +68,11 @@ seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 -# Controls early-stopping. 0 or -1 mean unlimited. +# Controls early-stopping. 0 disables early stopping. patience = 1600 +# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in +# memory and shuffled within the training loop. -1 means stream train corpus +# rather than loading in memory with no shuffling within the training loop. max_epochs = 0 max_steps = 20000 eval_frequency = 200 @@ -77,6 +80,8 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Names of pipeline components that should set annotations during training +annotating_components = [] # Location in the config where the dev corpus is defined dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 48229572b..d9418f675 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -1,8 +1,8 @@ """ spaCy's built in visualization suite for dependencies and named entities. -DOCS: https://nightly.spacy.io/api/top-level#displacy -USAGE: https://nightly.spacy.io/usage/visualizers +DOCS: https://spacy.io/api/top-level#displacy +USAGE: https://spacy.io/usage/visualizers """ from typing import Union, Iterable, Optional, Dict, Any, Callable import warnings @@ -18,7 +18,7 @@ RENDER_WRAPPER = None def render( - docs: Union[Iterable[Union[Doc, Span]], Doc, Span], + docs: Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict], style: str = "dep", page: bool = False, minify: bool = False, @@ -28,7 +28,8 @@ def render( ) -> str: """Render displaCy visualisation. - docs (Union[Iterable[Doc], Doc]): Document(s) to visualise. + docs (Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]]): Document(s) to visualise. + a 'dict' is only allowed here when 'manual' is set to True style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. @@ -37,8 +38,8 @@ def render( manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. RETURNS (str): Rendered HTML markup. 
- DOCS: https://nightly.spacy.io/api/top-level#displacy.render - USAGE: https://nightly.spacy.io/usage/visualizers + DOCS: https://spacy.io/api/top-level#displacy.render + USAGE: https://spacy.io/usage/visualizers """ factories = { "dep": (DependencyRenderer, parse_deps), @@ -53,8 +54,8 @@ def render( raise ValueError(Errors.E096) renderer_func, converter = factories[style] renderer = renderer_func(options=options) - parsed = [converter(doc, options) for doc in docs] if not manual else docs - _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() + parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore + _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore html = _html["parsed"] if RENDER_WRAPPER is not None: html = RENDER_WRAPPER(html) @@ -88,8 +89,8 @@ def serve( port (int): Port to serve visualisation. host (str): Host to serve visualisation. - DOCS: https://nightly.spacy.io/api/top-level#displacy.serve - USAGE: https://nightly.spacy.io/usage/visualizers + DOCS: https://spacy.io/api/top-level#displacy.serve + USAGE: https://spacy.io/usage/visualizers """ from wsgiref import simple_server @@ -120,7 +121,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: doc (Doc): Document do parse. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ - doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) + doc = Doc(orig_doc.vocab).from_bytes( + orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) + ) if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): @@ -131,7 +134,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: "lemma": np.root.lemma_, "ent_type": np.root.ent_type_, } - retokenizer.merge(np, attrs=attrs) + retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type] if options.get("collapse_punct", True): spans = [] for word in doc[:-1]: @@ -146,7 +149,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: with doc.retokenize() as retokenizer: for span, tag, lemma, ent_type in spans: attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type} - retokenizer.merge(span, attrs=attrs) + retokenizer.merge(span, attrs=attrs) # type: ignore[arg-type] fine_grained = options.get("fine_grained") add_lemma = options.get("add_lemma") words = [ diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index ba56beca3..14d741a3d 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,7 +3,7 @@ import uuid from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE -from .templates import TPL_ENTS +from .templates import TPL_ENTS, TPL_KB_LINK from ..util import minify_html, escape_html, registry from ..errors import Errors @@ -305,7 +305,7 @@ class EntityRenderer: """Render entities in text. text (str): Original text. - spans (list): Individual entity spans and their start, end and label. + spans (list): Individual entity spans and their start, end, label, kb_id and kb_url. title (str / None): Document title set in Doc.user_data['title']. 
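With the widened `render` signature and the new `kb_id`/`kb_url` handling in the entity renderer, a manually specified entity can now link out to a knowledge base. A small sketch (the Wikidata URL is only an example):

```python
from spacy import displacy

doc = {
    "text": "But Google is starting from behind.",
    "ents": [
        {
            "start": 4,
            "end": 10,
            "label": "ORG",
            "kb_id": "Q95",
            "kb_url": "https://www.wikidata.org/wiki/Q95",
        }
    ],
}
html = displacy.render(doc, style="ent", manual=True, page=True)
```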
""" markup = "" @@ -314,6 +314,9 @@ class EntityRenderer: label = span["label"] start = span["start"] end = span["end"] + kb_id = span.get("kb_id", "") + kb_url = span.get("kb_url", "#") + kb_link = TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else "" additional_params = span.get("params", {}) entity = escape_html(text[start:end]) fragments = text[offset:start].split("\n") @@ -323,7 +326,12 @@ class EntityRenderer: markup += "
" if self.ents is None or label.upper() in self.ents: color = self.colors.get(label.upper(), self.default_color) - ent_settings = {"label": label, "text": entity, "bg": color} + ent_settings = { + "label": label, + "text": entity, + "bg": color, + "kb_link": kb_link, + } ent_settings.update(additional_params) markup += self.ent_template.format(**ent_settings) else: diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index b9cbf717b..e7d3d4266 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -51,17 +51,22 @@ TPL_ENTS = """ TPL_ENT = """ {text} - {label} + {label}{kb_link} """ TPL_ENT_RTL = """ {text} - {label} + {label}{kb_link} """ +# Important: this needs to start with a space! +TPL_KB_LINK = """ + {kb_id} +""" + TPL_PAGE = """ diff --git a/spacy/errors.py b/spacy/errors.py index fd26db8ba..ff1185361 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,3 +1,6 @@ +import warnings + + def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -12,6 +15,33 @@ def add_codes(err_cls): return ErrorsWithCodes() +def setup_default_warnings(): + # ignore certain numpy warnings + filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa + filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa + + # warn about entity_ruler & matcher having no patterns only once + for pipe in ["matcher", "entity_ruler"]: + filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) + + # warn once about lemmatizer without required POS + filter_warning("once", error_msg=Warnings.W108) + + +def filter_warning(action: str, error_msg: str): + """Customize how spaCy should handle a certain warning. + + error_msg (str): e.g. "W006", or a full error message + action (str): "default", "error", "ignore", "always", "module" or "once" + """ + warnings.filterwarnings(action, message=_escape_warning_msg(error_msg)) + + +def _escape_warning_msg(msg): + """To filter with warnings.filterwarnings, the [] brackets need to be escaped""" + return msg.replace("[", "\\[").replace("]", "\\]") + + # fmt: off @add_codes @@ -20,7 +50,7 @@ class Warnings: "generate a dependency visualization for it. Make sure the Doc " "was processed with a model that supports dependency parsing, and " "not just a language class like `English()`. For more info, see " - "the docs:\nhttps://nightly.spacy.io/usage/models") + "the docs:\nhttps://spacy.io/usage/models") W006 = ("No entities to visualize found in Doc object. If this is " "surprising to you, make sure the Doc was processed using a model " "that supports named entity recognition, and check the `doc.ents` " @@ -73,17 +103,38 @@ class Warnings: "degree. If this is intentional or the language you're using " "doesn't have a normalization table, please ignore this warning. " "If this is surprising, make sure you have the spacy-lookups-data " - "package installed. The languages with lexeme normalization tables " - "are currently: {langs}") - W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' + "package installed and load the table in your config. 
The " + "languages with lexeme normalization tables are currently: " + "{langs}\n\nLoad the table in your config with:\n\n" + "[initialize.lookups]\n" + "@misc = \"spacy.LookupsDataLoader.v1\"\n" + "lang = ${{nlp.lang}}\n" + "tables = [\"lexeme_norm\"]\n") + W035 = ("Discarding subpattern '{pattern}' due to an unrecognized " "attribute or operator.") + W036 = ("The component '{name}' does not have any patterns defined.") - # TODO: fix numbering after merging develop into master + # New warnings added in v3.x + W086 = ("Component '{listener}' will be (re)trained, but it needs the component " + "'{name}' which is frozen. If you want to prevent retraining '{name}' " + "but want to train '{listener}' on top of it, you should add '{name}' to the " + "list of 'annotating_components' in the 'training' block in the config. " + "See the documentation for details: " + "https://spacy.io/usage/training#annotating-components") + W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' " + "depends on it via a listener and is frozen. This means that the " + "performance of '{listener}' will be degraded. You can either freeze " + "both, or neither of the two. If you're sourcing the component from " + "an existing pipeline, you can use the `replace_listeners` setting in " + "the config block to replace its token-to-vector listener with a copy " + "and make it independent. For example, `replace_listeners = " + "[\"model.tok2vec\"]` See the documentation for details: " + "https://spacy.io/usage/training#config-components-listeners") W088 = ("The pipeline component {name} implements a `begin_training` " "method, which won't be called by spaCy. As of v3.0, `begin_training` " "has been renamed to `initialize`, so you likely want to rename the " "component method. See the documentation for details: " - "https://nightly.spacy.io/api/language#initialize") + "https://spacy.io/api/language#initialize") W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed " "to `nlp.initialize`.") W090 = ("Could not locate any {format} files in path '{path}'.") @@ -97,12 +148,12 @@ class Warnings: "released, because the model may say it's compatible when it's " 'not. Consider changing the "spacy_version" in your meta.json to a ' "version range, with a lower and upper pin. For example: {example}") - W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is " - "incompatible with the current version ({current}). This may lead " - "to unexpected results or runtime errors. To resolve this, " - "download a newer compatible model or retrain your custom model " - "with the current spaCy version. For more details and available " - "updates, run: python -m spacy validate") + W095 = ("Model '{model}' ({model_version}) was trained with spaCy " + "{version} and may not be 100% compatible with the current version " + "({current}). If you see errors or degraded performance, download " + "a newer compatible model or retrain your custom model with the " + "current spaCy version. For more details and available updates, " + "run: python -m spacy validate") W096 = ("The method `nlp.disable_pipes` is now deprecated - use " "`nlp.select_pipes` instead.") W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " @@ -119,12 +170,28 @@ class Warnings: "call the {matcher} on each Doc object.") W107 = ("The property `Doc.{prop}` is deprecated. 
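The setup_default_warnings() and filter_warning() helpers introduced above are importable, so user code can register its own filters for specific warning codes. A small sketch, assuming you want to escalate the W036 "no patterns defined" warning for the entity_ruler into an error:

```python
from spacy.errors import Warnings, filter_warning

# Mirror the pattern used in setup_default_warnings(): pass the fully
# formatted message so the bracketed warning code is escaped correctly.
filter_warning("error", error_msg=Warnings.W036.format(name="entity_ruler"))
```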
Use " "`Doc.has_annotation(\"{attr}\")` instead.") - W108 = ("The rule-based lemmatizer did not find POS annotation for the " - "token '{text}'. Check that your pipeline includes components that " + W108 = ("The rule-based lemmatizer did not find POS annotation for one or " + "more tokens. Check that your pipeline includes components that " "assign token.pos, typically 'tagger'+'attribute_ruler' or " "'morphologizer'.") W109 = ("Unable to save user hooks while serializing the doc. Re-add any " "required user hooks to the doc after processing.") + W110 = ("The DependencyMatcher token pattern {pattern} matched a span " + "{tokens} that is 2+ tokens long. Only the first token in the span " + "will be included in the results. For better results, token " + "patterns should return matches that are each exactly one token " + "long.") + W111 = ("Jupyter notebook detected: if using `prefer_gpu()` or " + "`require_gpu()`, include it in the same cell right before " + "`spacy.load()` to ensure that the model is loaded on the correct " + "device. More information: " + "http://spacy.io/usage/v3#jupyter-notebook-gpu") + W112 = ("The model specified to use for initial vectors ({name}) has no " + "vectors. This is almost certainly a mistake.") + W113 = ("Sourced component '{name}' may not work as expected: source " + "vectors are not identical to current pipeline vectors.") + W114 = ("Using multiprocessing with GPU models is not recommended and may " + "lead to errors.") @add_codes @@ -158,7 +225,7 @@ class Errors: E010 = ("Word vectors set to length 0. This may be because you don't have " "a model installed or loaded, or because your model doesn't " "include word vectors. For more info, see the docs:\n" - "https://nightly.spacy.io/usage/models") + "https://spacy.io/usage/models") E011 = ("Unknown operator: '{op}'. Options: {opts}") E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") E016 = ("MultitaskObjective target should be function or one of: dep, " @@ -189,7 +256,7 @@ class Errors: E028 = ("`words` expects a list of unicode strings, but got bytes instance: {value}") E029 = ("`noun_chunks` requires the dependency parse, which requires a " "statistical model to be installed and loaded. For more info, see " - "the documentation:\nhttps://nightly.spacy.io/usage/models") + "the documentation:\nhttps://spacy.io/usage/models") E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " "component to the pipeline with: `nlp.add_pipe('sentencizer')`. " "Alternatively, add the dependency parser or sentence recognizer, " @@ -289,17 +356,18 @@ class Errors: E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.") E099 = ("Invalid pattern: the first node of pattern should be an anchor " "node. The node should only contain RIGHT_ID and RIGHT_ATTRS.") - E100 = ("Nodes other than the anchor node should all contain LEFT_ID, " - "REL_OP and RIGHT_ID.") + E100 = ("Nodes other than the anchor node should all contain {required}, " + "but these are missing: {missing}") E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have " "have been declared in previous edges.") E102 = ("Can't merge non-disjoint spans. '{token}' is already part of " "tokens to merge. If you want to find the longest non-overlapping " "spans, you can use the util.filter_spans helper:\n" - "https://nightly.spacy.io/api/top-level#util.filter_spans") + "https://spacy.io/api/top-level#util.filter_spans") E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. 
A " "token can only be part of one entity, so make sure the entities " - "you're setting don't overlap.") + "you're setting don't overlap. To work with overlapping entities, " + "consider using doc.spans instead.") E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore " "settings: {opts}") E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}") @@ -338,21 +406,10 @@ class Errors: E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " "This is likely a bug in spaCy, so feel free to open an issue.") - E129 = ("Cannot write the label of an existing Span object because a Span " - "is a read-only view of the underlying Token objects stored in the " - "Doc. Instead, create a new Span object and specify the `label` " - "keyword argument, for example:\nfrom spacy.tokens import Span\n" - "span = Span(doc, start={start}, end={end}, label='{label}')") E130 = ("You are running a narrow unicode build, which is incompatible " "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide " "unicode build instead. You can also rebuild Python and set the " "`--enable-unicode=ucs4 flag`.") - E131 = ("Cannot write the kb_id of an existing Span object because a Span " - "is a read-only view of the underlying Token objects stored in " - "the Doc. Instead, create a new Span object and specify the " - "`kb_id` keyword argument, for example:\nfrom spacy.tokens " - "import Span\nspan = Span(doc, start={start}, end={end}, " - "label='{label}', kb_id='{kb_id}')") E132 = ("The vectors for entities and probabilities for alias '{alias}' " "should have equal length, but found {entities_length} and " "{probabilities_length} respectively.") @@ -461,8 +518,69 @@ class Errors: E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.") E200 = ("Can't yet set {attr} from Span. Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") + E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") - # TODO: fix numbering after merging develop into master + # New errors added in v3.x + E866 = ("A SpanGroup is not functional after the corresponding Doc has " + "been garbage collected. To keep using the spans, make sure that " + "the corresponding Doc object is still available in the scope of " + "your function.") + E867 = ("The 'textcat' component requires at least two labels because it " + "uses mutually exclusive classes where exactly one label is True " + "for each doc. For binary classification tasks, you can use two " + "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you " + "can use the 'textcat_multilabel' component with one label.") + E868 = ("Found a conflicting gold annotation in a reference document, " + "with the following char-based span occurring both in the gold ents " + "as well as in the negative spans: {span}.") + E869 = ("The notation '{label}' is not supported anymore. To annotate " + "negative NER samples, use `doc.spans[key]` instead, and " + "specify the key as 'incorrect_spans_key' when constructing " + "the NER component.") + E870 = ("Could not serialize the DocBin because it is too large. Consider " + "splitting up your documents into several doc bins and serializing " + "each separately. 
spacy.Corpus.v1 will search recursively for all " + "*.spacy files if you provide a directory instead of a filename as " + "the 'path'.") + E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}") + E872 = ("Unable to copy tokenizer from base model due to different " + 'tokenizer settings: current tokenizer config "{curr_config}" ' + 'vs. base model "{base_config}"') + E873 = ("Unable to merge a span from doc.spans with key '{key}' and text " + "'{text}'. This is likely a bug in spaCy, so feel free to open an " + "issue: https://github.com/explosion/spaCy/issues") + E874 = ("Could not initialize the tok2vec model from component " + "'{component}' and layer '{layer}'.") + E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. " + "In the config, these are defined by the initialize.vectors setting.") + E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to " + "a list of spans, with each span represented by a tuple (start_char, end_char). " + "The tuple can be optionally extended with a label and a KB ID.") + E880 = ("The 'wandb' library could not be found - did you install it? " + "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' " + "config section, instead of the 'WandbLogger'.") + E884 = ("The pipeline could not be initialized because the vectors " + "could not be found at '{vectors}'. If your pipeline was already " + "initialized/trained before, call 'resume_training' instead of 'initialize', " + "or initialize only the components that are new.") + E885 = ("entity_linker.set_kb received an invalid 'kb_loader' argument: expected " + "a callable function, but got: {arg_type}") + E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not " + "found in config for component '{name}'.") + E887 = ("Can't replace {name} -> {tok2vec} listeners: the paths to replace " + "({paths}) don't match the available listeners in the model ({n_listeners}).") + E888 = ("Can't replace listeners for '{name}' ({pipe}): invalid upstream " + "component that doesn't seem to support listeners. Expected Tok2Vec " + "or Transformer component. If you didn't call nlp.replace_listeners " + "manually, this is likely a bug in spaCy.") + E889 = ("Can't replace '{tok2vec}' listeners of component '{name}' because " + "'{unknown}' is not in the pipeline. Available components: {opts}. " + "If you didn't call nlp.replace_listeners manually, this is likely " + "a bug in spaCy.") + E890 = ("Cannot add the alias '{alias}' to the Knowledge base. " + "Each alias should be a meaningful string.") + E891 = ("Alias '{alias}' could not be added to the Knowledge base. " + "This is likely a bug in spaCy.") E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}") E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " "If you're using a custom function, make sure the code is available. " @@ -497,9 +615,9 @@ class Errors: "solve this, remove the existing directories in the output directory.") E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " "Try checking whitespace and delimiters. See " - "https://nightly.spacy.io/api/cli#convert") + "https://spacy.io/api/cli#convert") E903 = ("The token-per-line NER file is not formatted correctly. Try checking " - "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert") + "whitespace and delimiters. See https://spacy.io/api/cli#convert") E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. 
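Errors E886 to E889 above guard the listener replacement workflow that warning W087 points to. A hedged sketch of that workflow, assuming a pipeline such as en_core_web_sm is installed and its tagger listens to a shared tok2vec component:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # illustrative package name
# Give the tagger its own copy of the tok2vec layer so it no longer depends
# on the shared listener; this is the programmatic equivalent of setting
# replace_listeners = ["model.tok2vec"] in a sourced component's config block.
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
```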
This " "dimension refers to the output width, after the linear projection " "has been applied.") @@ -540,8 +658,10 @@ class Errors: E922 = ("Component '{name}' has been initialized with an output dimension of " "{nO} - cannot add any more labels.") E923 = ("It looks like there is no proper sample data to initialize the " - "Model of component '{name}'. This is likely a bug in spaCy, so " - "feel free to open an issue: https://github.com/explosion/spaCy/issues") + "Model of component '{name}'. To check your input data paths and " + "annotation, run: python -m spacy debug data config.cfg " + "and include the same config override values you would specify " + "for the 'spacy train' command.") E924 = ("The '{name}' component does not seem to be initialized properly. " "This is likely a bug in spaCy, so feel free to open an issue: " "https://github.com/explosion/spaCy/issues") @@ -565,7 +685,7 @@ class Errors: "method, make sure it's overwritten on the subclass.") E940 = ("Found NaN values in scores.") E941 = ("Can't find model '{name}'. It looks like you're trying to load a " - "model from a shortcut, which is deprecated as of spaCy v3.0. To " + "model from a shortcut, which is obsolete as of spaCy v3.0. To " "load the model, use its full name instead:\n\n" "nlp = spacy.load(\"{full}\")\n\nFor more details on the available " "models, see the models directory: https://spacy.io/models. If you " @@ -580,8 +700,8 @@ class Errors: "returned the initialized nlp object instead?") E944 = ("Can't copy pipeline component '{name}' from source '{model}': " "not found in pipeline. Available components: {opts}") - E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded " - "nlp object, but got: {source}") + E945 = ("Can't copy pipeline component '{name}' from source. Expected " + "loaded nlp object, but got: {source}") E947 = ("`Matcher.add` received invalid `greedy` argument: expected " "a string value from {expected} but got: '{arg}'") E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " @@ -676,7 +796,7 @@ class Errors: "to token boundaries.") E982 = ("The `Token.ent_iob` attribute should be an integer indexing " "into {values}, but found {value}.") - E983 = ("Invalid key for '{dict}': {key}. Available keys: " + E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: " "{keys}") E984 = ("Invalid component config for '{name}': component block needs either " "a key `factory` specifying the registered function used to " @@ -740,6 +860,24 @@ class Errors: "file.json .`.") E1015 = ("Can't initialize model from config: no {value} found. For more " "information, run: python -m spacy debug config config.cfg") + E1016 = ("The operators 'OP': '?', '*', and '+' are not supported in " + "DependencyMatcher token patterns. The token pattern in " + "RIGHT_ATTR should return matches that are each exactly one token " + "long. Invalid pattern:\n{node}") + E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency " + "parses. If no dependency labels are available, provide " + "placeholder deps such as `deps=[\"dep\"]*len(heads)`.") + E1018 = ("Knowledge base for component '{name}' is not set. " + "Make sure either `nel.initialize` or `nel.set_kb` " + "is called with a `kb_loader` function.") + E1019 = ("`noun_chunks` requires the pos tagging, which requires a " + "statistical model to be installed and loaded. 
For more info, see " + "the documentation:\nhttps://spacy.io/usage/models") + E1020 = ("No `epoch_resume` value specified and could not infer one from " + "filename. Specify an epoch to resume from.") + E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " + "Non-UD tags should use the `tag` property.") + E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/glossary.py b/spacy/glossary.py index c4a6a5c45..e45704fc5 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -58,7 +58,7 @@ GLOSSARY = { "FW": "foreign word", "HYPH": "punctuation mark, hyphen", "IN": "conjunction, subordinating or preposition", - "JJ": "adjective", + "JJ": "adjective (English), other noun-modifier (Chinese)", "JJR": "adjective, comparative", "JJS": "adjective, superlative", "LS": "list item marker", @@ -88,13 +88,14 @@ GLOSSARY = { "WP": "wh-pronoun, personal", "WP$": "wh-pronoun, possessive", "WRB": "wh-adverb", - "SP": "space", + "SP": "space (English), sentence-final particle (Chinese)", "ADD": "email", "NFP": "superfluous punctuation", "GW": "additional word in multi-word expression", "XX": "unknown", "BES": 'auxiliary "be"', "HVS": 'forms of "have"', + "_SP": "whitespace", # POS Tags (German) # TIGER Treebank # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf @@ -152,6 +153,40 @@ GLOSSARY = { "VVIZU": 'infinitive with "zu", full', "VVPP": "perfect participle, full", "XY": "non-word containing non-letter", + # POS Tags (Chinese) + # OntoNotes / Chinese Penn Treebank + # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports + "AD": "adverb", + "AS": "aspect marker", + "BA": "把 in ba-construction", + # "CD": "cardinal number", + "CS": "subordinating conjunction", + "DEC": "的 in a relative clause", + "DEG": "associative 的", + "DER": "得 in V-de const. and V-de-R", + "DEV": "地 before VP", + "ETC": "for words 等, 等等", + # "FW": "foreign words" + "IJ": "interjection", + # "JJ": "other noun-modifier", + "LB": "被 in long bei-const", + "LC": "localizer", + "M": "measure word", + "MSP": "other particle", + # "NN": "common noun", + "NR": "proper noun", + "NT": "temporal noun", + "OD": "ordinal number", + "ON": "onomatopoeia", + "P": "preposition excluding 把 and 被", + "PN": "pronoun", + "PU": "punctuation", + "SB": "被 in short bei-const", + # "SP": "sentence-final particle", + "VA": "predicative adjective", + "VC": "是 (copula)", + "VE": "有 as the main verb", + "VV": "other verb", # Noun chunks "NP": "noun phrase", "PP": "prepositional phrase", diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 4a71b26a2..a823dbe1e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -28,7 +28,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem - cpdef readonly Vocab vocab + cdef readonly Vocab vocab cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 10aa377eb..fed3009da 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True -from typing import Iterator, Iterable +from typing import Iterator, Iterable, Callable, Dict, Any import srsly from cymem.cymem cimport Pool @@ -23,7 +23,7 @@ cdef class Candidate: algorithm which will disambiguate the various candidates to the correct one. Each candidate (alias, entity) pair is assigned to a certain prior probability. 
- DOCS: https://nightly.spacy.io/api/kb/#candidate_init + DOCS: https://spacy.io/api/kb/#candidate_init """ def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): @@ -81,7 +81,7 @@ cdef class KnowledgeBase: """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. - DOCS: https://nightly.spacy.io/api/kb + DOCS: https://spacy.io/api/kb """ def __init__(self, Vocab vocab, entity_vector_length): @@ -93,6 +93,17 @@ cdef class KnowledgeBase: self.vocab = vocab self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + def initialize_entities(self, int64_t nr_entities): + self._entry_index = PreshMap(nr_entities + 1) + self._entries = entry_vec(nr_entities + 1) + + def initialize_vectors(self, int64_t nr_entities): + self._vectors_table = float_matrix(nr_entities + 1) + + def initialize_aliases(self, int64_t nr_aliases): + self._alias_index = PreshMap(nr_aliases + 1) + self._aliases_table = alias_vec(nr_aliases + 1) + @property def entity_vector_length(self): """RETURNS (uint64): length of the entity vectors""" @@ -144,8 +155,8 @@ cdef class KnowledgeBase: raise ValueError(Errors.E140) nr_entities = len(set(entity_list)) - self._entry_index = PreshMap(nr_entities+1) - self._entries = entry_vec(nr_entities+1) + self.initialize_entities(nr_entities) + self.initialize_vectors(nr_entities) i = 0 cdef KBEntryC entry @@ -164,8 +175,8 @@ cdef class KnowledgeBase: entry.entity_hash = entity_hash entry.freq = freq_list[i] - vector_index = self.c_add_vector(entity_vector=vector_list[i]) - entry.vector_index = vector_index + self._vectors_table[i] = entity_vector + entry.vector_index = i entry.feats_row = -1 # Features table currently not implemented @@ -187,6 +198,10 @@ cdef class KnowledgeBase: For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end """ + if alias is None or len(alias) == 0: + raise ValueError(Errors.E890.format(alias=alias)) + + previous_alias_nr = self.get_size_aliases() # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): raise ValueError(Errors.E132.format(alias=alias, @@ -220,6 +235,8 @@ cdef class KnowledgeBase: new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) self._alias_index[alias_hash] = new_index + if previous_alias_nr + 1 != self.get_size_aliases(): + raise RuntimeError(Errors.E891.format(alias=alias)) return alias_hash def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False): @@ -319,6 +336,103 @@ cdef class KnowledgeBase: return 0.0 + def to_bytes(self, **kwargs): + """Serialize the current state to a binary string. 
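To make the new serialization hooks concrete, here is a minimal sketch of building a small knowledge base and round-tripping it through to_bytes()/from_bytes(). The entity IDs, frequencies and vectors are made up for illustration.

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_entity(entity="Q463035", freq=3, entity_vector=[0.1, 0.2, 0.3])
# add_alias now raises E890 for empty aliases and E891 if the alias table
# did not grow as expected.
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.8, 0.1])

kb_bytes = kb.to_bytes()
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.from_bytes(kb_bytes)
assert kb2.get_size_entities() == kb.get_size_entities()
```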
+ """ + def serialize_header(): + header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length) + return srsly.json_dumps(header) + + def serialize_entries(): + i = 1 + tuples = [] + for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): + entry = self._entries[entry_index] + assert entry.entity_hash == entry_hash + assert entry_index == i + tuples.append((entry.entity_hash, entry.freq, entry.vector_index)) + i = i + 1 + return srsly.json_dumps(tuples) + + def serialize_aliases(): + i = 1 + headers = [] + indices_lists = [] + probs_lists = [] + for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): + alias = self._aliases_table[alias_index] + assert alias_index == i + candidate_length = len(alias.entry_indices) + headers.append((alias_hash, candidate_length)) + indices_lists.append(alias.entry_indices) + probs_lists.append(alias.probs) + i = i + 1 + headers_dump = srsly.json_dumps(headers) + indices_dump = srsly.json_dumps(indices_lists) + probs_dump = srsly.json_dumps(probs_lists) + return srsly.json_dumps((headers_dump, indices_dump, probs_dump)) + + serializers = { + "header": serialize_header, + "entity_vectors": lambda: srsly.json_dumps(self._vectors_table), + "entries": serialize_entries, + "aliases": serialize_aliases, + } + return util.to_bytes(serializers, []) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load state from a binary string. + """ + def deserialize_header(b): + header = srsly.json_loads(b) + nr_entities = header[0] + nr_aliases = header[1] + entity_vector_length = header[2] + self.initialize_entities(nr_entities) + self.initialize_vectors(nr_entities) + self.initialize_aliases(nr_aliases) + self.entity_vector_length = entity_vector_length + + def deserialize_vectors(b): + self._vectors_table = srsly.json_loads(b) + + def deserialize_entries(b): + cdef KBEntryC entry + tuples = srsly.json_loads(b) + i = 1 + for (entity_hash, freq, vector_index) in tuples: + entry.entity_hash = entity_hash + entry.freq = freq + entry.vector_index = vector_index + entry.feats_row = -1 # Features table currently not implemented + self._entries[i] = entry + self._entry_index[entity_hash] = i + i += 1 + + def deserialize_aliases(b): + cdef AliasC alias + i = 1 + all_data = srsly.json_loads(b) + headers = srsly.json_loads(all_data[0]) + indices = srsly.json_loads(all_data[1]) + probs = srsly.json_loads(all_data[2]) + for header, indices, probs in zip(headers, indices, probs): + alias_hash, candidate_length = header + alias.entry_indices = indices + alias.probs = probs + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + i += 1 + + setters = { + "header": deserialize_header, + "entity_vectors": deserialize_vectors, + "entries": deserialize_entries, + "aliases": deserialize_aliases, + } + util.from_bytes(bytes_data, setters, exclude) + return self + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): path = ensure_path(path) if not path.exists(): @@ -336,7 +450,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E929.format(loc=path)) if not path.is_dir(): raise ValueError(Errors.E928.format(loc=path)) - deserialize = {} + deserialize: Dict[str, Callable[[Any], Any]] = {} deserialize["contents"] = lambda p: self.read_contents(p) deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p) util.from_disk(path, deserialize, exclude) @@ -398,10 +512,9 @@ cdef class KnowledgeBase: cdef int64_t entity_vector_length 
reader.read_header(&nr_entities, &entity_vector_length) + self.initialize_entities(nr_entities) + self.initialize_vectors(nr_entities) self.entity_vector_length = entity_vector_length - self._entry_index = PreshMap(nr_entities+1) - self._entries = entry_vec(nr_entities+1) - self._vectors_table = float_matrix(nr_entities+1) # STEP 1: load entity vectors cdef int i = 0 @@ -439,8 +552,7 @@ cdef class KnowledgeBase: # STEP 3: load aliases cdef int64_t nr_aliases reader.read_alias_length(&nr_aliases) - self._alias_index = PreshMap(nr_aliases+1) - self._aliases_table = alias_vec(nr_aliases+1) + self.initialize_aliases(nr_aliases) cdef int64_t nr_candidates cdef vector[int64_t] entry_indices diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py index 91917daee..553fcbf4c 100644 --- a/spacy/lang/af/__init__.py +++ b/spacy/lang/af/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class AfrikaansDefaults(Language.Defaults): +class AfrikaansDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/am/__init__.py b/spacy/lang/am/__init__.py index ed21b55ee..ddae556d6 100644 --- a/spacy/lang/am/__init__.py +++ b/spacy/lang/am/__init__.py @@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...language import Language +from ...language import Language, BaseDefaults from ...attrs import LANG from ...util import update_exc -class AmharicDefaults(Language.Defaults): +class AmharicDefaults(BaseDefaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "am" diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py index 0d23e4331..5487ada5a 100644 --- a/spacy/lang/am/stop_words.py +++ b/spacy/lang/am/stop_words.py @@ -1,6 +1,33 @@ -# Stop words +# Stop words by Teshome Kassie http://etd.aau.edu.et/bitstream/handle/123456789/3315/Teshome%20Kassie.pdf?sequence=1&isAllowed=y +# Stop words by Tihitina Petros http://etd.aau.edu.et/bitstream/handle/123456789/3384/Tihitina%20Petros.pdf?sequence=1&isAllowed=y + STOP_WORDS = set( """ ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን +ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ +አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ +አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ +አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል +ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን +ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል +ነው እንደገለጹት አለ ና እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች +እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ +ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ +ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ +ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም +በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ +ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት +የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ +ይህንንም እነደሆነ እነዚህን ይኸው የማናቸውም +በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ +የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ +ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም +የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን +ለሆነ በሌሎች ሁለቱንም በቀር ይህ በታች አንደሆነ በነሱ +ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ +በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና +በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም +ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ +ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም """.split() ) diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index 6abb65efb..18c1f90ed 100644 --- 
a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -2,10 +2,10 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from ...language import Language +from ...language import Language, BaseDefaults -class ArabicDefaults(Language.Defaults): +class ArabicDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py new file mode 100644 index 000000000..476898364 --- /dev/null +++ b/spacy/lang/az/__init__.py @@ -0,0 +1,16 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from ...language import Language, BaseDefaults + + +class AzerbaijaniDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class Azerbaijani(Language): + lang = "az" + Defaults = AzerbaijaniDefaults + + +__all__ = ["Azerbaijani"] diff --git a/spacy/lang/az/examples.py b/spacy/lang/az/examples.py new file mode 100644 index 000000000..f3331a8cb --- /dev/null +++ b/spacy/lang/az/examples.py @@ -0,0 +1,18 @@ +""" +Example sentences to test spaCy and its language models. +>>> from spacy.lang.az.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Bu bir cümlədir.", + "Necəsən?", + "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.", + "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.", + "Atılan növbəti mərmilər lap yaxınlıqda partladı.", + "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.", + "Marsda ilk sınaq uçuşu həyata keçirilib.", + "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.", + "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.", +] diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py new file mode 100644 index 000000000..73a5e2762 --- /dev/null +++ b/spacy/lang/az/lex_attrs.py @@ -0,0 +1,89 @@ +from ...attrs import LIKE_NUM + + +# Eleven, twelve etc. 
are written separate: on bir, on iki + +_num_words = [ + "bir", + "iki", + "üç", + "dörd", + "beş", + "altı", + "yeddi", + "səkkiz", + "doqquz", + "on", + "iyirmi", + "otuz", + "qırx", + "əlli", + "altmış", + "yetmiş", + "səksən", + "doxsan", + "yüz", + "min", + "milyon", + "milyard", + "trilyon", + "kvadrilyon", + "kentilyon", +] + + +_ordinal_words = [ + "birinci", + "ikinci", + "üçüncü", + "dördüncü", + "beşinci", + "altıncı", + "yedinci", + "səkkizinci", + "doqquzuncu", + "onuncu", + "iyirminci", + "otuzuncu", + "qırxıncı", + "əllinci", + "altmışıncı", + "yetmişinci", + "səksəninci", + "doxsanıncı", + "yüzüncü", + "mininci", + "milyonuncu", + "milyardıncı", + "trilyonuncu", + "kvadrilyonuncu", + "kentilyonuncu", +] + +_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü") + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + # Check cardinal number + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith(_ordinal_endings): + if text_lower[:-3].isdigit() or text_lower[:-4].isdigit(): + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py new file mode 100644 index 000000000..2114939ba --- /dev/null +++ b/spacy/lang/az/stop_words.py @@ -0,0 +1,145 @@ +# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py +STOP_WORDS = set( + """ +amma +arasında +artıq +ay +az +bax +belə +beş +bilər +bir +biraz +biri +birşey +biz +bizim +bizlər +bu +buna +bundan +bunların +bunu +bunun +buradan +bütün +bəli +bəlkə +bəy +bəzi +bəzən +daha +dedi +deyil +dir +düz +də +dək +dən +dəqiqə +edir +edən +elə +et +etdi +etmə +etmək +faiz +gilə +görə +ha +haqqında +harada +heç +hə +həm +həmin +həmişə +hər +idi +il +ildə +ilk +ilə +in +indi +istifadə +isə +ki +kim +kimi +kimə +lakin +lap +mirşey +məhz +mən +mənə +niyə +nə +nəhayət +o +obirisi +of +olan +olar +olaraq +oldu +olduğu +olmadı +olmaz +olmuşdur +olsun +olur +on +ona +ondan +onlar +onlardan +onların +onsuzda +onu +onun +oradan +qarşı +qədər +saat +sadəcə +saniyə +siz +sizin +sizlər +sonra +səhv +sən +sənin +sənə +təəssüf +var +və +xan +xanım +xeyr +ya +yalnız +yaxşı +yeddi +yenə +yox +yoxdur +yoxsa +yəni +zaman +çox +çünki +öz +özü +üçün +əgər +əlbəttə +ən +əslində +""".split() +) diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index a30f49ce7..559cc34c4 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,9 +1,21 @@ from .stop_words import STOP_WORDS -from ...language import Language +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .lex_attrs import LEX_ATTRS +from ..tokenizer_exceptions import BASE_EXCEPTIONS + +from ...language import Language, BaseDefaults +from ...attrs import LANG +from ...util import update_exc -class BulgarianDefaults(Language.Defaults): +class BulgarianDefaults(BaseDefaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "bg" + + lex_attr_getters.update(LEX_ATTRS) + stop_words = STOP_WORDS + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) class Bulgarian(Language): diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py new 
file mode 100644 index 000000000..bba3c74cd --- /dev/null +++ b/spacy/lang/bg/lex_attrs.py @@ -0,0 +1,88 @@ +from ...attrs import LIKE_NUM + + +_num_words = [ + "нула", + "едно", + "един", + "една", + "две", + "три", + "четири", + "пет", + "шест", + "седем", + "осем", + "девет", + "десет", + "единадесет", + "единайсет", + "дванадесет", + "дванайсет", + "тринадесет", + "тринайсет", + "четиринадесет", + "четиринайсет", + "петнадесет", + "петнайсет", + "шестнадесет", + "шестнайсет", + "седемнадесет", + "седемнайсет", + "осемнадесет", + "осемнайсет", + "деветнадесет", + "деветнайсет", + "двадесет", + "двайсет", + "тридесет", + "трийсет", + "четиридесет", + "четиресет", + "петдесет", + "шестдесет", + "шейсет", + "седемдесет", + "осемдесет", + "деветдесет", + "сто", + "двеста", + "триста", + "четиристотин", + "петстотин", + "шестстотин", + "седемстотин", + "осемстотин", + "деветстотин", + "хиляда", + "милион", + "милиона", + "милиард", + "милиарда", + "трилион", + "трилионa", + "билион", + "билионa", + "квадрилион", + "квадрилионa", + "квинтилион", + "квинтилионa", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py new file mode 100644 index 000000000..0b7487c64 --- /dev/null +++ b/spacy/lang/bg/tokenizer_exceptions.py @@ -0,0 +1,67 @@ +from ...symbols import ORTH, NORM + + +_exc = {} + + +_abbr_exc = [ + {ORTH: "м", NORM: "метър"}, + {ORTH: "мм", NORM: "милиметър"}, + {ORTH: "см", NORM: "сантиметър"}, + {ORTH: "дм", NORM: "дециметър"}, + {ORTH: "км", NORM: "километър"}, + {ORTH: "кг", NORM: "килограм"}, + {ORTH: "мг", NORM: "милиграм"}, + {ORTH: "г", NORM: "грам"}, + {ORTH: "т", NORM: "тон"}, + {ORTH: "хл", NORM: "хектолиър"}, + {ORTH: "дкл", NORM: "декалитър"}, + {ORTH: "л", NORM: "литър"}, +] +for abbr in _abbr_exc: + _exc[abbr[ORTH]] = [abbr] + +_abbr_line_exc = [ + {ORTH: "г-жа", NORM: "госпожа"}, + {ORTH: "г-н", NORM: "господин"}, + {ORTH: "г-ца", NORM: "госпожица"}, + {ORTH: "д-р", NORM: "доктор"}, + {ORTH: "о-в", NORM: "остров"}, + {ORTH: "п-в", NORM: "полуостров"}, +] + +for abbr in _abbr_line_exc: + _exc[abbr[ORTH]] = [abbr] + +_abbr_dot_exc = [ + {ORTH: "акад.", NORM: "академик"}, + {ORTH: "ал.", NORM: "алинея"}, + {ORTH: "арх.", NORM: "архитект"}, + {ORTH: "бл.", NORM: "блок"}, + {ORTH: "бр.", NORM: "брой"}, + {ORTH: "бул.", NORM: "булевард"}, + {ORTH: "в.", NORM: "век"}, + {ORTH: "г.", NORM: "година"}, + {ORTH: "гр.", NORM: "град"}, + {ORTH: "ж.р.", NORM: "женски род"}, + {ORTH: "инж.", NORM: "инженер"}, + {ORTH: "лв.", NORM: "лев"}, + {ORTH: "м.р.", NORM: "мъжки род"}, + {ORTH: "мат.", NORM: "математика"}, + {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "пл.", NORM: "площад"}, + {ORTH: "проф.", NORM: "професор"}, + {ORTH: "с.", NORM: "село"}, + {ORTH: "с.р.", NORM: "среден род"}, + {ORTH: "св.", NORM: "свети"}, + {ORTH: "сп.", NORM: "списание"}, + {ORTH: "стр.", NORM: "страница"}, + {ORTH: "ул.", NORM: "улица"}, + {ORTH: "чл.", NORM: "член"}, +] + +for abbr in _abbr_dot_exc: + _exc[abbr[ORTH]] = [abbr] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 879229888..4eb9735df 100644 --- a/spacy/lang/bn/__init__.py 
+++ b/spacy/lang/bn/__init__.py @@ -3,11 +3,11 @@ from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer -class BengaliDefaults(Language.Defaults): +class BengaliDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES @@ -23,11 +23,13 @@ class Bengali(Language): @Bengali.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return Lemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Bengali"] diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 970b23c1e..250ae9463 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,15 +1,23 @@ +from typing import Optional + +from thinc.api import Model + from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from .syntax_iterators import SYNTAX_ITERATORS +from ...language import Language, BaseDefaults +from .lemmatizer import CatalanLemmatizer -class CatalanDefaults(Language.Defaults): +class CatalanDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS class Catalan(Language): @@ -17,4 +25,16 @@ class Catalan(Language): Defaults = CatalanDefaults +@Catalan.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Catalan"] diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py new file mode 100644 index 000000000..2fd012912 --- /dev/null +++ b/spacy/lang/ca/lemmatizer.py @@ -0,0 +1,81 @@ +from typing import List, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class CatalanLemmatizer(Lemmatizer): + """ + Copied from French Lemmatizer + Catalan language lemmatizer applies the default rule based lemmatization + procedure with some modifications for better Catalan language support. + + The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use + the rule-based lemmatization. As a last resort, the lemmatizer checks in + the lookup table. 
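The lemmatizer factories in this patch (Bengali above, Catalan here, and Greek, English and Spanish further down) all gain an overwrite setting. A small sketch of wiring up the new Catalan rule lemmatizer, assuming the required Catalan tables are available through the spacy-lookups-data package:

```python
import spacy

nlp = spacy.blank("ca")
# "overwrite" is the newly added factory setting; False keeps existing lemmas.
nlp.add_pipe("lemmatizer", config={"mode": "rule", "overwrite": False})
# initialize() loads lemma_rules, lemma_exc, lemma_index and lemma_lookup,
# the tables listed in CatalanLemmatizer.get_lookups_config().
nlp.initialize()
```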
+ """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) + else: + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + elif "lemma_rules" not in self.lookups or univ_pos not in ( + "noun", + "verb", + "adj", + "adp", + "adv", + "aux", + "cconj", + "det", + "pron", + "punct", + "sconj", + ): + return self.lookup_lemmatize(token) + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + lookup_table = self.lookups.get_table("lemma_lookup", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, []) + string = string.lower() + forms = [] + if string in index: + forms.append(string) + self.cache[cache_key] = forms + return forms + forms.extend(exceptions.get(string, [])) + oov_forms = [] + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) + if not forms and string in lookup_table.keys(): + forms.append(self.lookup_lemmatize(token)[0]) + if not forms: + forms.append(string) + forms = list(dict.fromkeys(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index d50b75589..39db08f17 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,12 +1,46 @@ -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import ALPHA +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import CURRENCY +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT +from ..char_classes import merge_chars, _units ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION), + ] +) + +_units = _units.replace("% ", "") +UNITS = merge_chars(_units) + +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [r"-", "—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py new file mode 100644 index 000000000..917e07c93 --- /dev/null +++ b/spacy/lang/ca/syntax_iterators.py @@ -0,0 +1,48 @@ +from typing 
import Union, Iterator, Tuple +from ...tokens import Doc, Span +from ...symbols import NOUN, PROPN +from ...errors import Errors + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] + np_label = doc.vocab.strings.add("NP") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + left = word.left_edge.i + right = word.right_edge.i + 1 + # leave prepositions and punctuation out of the left side of the chunk + if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT": + left = word.left_edge.i + 1 + prev_end = word.right_edge.i + # leave subordinated clauses and appositions out of the chunk + a = word.i + 1 + while a < word.right_edge.i: + paraula = doc[a] + if paraula.pos_ == "VERB": + right = paraula.left_edge.i + prev_end = paraula.left_edge.i - 1 + elif paraula.dep_ == "appos": + right = paraula.left_edge.i + 1 + prev_end = paraula.left_edge.i - 1 + a += 1 + # leave punctuation out of the right side of the chunk + if word.right_edge.pos_ == "PUNCT": + right = right - 1 + yield left, right, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index b465e97ba..5f9a50f5e 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -24,6 +24,13 @@ for exc_data in [ {ORTH: "núm", NORM: "número"}, {ORTH: "St.", NORM: "sant"}, {ORTH: "Sta.", NORM: "santa"}, + {ORTH: "'l"}, + {ORTH: "'ls"}, + {ORTH: "'m"}, + {ORTH: "'n"}, + {ORTH: "'ns"}, + {ORTH: "'s"}, + {ORTH: "'t"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 6fbc45817..9e5441a4f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -260,7 +260,10 @@ _units = ( "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб" "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب" ) -_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴" +_currency = ( + r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ " + r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿" +) # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py index 26f5845cc..3e70e4078 100644 --- a/spacy/lang/cs/__init__.py +++ b/spacy/lang/cs/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class CzechDefaults(Language.Defaults): +class CzechDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index c5260ccdd..e148a7b4f 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import 
STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from ...language import Language +from ...language import Language, BaseDefaults -class DanishDefaults(Language.Defaults): +class DanishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py index 39181d753..a0b70f004 100644 --- a/spacy/lang/da/syntax_iterators.py +++ b/spacy/lang/da/syntax_iterators.py @@ -1,8 +1,10 @@ +from typing import Union, Iterator, Tuple +from ...tokens import Doc, Span from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...errors import Errors -def noun_chunks(doclike): +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: def is_verb_token(tok): return tok.pos in [VERB, AUX] @@ -32,7 +34,7 @@ def noun_chunks(doclike): def get_bounds(doc, root): return get_left_bound(doc, root), get_right_bound(doc, root) - doc = doclike.doc + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index b645d3480..65863c098 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS -from ...language import Language +from ...language import Language, BaseDefaults -class GermanDefaults(Language.Defaults): +class GermanDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index aba0e8024..e80504998 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" # this iterator extracts spans headed by NOUNs starting from the left-most # syntactic dependent until the NOUN itself for close apposition and diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 53069334e..258b37a8a 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lemmatizer import GreekLemmatizer -from ...language import Language +from ...language import Language, BaseDefaults -class GreekDefaults(Language.Defaults): +class GreekDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES @@ -28,11 +28,13 @@ class Greek(Language): @Greek.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return GreekLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Greek"] diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index a049601dc..631848af4 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer): forms.extend(oov_forms) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 89cfd8b72..18fa46695 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" # It follows the logic of the noun chunks finder of English language, # adjusted to some Greek language special characteristics. 
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 3a3ebeefd..854f59224 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES from .lemmatizer import EnglishLemmatizer -from ...language import Language +from ...language import Language, BaseDefaults -class EnglishDefaults(Language.Defaults): +class EnglishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS @@ -26,11 +26,13 @@ class English(Language): @English.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return EnglishLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["English"] diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py index fcc7c6bf2..ab9353919 100644 --- a/spacy/lang/en/lex_attrs.py +++ b/spacy/lang/en/lex_attrs.py @@ -19,7 +19,7 @@ _ordinal_words = [ # fmt: on -def like_num(text: str) -> bool: +def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] text = text.replace(",", "").replace(".", "") @@ -35,7 +35,7 @@ def like_num(text: str) -> bool: # Check ordinal number if text_lower in _ordinal_words: return True - if text_lower.endswith("th"): + if text_lower.endswith(("st", "nd", "rd", "th")): if text_lower[:-2].isdigit(): return True return False diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 00a1bac42..7904e5621 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
""" diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index d69508470..55b544e42 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -1,9 +1,10 @@ +from typing import Dict, List from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc -_exc = {} +_exc: Dict[str, List[Dict]] = {} _exclude = [ "Ill", "ill", @@ -294,9 +295,9 @@ for verb_data in [ {ORTH: "has", NORM: "has"}, {ORTH: "dare", NORM: "dare"}, ]: - verb_data_tc = dict(verb_data) + verb_data_tc = dict(verb_data) # type: ignore[call-overload] verb_data_tc[ORTH] = verb_data_tc[ORTH].title() - for data in [verb_data, verb_data_tc]: + for data in [verb_data, verb_data_tc]: # type: ignore[assignment] _exc[data[ORTH] + "n't"] = [ dict(data), {ORTH: "n't", NORM: "not"}, diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 9a47855b1..f5d1eb97a 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,12 +1,15 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .lemmatizer import SpanishLemmatizer from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES -from ...language import Language +from ...language import Language, BaseDefaults -class SpanishDefaults(Language.Defaults): +class SpanishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES @@ -20,4 +23,16 @@ class Spanish(Language): Defaults = SpanishDefaults +@Spanish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Spanish"] diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py new file mode 100644 index 000000000..ca5fc08c8 --- /dev/null +++ b/spacy/lang/es/lemmatizer.py @@ -0,0 +1,428 @@ +from typing import List, Optional, Tuple +import re + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class SpanishLemmatizer(Lemmatizer): + """ + Spanish rule-based lemmatizer with morph-based rule selection. 
+ """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_rules", "lemma_rules_groups", "lemma_index", "lemma_exc"] + return (required, []) + else: + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos, str(token.morph)) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + pos = token.pos_.lower() + features = set(token.morph) + if pos in ("", "eol", "space"): + return [string.lower()] + if pos in ( + "adp", + "cconj", + "intj", + "part", + "propn", + "punct", + "sconj", + "sym", + "x", + ): + if token.is_sent_start and pos != "propn": + return [string.lower()] + else: + return [string] + + string = string.lower() + exc = self.lookups.get_table("lemma_exc").get(pos, {}).get(string) + if exc is not None: + lemmas = list(exc) + else: + if pos == "aux": + rule_pos = "verb" + else: + rule_pos = pos + rule = self.select_rule(rule_pos, list(features)) + index = self.lookups.get_table("lemma_index").get(rule_pos, []) + lemmas = getattr(self, "lemmatize_" + rule_pos)( + string, features, rule, index + ) + # Remove duplicates but preserve the ordering + lemmas = list(dict.fromkeys(lemmas)) + + self.cache[cache_key] = lemmas + return lemmas + + def select_rule(self, pos: str, features: List[str]) -> Optional[str]: + groups = self.lookups.get_table("lemma_rules_groups") + if pos in groups: + for group in groups[pos]: + if set(group[1]).issubset(features): + return group[0] + return None + + def lemmatize_adj( + self, word: str, features: List[str], rule: str, index: List[str] + ) -> List[str]: + """ + Lemmatize an adjective. + + word (str): The word to lemmatize. + features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. + """ + + # Initialize empty lists for the generated lemmas + possible_lemmas = [] + selected_lemmas = [] + + # Apply lemmatization rules + for old, new in self.lookups.get_table("lemma_rules").get(rule, []): + possible_lemma = re.sub(old + "$", new, word) + if possible_lemma != word: + possible_lemmas.append(possible_lemma) + + # Additional rule for plurals that go from esdrújula to grave and end in + # 'n' or 's', e.g., jóvenes -> joven + additional_lemmas = [] + if "Number=Plur" in features: + for possible_lemma in possible_lemmas: + if possible_lemma.endswith("n") or possible_lemma.endswith("s"): + for old, new in self.lookups.get_table("lemma_rules").get( + "accents", [] + ): + additional_lemmas.append(re.sub(old, new, possible_lemma)) + possible_lemmas.extend(additional_lemmas) + + for lemma in possible_lemmas: + if lemma in index: + selected_lemmas.append(lemma) + # If one or more of the created possible lemmas are in the lookup list, + # return all of them + if len(selected_lemmas) > 0: + return selected_lemmas + elif len(possible_lemmas) > 0: + return possible_lemmas + else: + return [word] + + def lemmatize_adv( + self, word: str, features: List[str], rule: str, index: List[str] + ) -> List[str]: + """ + Lemmatize an adverb. + + word (str): The word to lemmatize. + features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. 
+ """ + + # Apply lemmatization rules + for old, new in self.lookups.get_table("lemma_rules").get("adverbs", []): + if word == old: + return [new] + + # If none of the rules applies, return the original word + return [word] + + def lemmatize_det( + self, word: str, features: List[str], rule: str, index: List[str] + ) -> List[str]: + """ + Lemmatize a determiner. + + word (str): The word to lemmatize. + features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. + """ + + # Initialize empty lists for the generated lemmas + possible_lemmas = [] + selected_lemmas = [] + + # First, search in rules specific to determiners + for old, new in self.lookups.get_table("lemma_rules").get("det", []): + if word == old: + return [new] + # If none of the specfic rules apply, search in the common rules for + # determiners and pronouns that follow a unique pattern for + # lemmatization. If the word is in the list, return the corresponding + # lemma. + for old, new in self.lookups.get_table("lemma_rules").get( + "det_and_pron_fixed", [] + ): + if word == old: + return [new] + # If the word is not in the list of unique determiners and pronouns, + # apply general rules of lemmatization. Include the original word in the # list of possible lemmas. + for old, new in self.lookups.get_table("lemma_rules").get( + "det_and_pron_general", [] + ): + possible_lemma = re.sub(old + "$", new, word) + possible_lemmas.append(possible_lemma) + possible_lemmas.append(word) + + if len(possible_lemmas) == 1: + return possible_lemmas + elif len(possible_lemmas) > 1: + for lemma in possible_lemmas: + if lemma in index: + selected_lemmas.append(lemma) + if len(selected_lemmas) >= 1: + return selected_lemmas + else: + return possible_lemmas + else: + return [] + + def lemmatize_noun( + self, word: str, features: List[str], rule: str, index: List[str] + ) -> List[str]: + """ + Lemmatize a noun. + + word (str): The word to lemmatize. + features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. + """ + + # Initialize empty lists for the generated lemmas + possible_lemmas = [] + selected_lemmas = [] + + # Apply lemmatization rules + for old, new in self.lookups.get_table("lemma_rules").get(rule, []): + possible_lemma = re.sub(old + "$", new, word) + if possible_lemma != word: + possible_lemmas.append(possible_lemma) + + # Additional rule for plurals that go from esdrújula to grave and end in + # 'n' or 's', e.g., órdenes -> orden, exámenes -> examen + additional_lemmas = [] + if "Number=Plur" in features: + for possible_lemma in possible_lemmas: + if possible_lemma.endswith("n") or possible_lemma.endswith("s"): + for old, new in self.lookups.get_table("lemma_rules").get( + "accents", [] + ): + additional_lemmas.append(re.sub(old, new, possible_lemma)) + possible_lemmas.extend(additional_lemmas) + + for lemma in possible_lemmas: + if lemma in index: + selected_lemmas.append(lemma) + # If one or more of the created possible lemmas are in the lookup list, + # return all of them + if len(selected_lemmas) > 0: + return selected_lemmas + elif len(possible_lemmas) > 0: + return possible_lemmas + else: + return [word] + + def lemmatize_num( + self, word: str, features: List[str], rule: str, index: List[str] + ) -> List[str]: + """ + Lemmatize a numeral. + + word (str): The word to lemmatize. 
+ features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. + """ + + # If the word is in the list of rules for numerals, return the + # corresponding lemma + for old, new in self.lookups.get_table("lemma_rules").get("num", []): + if word == old: + return [new] + + # Normalize punctuation + splitted_word = word.split(",") + if re.search(r"(\.)([0-9]{3})$", splitted_word[0]): + word = re.sub(r"\.", r"", word) + word = re.sub(r",", r".", word) + return [word] + + def lemmatize_pron( + self, word: str, features: List[str], rule: Optional[str], index: List[str] + ) -> List[str]: + """ + Lemmatize a pronoun. + + word (str): The word to lemmatize. + features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. + """ + + # Initialize empty lists for the generated lemmas + possible_lemmas = [] + selected_lemmas = [] + + # First, search in rules specific to pronouns + for old, new in self.lookups.get_table("lemma_rules").get("pron", []): + if word == old: + return [new] + # If none of the specfic rules apply, search in the common rules for + # determiners and pronouns that follow a unique pattern for + # lemmatization. If the word is in the list, return the corresponding + # lemma. + for old, new in self.lookups.get_table("lemma_rules").get( + "det_and_pron_fixed", [] + ): + if word == old: + return [new] + # If the word is not in the list of unique determiners and pronouns, + # apply general rules of lemmatization. Include the original word in the + # list of possible lemmas. + for old, new in self.lookups.get_table("lemma_rules").get( + "det_and_pron_general", [] + ): + possible_lemma = re.sub(old + "$", new, word) + if possible_lemma != word: + possible_lemmas.append(possible_lemma) + possible_lemmas.append(word) + + if len(possible_lemmas) == 1: + return possible_lemmas + elif len(possible_lemmas) > 1: + for lemma in possible_lemmas: + if lemma in index: + selected_lemmas.append(lemma) + if len(selected_lemmas) >= 1: + return selected_lemmas + else: + return possible_lemmas + else: + return [] + + def lemmatize_verb( + self, word: str, features: List[str], rule: Optional[str], index: List[str] + ) -> List[str]: + """ + Lemmatize a verb. + + word (str): The word to lemmatize. + features (List[str]): The morphological features as a list of Feat=Val + pairs. + index (List[str]): The POS-specific lookup list. + + RETURNS (List[str]): The list of lemmas. 
+ """ + # Exceptions for verb+pronoun(s) + if "PronType=Prs" in features: + return self.lemmatize_verb_pron(word, features, rule, index) + + # Initialize empty lists for the generated lemmas + possible_lemmas = [] + selected_lemmas = [] + + # Apply lemmatization rules + rule = str(rule or "") + for old, new in self.lookups.get_table("lemma_rules").get(rule, []): + possible_lemma = re.sub(old + "$", new, word) + if possible_lemma != word: + possible_lemmas.append(possible_lemma) + + for lemma in possible_lemmas: + if lemma in index: + selected_lemmas.append(lemma) + if len(selected_lemmas) == 0: + # If none of the possible lemmas are in the lookup list, + # apply vocalic alternation rules and search in the lookup list + # again + for lemma in possible_lemmas: + for old, new in self.lookups.get_table("lemma_rules").get( + "voc_alt_1", [] + ): + if old in lemma: + for i, char in enumerate(lemma): + if char == old: + voc_alt_lemma = lemma[:i] + new + lemma[i + 1 :] + if voc_alt_lemma in index: + selected_lemmas.append(voc_alt_lemma) + for old, new in self.lookups.get_table("lemma_rules").get( + "voc_alt_2", [] + ): + if old in lemma: + voc_alt_lemma = lemma.replace(old, new, 1) + if voc_alt_lemma in index: + selected_lemmas.append(voc_alt_lemma) + # Additional rule for verbs that lose the accent mark when lemmatized, + # e.g., amplían -> ampliar + additional_lemmas = [] + for possible_lemma in possible_lemmas: + for old, new in self.lookups.get_table("lemma_rules").get("accents", []): + additional_lemmas.append(re.sub(old, new, possible_lemma)) + possible_lemmas.extend(additional_lemmas) + + # If one or more of the created possible lemmas are in the lookup list, + # return all of them + if len(selected_lemmas) > 0: + return selected_lemmas + elif len(possible_lemmas) > 0: + return possible_lemmas + else: + return [word] + + def lemmatize_verb_pron( + self, word: str, features: List[str], rule: Optional[str], index: List[str] + ) -> List[str]: + # Strip and collect pronouns + pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$" + prons: List[str] = [] + verb = word + m = re.search(pron_patt, verb) + while m is not None and len(prons) <= 3: + verb = re.sub(m.group(2) + "$", "", verb) + prons = [m.group(2)] + prons + m = re.search(pron_patt, verb) + # Strip accents from verb form + for old, new in self.lookups.get_table("lemma_rules").get("accents", []): + verb = re.sub(old, new, verb) + # Lemmatize the verb and pronouns, checking for exceptions + exc = self.lookups.get_table("lemma_exc").get("verb", {}).get(verb) + if exc is not None: + verb_lemma = exc[0] + else: + rule = self.select_rule("verb", features) + verb_lemma = self.lemmatize_verb( + verb, features - {"PronType=Prs"}, rule, index # type: ignore[operator] + )[0] + pron_lemmas = [] + for pron in prons: + exc = self.lookups.get_table("lemma_exc").get("pron", {}).get(pron) + if exc is not None: + pron_lemmas.append(exc[0]) + else: + rule = self.select_rule("pron", features) + pron_lemmas.append(self.lemmatize_pron(pron, features, rule, index)[0]) + return [verb_lemma + " " + " ".join(pron_lemmas)] diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index e753a3f98..8b385a1b9 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...errors import Errors from ...tokens import Doc, Span, Token -def noun_chunks(doclike: Union[Doc, Span]) 
-> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" doc = doclike.doc if not doc.has_annotation("DEP"): diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index fbfe75545..74cdc143d 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -51,6 +51,9 @@ for orth in [ "Dr.", "Dra.", "EE.UU.", + "Ee.Uu.", + "EE. UU.", + "Ee. Uu.", "etc.", "fig.", "Gob.", @@ -65,9 +68,11 @@ for orth in [ "Prof.", "Profa.", "q.e.p.d.", - "Q.E.P.D." "S.A.", + "Q.E.P.D.", + "S.A.", "S.L.", - "S.R.L." "s.s.s.", + "S.R.L.", + "s.s.s.", "Sr.", "Sra.", "Srta.", diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py index 9f71882d2..274bc1309 100644 --- a/spacy/lang/et/__init__.py +++ b/spacy/lang/et/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class EstonianDefaults(Language.Defaults): +class EstonianDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py index 89550be96..3346468bd 100644 --- a/spacy/lang/eu/__init__.py +++ b/spacy/lang/eu/__init__.py @@ -1,10 +1,10 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES -from ...language import Language +from ...language import Language, BaseDefaults -class BasqueDefaults(Language.Defaults): +class BasqueDefaults(BaseDefaults): suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 77ee3bca3..6db64ff62 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -5,11 +5,11 @@ from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES from .syntax_iterators import SYNTAX_ITERATORS -from ...language import Language +from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer -class PersianDefaults(Language.Defaults): +class PersianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS suffixes = TOKENIZER_SUFFIXES lex_attr_getters = LEX_ATTRS @@ -26,11 +26,13 @@ class Persian(Language): @Persian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return Lemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Persian"] diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index 62094c6de..a6d79a386 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -639,10 +639,12 @@ for verb_root in verb_roots: ) if past.startswith("آ"): - conjugations = set( - map( - lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"), - conjugations, + conjugations = list( + set( + map( + lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"), + conjugations, + ) ) ) diff --git a/spacy/lang/fa/syntax_iterators.py 
b/spacy/lang/fa/syntax_iterators.py index 0be06e73c..8207884b0 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -1,8 +1,10 @@ +from typing import Union, Iterator, Tuple +from ...tokens import Doc, Span from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(doclike): +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 9233c6547..86a834170 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES -from ...language import Language +from ...language import Language, BaseDefaults -class FinnishDefaults(Language.Defaults): +class FinnishDefaults(BaseDefaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = TOKENIZER_EXCEPTIONS diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 22d710cb0..465333b0a 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH +from ...symbols import ORTH, NORM from ...util import update_exc @@ -79,5 +79,34 @@ for exc_data in [ ]: _exc[exc_data[ORTH]] = [exc_data] +# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141 +conj_contraction_bases = [ + ("ett", "että"), + ("jott", "jotta"), + ("kosk", "koska"), + ("mutt", "mutta"), + ("vaikk", "vaikka"), + ("ehk", "ehkä"), + ("miks", "miksi"), + ("siks", "siksi"), + ("joll", "jos"), + ("ell", "jos"), +] +conj_contraction_negations = [ + ("en", "en"), + ("et", "et"), + ("ei", "ei"), + ("emme", "emme"), + ("ette", "ette"), + ("eivat", "eivät"), + ("eivät", "eivät"), +] +for (base_lower, base_norm) in conj_contraction_bases: + for base in [base_lower, base_lower.title()]: + for (suffix, suffix_norm) in conj_contraction_negations: + _exc[base + suffix] = [ + {ORTH: base, NORM: base_norm}, + {ORTH: suffix, NORM: suffix_norm}, + ] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 1e0011fba..e7267dc61 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -9,10 +9,10 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .lemmatizer import FrenchLemmatizer -from ...language import Language +from ...language import Language, BaseDefaults -class FrenchDefaults(Language.Defaults): +class FrenchDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -31,11 +31,13 @@ class French(Language): @French.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return FrenchLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, 
overwrite=overwrite) __all__ = ["French"] diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index bb5a270ab..c6422cf96 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ab1f2f4a7..b32ee3d71 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,30 +1,31 @@ STOP_WORDS = set( """ a à â abord afin ah ai aie ainsi ait allaient allons -alors anterieur anterieure anterieures apres après as assez attendu au -aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront +alors anterieur anterieure anterieures antérieur antérieure antérieures +apres après as assez attendu au +aupres auquel aura auraient aurait auront aussi autre autrement autres autrui aux auxquelles auxquels avaient avais avait avant avec avoir avons ayant bas basee bat -c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui -celui-ci celui-là cent cependant certain certaine certaines certains certes ces +c' c’ ça car ce ceci cela celle celle-ci celle-la celle-là celles celles-ci celles-la celles-là +celui celui-ci celui-la celui-là cent cependant certain certaine certaines certains certes ces cet cette ceux ceux-ci ceux-là chacun chacune chaque chez ci cinq cinquantaine cinquante cinquantième cinquième combien comme comment compris concernant -d' d’ da dans de debout dedans dehors deja delà depuis derriere +d' d’ da dans de debout dedans dehors deja dejà delà depuis derriere derrière des desormais desquelles desquels dessous dessus deux deuxième -deuxièmement devant devers devra different differentes differents différent +deuxièmement devant devers devra different differente differentes differents différent différente différentes différents dire directe directement dit dite dits divers diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont -douze douzième du duquel durant dès désormais +douze douzième du duquel durant dès déja déjà désormais -effet egale egalement egales eh elle elle-même elles elles-mêmes en encore +effet egalement eh elle elle-meme elle-même elles elles-memes elles-mêmes en encore enfin entre envers environ es ès est et etaient étaient etais étais etait était -etant étant etc été etre être eu eux eux-mêmes exactement excepté +etant étant etc etre être eu eux eux-mêmes exactement excepté également -fais faisaient faisant fait façon feront font +fais faisaient faisant fait facon façon feront font gens @@ -36,45 +37,48 @@ j' j’ je jusqu jusque juste l' l’ la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps lors lorsque lui lui-meme lui-même là lès -m' m’ ma maint maintenant mais malgre me meme memes merci mes mien +m' m’ ma maint maintenant mais malgre malgré me meme memes merci mes mien mienne miennes miens mille moi moi-meme moi-même moindres moins mon même mêmes n' n’ na ne neanmoins neuvième ni nombreuses nombreux nos notamment -notre nous nous-mêmes nouvea nul néanmoins nôtre nôtres +notre nous nous-mêmes nouveau nul néanmoins nôtre nôtres -o ô on ont onze onzième ore ou ouias oust outre +o ô on ont onze onzième or ou ouias ouste outre ouvert ouverte ouverts où -par parce parfois parle parlent parler parmi parseme partant +par parce 
parfois parle parlent parler parmi partant pas pendant pense permet personne peu peut peuvent peux plus -plusieurs plutôt possible possibles pour pourquoi -pourrais pourrait pouvait prealable precisement premier première premièrement -pres procedant proche près pu puis puisque +plusieurs plutot plutôt possible possibles pour pourquoi +pourrais pourrait pouvait prealable precisement +premier première premièrement +pres procedant proche près préalable précisement pu puis puisque -qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt +qu' qu’ quand quant quant-à-soi quarante quatorze quatre quatre-vingt quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque quelques quels qui quiconque quinze quoi quoique relative relativement rend rendre restant reste -restent retour revoici revoilà +restent retour revoici revoila revoilà s' s’ sa sait sans sauf se seize selon semblable semblaient semble semblent sent sept septième sera seraient serait seront ses seul seule -seulement si sien sienne siennes siens sinon six sixième soi soi-même soit -soixante son sont sous souvent specifique specifiques stop +seulement seuls seules si sien sienne siennes siens sinon six sixième soi soi-meme soi-même soit +soixante son sont sous souvent specifique specifiques spécifique spécifiques stop suffisant suffisante suffit suis suit suivant suivante suivantes suivants suivre sur surtout t' t’ ta tant te tel telle tellement telles tels tenant tend tenir tente -tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous -tout toute toutes treize trente tres trois troisième troisièmement +tes tien tienne tiennes tiens toi toi-meme toi-même ton touchant toujours tous +tout toute toutes treize trente tres trois troisième troisièmement très tu té un une unes uns -va vais vas vers via vingt voici voilà vont vos -votre vous vous-mêmes vu vé vôtre vôtres +va vais vas vers via vingt voici voila voilà vont vos +votre votres vous vous-mêmes vu vé vôtre vôtres + +y """.split() ) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 68117a54d..d86662693 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" # fmt: off labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 6f429eecc..2e88b58cf 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -82,7 +82,8 @@ for orth in [ for verb in [ "a", - "est" "semble", + "est", + "semble", "indique", "moque", "passe", @@ -114,7 +115,7 @@ for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]: ] -_infixes_exc = [] +_infixes_exc = [] # type: ignore[var-annotated] orig_elision = "'" orig_hyphen = "-" diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 80131368b..90735d749 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,9 +1,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class IrishDefaults(Language.Defaults): +class IrishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS stop_words = STOP_WORDS diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py new file mode 100644 index 000000000..e83f0c5a5 --- /dev/null +++ b/spacy/lang/grc/__init__.py @@ -0,0 +1,18 @@ +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from ...language import Language, BaseDefaults + + +class AncientGreekDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class AncientGreek(Language): + lang = "grc" + Defaults = AncientGreekDefaults + + +__all__ = ["AncientGreek"] diff --git a/spacy/lang/grc/examples.py b/spacy/lang/grc/examples.py new file mode 100644 index 000000000..9c0bcb265 --- /dev/null +++ b/spacy/lang/grc/examples.py @@ -0,0 +1,17 @@ +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.grc.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·", + "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.", + "ὃ μὲν δὴ ἀπόστολος ἐς τὴν Μίλητον ἦν.", + "Θρασύβουλος δὲ σαφέως προπεπυσμένος πάντα λόγον καὶ εἰδὼς τὰ Ἀλυάττης μέλλοι ποιήσειν μηχανᾶται τοιάδε.", + "φιλόπαις δ' ἦν ἐκμανῶς καὶ Ἀλέξανδρος ὁ βασιλεύς.", + "Ἀντίγονος ὁ βασιλεὺς ἐπεκώμαζε τῷ Ζήνωνι", + "αὐτὰρ ὃ δεύτατος ἦλθεν ἄναξ ἀνδρῶν Ἀγαμέμνων ἕλκος ἔχων", +] diff --git a/spacy/lang/grc/lex_attrs.py b/spacy/lang/grc/lex_attrs.py new file mode 100644 index 000000000..0ab15e6fd --- /dev/null +++ b/spacy/lang/grc/lex_attrs.py @@ -0,0 +1,314 @@ +from ...attrs import LIKE_NUM + + +_num_words = [ + # CARDINALS + "εἷς", + "ἑνός", + "ἑνί", + "ἕνα", + "μία", + "μιᾶς", + "μιᾷ", + "μίαν", + "ἕν", + "δύο", + "δυοῖν", + "τρεῖς", + "τριῶν", + "τρισί", + "τρία", + "τέτταρες", + "τεττάρων", + "τέτταρσι", + "τέτταρα", + "τέτταρας", + "πέντε", + "ἕξ", + "ἑπτά", + "ὀκτώ", + "ἐννέα", + "δέκα", + "ἕνδεκα", + "δώδεκα", + "πεντεκαίδεκα", + "ἑκκαίδεκα", + "ἑπτακαίδεκα", + "ὀκτωκαίδεκα", + "ἐννεακαίδεκα", + "εἴκοσι", + "τριάκοντα", + "τετταράκοντα", + "πεντήκοντα", + "ἑξήκοντα", + "ἑβδομήκοντα", + "ὀγδοήκοντα", + "ἐνενήκοντα", + "ἑκατόν", + "διακόσιοι", + "διακοσίων", + "διακοσιᾶν", + "διακοσίους", + "διακοσίοις", + "διακόσια", + "διακόσιαι", + "διακοσίαις", + "διακοσίαισι", + "διηκόσιοι", + "διηκοσίων", + "διηκοσιέων", + "διακοσίας", + "διηκόσια", + "διηκόσιαι", + "διηκοσίας", + "τριακόσιοι", + "τριακοσίων", + "τριακοσιᾶν", + "τριακοσίους", + "τριακοσίοις", + "τριακόσια", + "τριακόσιαι", + "τριακοσίαις", + "τριακοσίαισι", + "τριακοσιέων", + "τριακοσίας", + "τριηκόσια", + "τριηκοσίας", + "τριηκόσιοι", + "τριηκοσίοισιν", + "τριηκοσίους", + "τριηκοσίων", + "τετρακόσιοι", + "τετρακοσίων", + "τετρακοσιᾶν", + "τετρακοσίους", + "τετρακοσίοις", + "τετρακόσια", + "τετρακόσιαι", + "τετρακοσίαις", + "τετρακοσίαισι", + "τετρακοσιέων", + "τετρακοσίας", + "πεντακόσιοι", + "πεντακοσίων", + "πεντακοσιᾶν", + "πεντακοσίους", + "πεντακοσίοις", + "πεντακόσια", + "πεντακόσιαι", + "πεντακοσίαις", + "πεντακοσίαισι", + "πεντακοσιέων", + "πεντακοσίας", + "ἑξακόσιοι", + "ἑξακοσίων", + "ἑξακοσιᾶν", + "ἑξακοσίους", + "ἑξακοσίοις", + "ἑξακόσια", + "ἑξακόσιαι", + "ἑξακοσίαις", + "ἑξακοσίαισι", + "ἑξακοσιέων", + "ἑξακοσίας", + "ἑπτακόσιοι", + "ἑπτακοσίων", + "ἑπτακοσιᾶν", + "ἑπτακοσίους", + "ἑπτακοσίοις", + "ἑπτακόσια", + "ἑπτακόσιαι", + "ἑπτακοσίαις", + "ἑπτακοσίαισι", + "ἑπτακοσιέων", + "ἑπτακοσίας", + "ὀκτακόσιοι", + "ὀκτακοσίων", + "ὀκτακοσιᾶν", + "ὀκτακοσίους", + "ὀκτακοσίοις", + "ὀκτακόσια", + "ὀκτακόσιαι", + "ὀκτακοσίαις", + "ὀκτακοσίαισι", + "ὀκτακοσιέων", + "ὀκτακοσίας", + "ἐνακόσιοι", + "ἐνακοσίων", + "ἐνακοσιᾶν", + "ἐνακοσίους", + "ἐνακοσίοις", + "ἐνακόσια", + "ἐνακόσιαι", + "ἐνακοσίαις", + "ἐνακοσίαισι", + "ἐνακοσιέων", + "ἐνακοσίας", + "χίλιοι", + "χιλίων", + "χιλιῶν", + "χιλίους", + "χιλίοις", + "χίλιαι", + "χιλίας", + "χιλίαις", + "χίλια", + "χίλι", + "δισχίλιοι", + "δισχιλίων", + "δισχιλιῶν", + "δισχιλίους", + "δισχιλίοις", + "δισχίλιαι", + "δισχιλίας", + "δισχιλίαις", + "δισχίλια", + "δισχίλι", + "τρισχίλιοι", + "τρισχιλίων", + "τρισχιλιῶν", + "τρισχιλίους", + "τρισχιλίοις", + "τρισχίλιαι", + "τρισχιλίας", + "τρισχιλίαις", + "τρισχίλια", + "τρισχίλι", + "μύριοι", + "μύριοί", + "μυρίων", + "μυρίοις", + "μυρίους", + "μύριαι", + "μυρίαις", + "μυρίας", + "μύρια", + "δισμύριοι", + "δισμύριοί", + "δισμυρίων", + 
"δισμυρίοις", + "δισμυρίους", + "δισμύριαι", + "δισμυρίαις", + "δισμυρίας", + "δισμύρια", + "δεκακισμύριοι", + "δεκακισμύριοί", + "δεκακισμυρίων", + "δεκακισμυρίοις", + "δεκακισμυρίους", + "δεκακισμύριαι", + "δεκακισμυρίαις", + "δεκακισμυρίας", + "δεκακισμύρια", + # ANCIENT GREEK NUMBERS (1-100) + "α", + "β", + "γ", + "δ", + "ε", + "ϛ", + "ζ", + "η", + "θ", + "ι", + "ια", + "ιβ", + "ιγ", + "ιδ", + "ιε", + "ιϛ", + "ιζ", + "ιη", + "ιθ", + "κ", + "κα", + "κβ", + "κγ", + "κδ", + "κε", + "κϛ", + "κζ", + "κη", + "κθ", + "λ", + "λα", + "λβ", + "λγ", + "λδ", + "λε", + "λϛ", + "λζ", + "λη", + "λθ", + "μ", + "μα", + "μβ", + "μγ", + "μδ", + "με", + "μϛ", + "μζ", + "μη", + "μθ", + "ν", + "να", + "νβ", + "νγ", + "νδ", + "νε", + "νϛ", + "νζ", + "νη", + "νθ", + "ξ", + "ξα", + "ξβ", + "ξγ", + "ξδ", + "ξε", + "ξϛ", + "ξζ", + "ξη", + "ξθ", + "ο", + "οα", + "οβ", + "ογ", + "οδ", + "οε", + "οϛ", + "οζ", + "οη", + "οθ", + "π", + "πα", + "πβ", + "πγ", + "πδ", + "πε", + "πϛ", + "πζ", + "πη", + "πθ", + "ϟ", + "ϟα", + "ϟβ", + "ϟγ", + "ϟδ", + "ϟε", + "ϟϛ", + "ϟζ", + "ϟη", + "ϟθ", + "ρ", +] + + +def like_num(text): + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/grc/stop_words.py b/spacy/lang/grc/stop_words.py new file mode 100644 index 000000000..cbb766a8c --- /dev/null +++ b/spacy/lang/grc/stop_words.py @@ -0,0 +1,61 @@ +STOP_WORDS = set( + """ +αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ +αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς +αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν + +γε γ' γέ γὰρ γάρ δαῖτα δαιτὸς δαιτὶ δαὶ δαιτί δαῖτ' δαΐδας δαΐδων δἰ διὰ διά δὲ δ' δέ δὴ δή εἰ εἴ κεἰ κεἴ αἴ αἲ εἲ αἰ + +ἐστί ἐστιν ὢν ἦν ἐστὶν ὦσιν εἶναι ὄντι εἰσιν ἐστι ὄντα οὖσαν ἦσαν ἔστι ὄντας ἐστὲ εἰσὶ εἶ ὤν ἦ οὖσαι ἔσται ἐσμὲν ἐστ' ἐστίν ἔστ' ὦ ἔσει ἦμεν εἰμι εἰσὶν ἦσθ' +ἐστὶ ᾖ οὖσ' ἔστιν εἰμὶ εἴμ' ἐσθ' ᾖς στί εἴην εἶναί οὖσα κἄστ' εἴη ἦσθα εἰμ' ἔστω ὄντ' ἔσθ' ἔμμεναι ἔω ἐὼν ἐσσι ἔσσεται ἐστὸν ἔσαν ἔστων ἐόντα ἦεν ἐοῦσαν ἔην +ἔσσομαι εἰσί ἐστόν ἔσκεν ἐόντ' ἐών ἔσσεσθ' εἰσ' ἐόντες ἐόντε ἐσσεῖται εἰμεν ἔασιν ἔσκε ἔμεναι ἔσεσθαι ἔῃ εἰμὲν εἰσι ἐόντας ἔστε εἰς ἦτε εἰμί ἔσσεαι ἔμμεν +ἐοῦσα ἔμεν ᾖσιν ἐστε ἐόντι εἶεν ἔσσονται ἔησθα ἔσεσθε ἐσσί ἐοῦσ' ἔασι ἔα ἦα ἐόν ἔσσεσθαι ἔσομαι ἔσκον εἴης ἔωσιν εἴησαν ἐὸν ἐουσέων ἔσσῃ ἐούσης ἔσονται +ἐούσας ἐόντων ἐόντος ἐσομένην ἔστωσαν ἔωσι ἔας ἐοῦσαι ἣν εἰσίν ἤστην ὄντες ὄντων οὔσας οὔσαις ὄντος οὖσι οὔσης ἔσῃ ὂν ἐσμεν ἐσμέν οὖσιν ἐσομένους ἐσσόμεσθα + +ἒς ἐς ἔς ἐν κεἰς εἲς κἀν ἔν κατὰ κατ' καθ' κατά κάτα κὰπ κὰκ κὰδ κὰρ κάρ κὰγ κὰμ καὶ καί μετὰ μεθ' μετ' μέτα μετά μέθ' μέτ' μὲν μέν μὴ + +μή μη οὐκ οὒ οὐ οὐχ οὐχὶ κοὐ κοὐχ οὔ κοὐκ οὐχί οὐκὶ οὐδὲν οὐδεὶς οὐδέν κοὐδεὶς κοὐδὲν οὐδένα οὐδενὸς οὐδέν' οὐδενός οὐδενὶ +οὐδεμία οὐδείς οὐδεμίαν οὐδὲ οὐδ' κοὐδ' οὐδέ οὔτε οὔθ' οὔτέ τε οὔτ' οὕτως οὕτω οὕτῶ χοὔτως οὖν ὦν ὧν τοῦτο τοῦθ' τοῦτον τούτῳ +τούτοις ταύτας αὕτη ταῦτα οὗτος ταύτης ταύτην τούτων ταῦτ' τοῦτ' τούτου αὗται τούτους τοῦτό ταῦτά τούτοισι χαὔτη ταῦθ' χοὖτοι +τούτοισιν οὗτός οὗτοι τούτω τουτέων τοῦτὸν οὗτοί τοῦτου οὗτοὶ ταύτῃσι ταύταις ταυτὶ παρὰ παρ' πάρα παρά πὰρ παραὶ πάρ' περὶ +πέρι περί πρὸς πρός ποτ' ποτὶ προτὶ προτί πότι + +σὸς σήν σὴν σὸν σόν σὰ σῶν σοῖσιν σός σῆς σῷ σαῖς σῇ σοῖς σοῦ σ' σὰν σά σὴ σὰς +σᾷ σοὺς σούς σοῖσι σῇς σῇσι σή σῇσιν σοὶ σου ὑμεῖς σὲ σύ σοι ὑμᾶς ὑμῶν ὑμῖν σε +σέ σὺ σέθεν σοί ὑμὶν σφῷν ὑμίν τοι τοὶ σφὼ ὔμμ' σφῶϊ σεῖο τ' σφῶϊν 
ὔμμιν σέο σευ σεῦ +ὔμμι ὑμέων τύνη ὑμείων τοί ὔμμες σεο τέ τεοῖο ὑμέας σὺν ξὺν σύν + +θ' τί τι τις τινες τινα τινος τινὸς τινὶ τινῶν τίς τίνες τινὰς τιν' τῳ του τίνα τοῦ τῷ τινί τινά τίνος τινι τινας τινὰ τινων +τίν' τευ τέο τινές τεο τινὲς τεῷ τέῳ τινός τεῳ τισὶ + +τοιαῦτα τοιοῦτον τοιοῦθ' τοιοῦτος τοιαύτην τοιαῦτ' τοιούτου τοιαῦθ' τοιαύτῃ τοιούτοις τοιαῦται τοιαῦτά τοιαύτη τοιοῦτοι τοιούτων τοιούτοισι +τοιοῦτο τοιούτους τοιούτῳ τοιαύτης τοιαύταις τοιαύτας τοιοῦτός τίνι τοῖσι τίνων τέων τέοισί τὰ τῇ τώ τὼ + +ἀλλὰ ἀλλ' ἀλλά ἀπ' ἀπὸ κἀπ' ἀφ' τἀπὸ κἀφ' ἄπο ἀπό τὠπὸ τἀπ' ἄλλων ἄλλῳ ἄλλη ἄλλης ἄλλους ἄλλοις ἄλλον ἄλλο ἄλλου τἄλλα ἄλλα +ἄλλᾳ ἄλλοισιν τἄλλ' ἄλλ' ἄλλος ἄλλοισι κἄλλ' ἄλλοι ἄλλῃσι ἄλλόν ἄλλην ἄλλά ἄλλαι ἄλλοισίν ὧλλοι ἄλλῃ ἄλλας ἀλλέων τἆλλα ἄλλως +ἀλλάων ἄλλαις τἆλλ' + +ἂν ἄν κἂν τἂν ἃν κεν κ' κέν κέ κε χ' ἄρα τἄρα ἄρ' τἄρ' ἄρ ῥα ῥά ῥ τὰρ ἄρά ἂρ + +ἡμᾶς με ἐγὼ ἐμὲ μοι κἀγὼ ἡμῶν ἡμεῖς ἐμοὶ ἔγωγ' ἁμοὶ ἡμῖν μ' ἔγωγέ ἐγώ ἐμοί ἐμοῦ κἀμοῦ ἔμ' κἀμὲ ἡμὶν μου ἐμέ ἔγωγε νῷν νὼ χἠμεῖς ἁμὲ κἀγώ κἀμοὶ χἠμᾶς +ἁγὼ ἡμίν κἄμ' ἔμοιγ' μοί τοὐμὲ ἄμμε ἐγὼν ἐμεῦ ἐμεῖο μευ ἔμοιγε ἄμμι μέ ἡμέας νῶϊ ἄμμιν ἧμιν ἐγών νῶΐ ἐμέθεν ἥμιν ἄμμες νῶι ἡμείων ἄμμ' ἡμέων ἐμέο +ἐκ ἔκ ἐξ κἀκ κ ἃκ κἀξ ἔξ εξ Ἐκ τἀμὰ ἐμοῖς τοὐμόν ἐμᾶς τοὐμὸν ἐμῶν ἐμὸς ἐμῆς ἐμῷ τὠμῷ ἐμὸν τἄμ' ἐμὴ ἐμὰς ἐμαῖς ἐμὴν ἐμόν ἐμὰ ἐμός ἐμοὺς ἐμῇ ἐμᾷ +οὑμὸς ἐμοῖν οὑμός κἀμὸν ἐμαὶ ἐμή ἐμάς ἐμοῖσι ἐμοῖσιν ἐμῇσιν ἐμῇσι ἐμῇς ἐμήν + +ἔνι ἐνὶ εἰνὶ εἰν ἐμ ἐπὶ ἐπ' ἔπι ἐφ' κἀπὶ τἀπὶ ἐπί ἔφ' ἔπ' ἐὰν ἢν ἐάν ἤν ἄνπερ + +αὑτοῖς αὑτὸν αὑτῷ ἑαυτοῦ αὑτόν αὑτῆς αὑτῶν αὑτοῦ αὑτὴν αὑτοῖν χαὐτοῦ αὑταῖς ἑωυτοῦ ἑωυτῇ ἑωυτὸν ἐωυτῷ ἑωυτῆς ἑωυτόν ἑωυτῷ +ἑωυτάς ἑωυτῶν ἑωυτοὺς ἑωυτοῖσι ἑαυτῇ ἑαυτούς αὑτοὺς ἑαυτῶν ἑαυτοὺς ἑαυτὸν ἑαυτῷ ἑαυτοῖς ἑαυτὴν ἑαυτῆς + +ἔτι ἔτ' ἔθ' κἄτι ἢ ἤ ἠέ ἠὲ ἦε ἦέ ἡ τοὺς τὴν τὸ τῶν τὸν ὁ ἁ οἱ τοῖς ταῖς τῆς τὰς αἱ τό τὰν τᾶς τοῖσιν αἳ χὠ τήν τά τοῖν τάς ὅ +χοἰ ἣ ἥ χἠ τάν τᾶν ὃ οἳ οἵ τοῖο τόν τοῖιν τούς τάων ταὶ τῇς τῇσι τῇσιν αἵ τοῖό τοῖσίν ὅττί ταί Τὴν τῆ τῶ τάδε ὅδε τοῦδε τόδε τόνδ' +τάδ' τῆσδε τῷδε ὅδ' τῶνδ' τῇδ' τοῦδέ τῶνδε τόνδε τόδ' τοῦδ' τάσδε τήνδε τάσδ' τήνδ' ταῖσδέ τῇδε τῆσδ' τάνδ' τῷδ' τάνδε ἅδε τοῖσδ' ἥδ' +τᾷδέ τοῖσδε τούσδ' ἥδε τούσδε τώδ' ἅδ' οἵδ' τῶνδέ οἵδε τᾷδε τοῖσδεσσι τώδε τῇδέ τοῖσιδε αἵδε τοῦδὲ τῆδ' αἵδ' τοῖσδεσι ὃν ἃ ὃς ᾧ οὗ ἅπερ +οὓς ἧς οἷς ἅσπερ ᾗ ἅ χὦνπερ ὣ αἷς ᾇ ὅς ἥπερ ἃς ὅσπερ ὅνπερ ὧνπερ ᾧπερ ὅν αἷν οἷσι ἇς ἅς ὥ οὕς ἥν οἷσιν ἕης ὅου ᾗς οἷσί οἷσίν τοῖσί ᾗσιν οἵπερ αἷσπερ +ὅστις ἥτις ὅτου ὅτοισι ἥντιν' ὅτῳ ὅντιν' ὅττι ἅσσά ὅτεῳ ὅτις ὅτιν' ὅτευ ἥντινα αἵτινές ὅντινα ἅσσα ᾧτινι οἵτινες ὅτι ἅτις ὅτ' ὑμὴ +ὑμήν ὑμὸν ὑπὲρ ὕπερ ὑπέρτερον ὑπεὶρ ὑπέρτατος ὑπὸ ὑπ' ὑφ' ὕπο ὑπαὶ ὑπό ὕπ' ὕφ' + + ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ + + """.split() +) diff --git a/spacy/lang/grc/tokenizer_exceptions.py b/spacy/lang/grc/tokenizer_exceptions.py new file mode 100644 index 000000000..bcee70f32 --- /dev/null +++ b/spacy/lang/grc/tokenizer_exceptions.py @@ -0,0 +1,111 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...symbols import ORTH, NORM +from ...util import update_exc + +_exc = {} + +for token in ["᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ"]: + _exc[token] = [{ORTH: token, NORM: "από"}] + +for token in ["᾽Αλλ'", "ἀλλ'", "ἀλλὰ"]: + _exc[token] = [{ORTH: token, NORM: "ἀλλά"}] + +for token in ["παρ'", "Παρ'", "παρὰ", "παρ"]: + _exc[token] = [{ORTH: token, NORM: "παρά"}] + +for token in ["καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ"]: + _exc[token] = [{ORTH: token, NORM: "κατά"}] + +for token in ["Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'"]: + _exc[token] = [{ORTH: token, NORM: "επί"}] + +for token in ["Δι'", "δι'", "διὰ"]: + _exc[token] = [{ORTH: token, NORM: 
"διά"}] + +for token in ["Ὑπ'", "ὑπ'", "ὑφ'"]: + _exc[token] = [{ORTH: token, NORM: "ὑπό"}] + +for token in ["Μετ'", "μετ'", "μεθ'", "μετὰ"]: + _exc[token] = [{ORTH: token, NORM: "μετά"}] + +for token in ["Μ'", "μ'", "μέ", "μὲ"]: + _exc[token] = [{ORTH: token, NORM: "με"}] + +for token in ["Σ'", "σ'", "σέ", "σὲ"]: + _exc[token] = [{ORTH: token, NORM: "σε"}] + +for token in ["Τ'", "τ'", "τέ", "τὲ"]: + _exc[token] = [{ORTH: token, NORM: "τε"}] + +for token in ["Δ'", "δ'", "δὲ"]: + _exc[token] = [{ORTH: token, NORM: "δέ"}] + + +_other_exc = { + "μὲν": [{ORTH: "μὲν", NORM: "μέν"}], + "μὴν": [{ORTH: "μὴν", NORM: "μήν"}], + "τὴν": [{ORTH: "τὴν", NORM: "τήν"}], + "τὸν": [{ORTH: "τὸν", NORM: "τόν"}], + "καὶ": [{ORTH: "καὶ", NORM: "καί"}], + "καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}], + "καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}], + "κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}], + "χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἡ"}], + "χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}], + "χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}], + "κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}], + "κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}], + "κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}], + "ἁγώ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γώ", NORM: "ἐγώ"}], + "ἁγὼ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γὼ", NORM: "ἐγώ"}], + "ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}], + "ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}], + "κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}], + "κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}], + "κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}], + "κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}], + "χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}], + "χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}], + "χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}], + "χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}], + "οὑμός": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μός", NORM: "ἐμός"}], + "οὑμὸς": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μὸς", NORM: "ἐμός"}], + "οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμoί"}], + "οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμoί"}], + "σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}], + "σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}], + "σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}], + "μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}], + "μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}], + "τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}], + "οὑν": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "ν", NORM: "ἐν"}], + "ὦνερ": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νερ", NORM: "ἄνερ"}], + "ὦνδρες": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}], + "προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}], + "προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}], + "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}], + "θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}], + "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}], + "τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}], + "θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}], + "ἅνθρωπος": [{ORTH: "ἅ", NORM: "ὁ"}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}], + "τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: 
"ἄλλα"}], + "τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}], + "ἁνήρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νήρ", NORM: "ἀνήρ"}], + "ἁνὴρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νὴρ", NORM: "ἀνήρ"}], + "ἅνδρες": [{ORTH: "ἅ", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}], + "ἁγαθαί": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}], + "ἁγαθαὶ": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}], + "ἁλήθεια": [{ORTH: "ἁ", NORM: "ἡ"}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}], + "τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}], + "τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}], + "τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}], + "τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}], + "αὑτός": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τός", NORM: "αὐτός"}], + "αὑτὸς": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τὸς", NORM: "αὐτός"}], + "ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}], +} + +_exc.update(_other_exc) + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py index 67228ac40..e6fbc9d18 100644 --- a/spacy/lang/gu/__init__.py +++ b/spacy/lang/gu/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class GujaratiDefaults(Language.Defaults): +class GujaratiDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index e0adc3293..dd2ee478d 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class HebrewDefaults(Language.Defaults): +class HebrewDefaults(BaseDefaults): stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py index 384f040c8..4c8ae446d 100644 --- a/spacy/lang/hi/__init__.py +++ b/spacy/lang/hi/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class HindiDefaults(Language.Defaults): +class HindiDefaults(BaseDefaults): stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index 118e0946a..30870b522 100644 --- a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class CroatianDefaults(Language.Defaults): +class CroatianDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 8962603a6..9426bacea 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -1,10 +1,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class HungarianDefaults(Language.Defaults): +class HungarianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/hy/__init__.py 
b/spacy/lang/hy/__init__.py index 4577ab641..481eaae0a 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class ArmenianDefaults(Language.Defaults): +class ArmenianDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index 87373551c..0d72cfa9d 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIX from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from ...language import Language +from ...language import Language, BaseDefaults -class IndonesianDefaults(Language.Defaults): +class IndonesianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 0f29bfe16..fa984d411 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py index be5de5981..318363beb 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/is/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class IcelandicDefaults(Language.Defaults): +class IcelandicDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 25cbaa651..863ed8e2f 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,10 +1,14 @@ +from typing import Optional +from thinc.api import Model + from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES -from ...language import Language +from ...language import Language, BaseDefaults +from .lemmatizer import ItalianLemmatizer -class ItalianDefaults(Language.Defaults): +class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES @@ -16,4 +20,16 @@ class Italian(Language): Defaults = ItalianDefaults +@Italian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Italian"] diff --git a/spacy/lang/it/lemmatizer.py b/spacy/lang/it/lemmatizer.py new file mode 100644 index 000000000..e44e64e3a --- /dev/null +++ b/spacy/lang/it/lemmatizer.py @@ -0,0 +1,132 @@ +from typing import List, Dict, Tuple + +from ...pipeline import 
Lemmatizer +from ...tokens import Token + + +class ItalianLemmatizer(Lemmatizer): + """This lemmatizer was adapted from the Polish one (version of April 2021). + It implements lookup lemmatization based on the morphological lexicon + morph-it (Baroni and Zanchetta). The table lemma_lookup with non-POS-aware + entries is used as a backup for words that aren't handled by morph-it.""" + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "pos_lookup": + required = [ + "lemma_lookup_num", + "lemma_lookup_det", + "lemma_lookup_adp", + "lemma_lookup_adj", + "lemma_lookup_noun", + "lemma_lookup_pron", + "lemma_lookup_verb", + "lemma_lookup_aux", + "lemma_lookup_adv", + "lemma_lookup_other", + "lemma_lookup", + ] + return (required, []) + else: + return super().get_lookups_config(mode) + + def pos_lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + elif univ_pos == "PART": + lookup_pos = "pron" + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + if univ_pos == "NOUN": + return self.lemmatize_noun(string, morphology, lookup_table) + else: + if univ_pos != "PROPN": + string = string.lower() + if univ_pos == "DET": + return self.lemmatize_det(string, morphology, lookup_table) + elif univ_pos == "PRON": + return self.lemmatize_pron(string, morphology, lookup_table) + elif univ_pos == "ADP": + return self.lemmatize_adp(string, morphology, lookup_table) + elif univ_pos == "ADJ": + return self.lemmatize_adj(string, morphology, lookup_table) + else: + lemma = lookup_table.get(string, "") + if not lemma: + lookup_table = self.lookups.get_table("lemma_lookup_other") + lemma = lookup_table.get(string, "") + if not lemma: + lookup_table = self.lookups.get_table( + "lemma_lookup" + ) # "legacy" lookup table + lemma = lookup_table.get(string, string.lower()) + return [lemma] + + def lemmatize_det( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + if string in [ + "l'", + "lo", + "la", + "i", + "gli", + "le", + ]: + return ["il"] + if string in ["un'", "un", "una"]: + return ["uno"] + return [lookup_table.get(string, string)] + + def lemmatize_pron( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + if string in [ + "l'", + "li", + "la", + "gli", + "le", + ]: + return ["lo"] + if string in ["un'", "un", "una"]: + return ["uno"] + lemma = lookup_table.get(string, string) + if lemma == "alcun": + lemma = "alcuno" + elif lemma == "qualcun": + lemma = "qualcuno" + return [lemma] + + def lemmatize_adp( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + if string == "d'": + return ["di"] + return [lookup_table.get(string, string)] + + def lemmatize_adj( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + lemma = lookup_table.get(string, string) + if lemma == "alcun": + lemma = "alcuno" + elif lemma == "qualcun": + lemma = "qualcuno" + return [lemma] + + def lemmatize_noun( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + # this method is case-sensitive, in order to work + # for incorrectly tagged proper names + if string != string.lower(): + if string.lower() in lookup_table: + return [lookup_table[string.lower()]] + elif string in lookup_table: + return [lookup_table[string]] + return [string.lower()] 
+ return [lookup_table.get(string, string)] diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index e97613912..4178ed452 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua subito successivamente successivo sue sugl sugli sui sul sull sulla sulle sullo suo suoi -tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta +tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto uguali ulteriore ultimo un una uno uomo diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 0c9968bc6..42883863b 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -17,14 +17,19 @@ _exc = { for orth in [ "..", "....", + "a.C.", "al.", "all-path", "art.", "Art.", "artt.", "att.", + "avv.", + "Avv.", "by-pass", "c.d.", + "c/c", + "C.so", "centro-sinistra", "check-up", "Civ.", @@ -48,6 +53,8 @@ for orth in [ "prof.", "sett.", "s.p.a.", + "s.n.c", + "s.r.l", "ss.", "St.", "tel.", diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 4e6bf9d3c..8499fc73e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -10,7 +10,7 @@ from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP from ...compat import copy_reg from ...errors import Errors -from ...language import Language +from ...language import Language, BaseDefaults from ...scorer import Scorer from ...symbols import POS from ...tokens import Doc @@ -154,7 +154,7 @@ class JapaneseTokenizer(DummyTokenizer): def to_disk(self, path: Union[str, Path], **kwargs) -> None: path = util.ensure_path(path) serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())} - return util.to_disk(path, serializers, []) + util.to_disk(path, serializers, []) def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer": path = util.ensure_path(path) @@ -164,7 +164,7 @@ class JapaneseTokenizer(DummyTokenizer): return self -class JapaneseDefaults(Language.Defaults): +class JapaneseDefaults(BaseDefaults): config = load_config_from_str(DEFAULT_CONFIG) stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py index cca4902ab..588a9ba03 100644 --- a/spacy/lang/ja/syntax_iterators.py +++ b/spacy/lang/ja/syntax_iterators.py @@ -1,4 +1,4 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple, Set from ...symbols import NOUN, PROPN, PRON, VERB from ...tokens import Doc, Span @@ -10,13 +10,13 @@ labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclik # fmt: on -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" doc = doclike.doc # Ensure works on both Doc and Span. 
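The syntax-iterator signatures in this diff change from yielding Span objects to yielding (start, end, label) token-offset tuples, with Doc.noun_chunks expected to build the spans from those offsets. A deliberately simplified custom iterator following the same convention (this is a sketch, not the per-language logic shown in the hunks):

from typing import Iterator, Tuple, Union

from spacy.symbols import NOUN, PROPN, PRON
from spacy.tokens import Doc, Span


def simple_noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    doc = doclike.doc  # works on both Doc and Span
    np_label = doc.vocab.strings.add("NP")
    for word in doclike:
        if word.pos in (NOUN, PROPN, PRON):
            # One single-token chunk per noun-like token.
            yield word.i, word.i + 1, np_label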
np_deps = [doc.vocab.strings.add(label) for label in labels] doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + seen: Set[int] = set() for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py index 8e53989e6..ccd46a394 100644 --- a/spacy/lang/kn/__init__.py +++ b/spacy/lang/kn/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class KannadaDefaults(Language.Defaults): +class KannadaDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 83c9f4962..dfb311136 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,9 +1,9 @@ -from typing import Optional, Any, Dict +from typing import Iterator, Any, Dict from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults from ...tokens import Doc from ...compat import copy_reg from ...scorer import Scorer @@ -29,9 +29,9 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): - def __init__(self, nlp: Optional[Language] = None): + def __init__(self, nlp: Language): self.vocab = nlp.vocab - MeCab = try_mecab_import() + MeCab = try_mecab_import() # type: ignore[func-returns-value] self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") def __del__(self): @@ -49,7 +49,7 @@ class KoreanTokenizer(DummyTokenizer): doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens] return doc - def detailed_tokens(self, text: str) -> Dict[str, Any]: + def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]: # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * for node in self.mecab_tokenizer.parse(text, as_nodes=True): @@ -68,7 +68,7 @@ class KoreanTokenizer(DummyTokenizer): return Scorer.score_tokenization(examples) -class KoreanDefaults(Language.Defaults): +class KoreanDefaults(BaseDefaults): config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py new file mode 100644 index 000000000..ccca384bd --- /dev/null +++ b/spacy/lang/ky/__init__.py @@ -0,0 +1,20 @@ +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ...language import Language, BaseDefaults + + +class KyrgyzDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class Kyrgyz(Language): + lang = "ky" + Defaults = KyrgyzDefaults + + +__all__ = ["Kyrgyz"] diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py new file mode 100644 index 000000000..ba77ea975 --- /dev/null +++ b/spacy/lang/ky/examples.py @@ -0,0 +1,16 @@ +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.ky.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", + "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", + "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", + "Лондон - Улуу Британияда жайгашкан ири шаар.", + "Кайдасың?", + "Франциянын президенти ким?", + "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", + "Барак Обама качан төрөлгөн?", +] diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py new file mode 100644 index 000000000..bdf993482 --- /dev/null +++ b/spacy/lang/ky/lex_attrs.py @@ -0,0 +1,48 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "нөл", + "ноль", + "бир", + "эки", + "үч", + "төрт", + "беш", + "алты", + "жети", + "сегиз", + "тогуз", + "он", + "жыйырма", + "отуз", + "кырк", + "элүү", + "алтымыш", + "жетмиш", + "сексен", + "токсон", + "жүз", + "миң", + "миллион", + "миллиард", + "триллион", + "триллиард", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py new file mode 100644 index 000000000..fa9819f80 --- /dev/null +++ b/spacy/lang/ky/punctuation.py @@ -0,0 +1,21 @@ +from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS + +_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "") +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA), + r"(?<=[0-9])-(?=[0-9])", + ] +) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py new file mode 100644 index 000000000..ea40bdfa2 --- /dev/null +++ b/spacy/lang/ky/stop_words.py @@ -0,0 +1,42 @@ +STOP_WORDS = set( + """ +ага адам айтты айтымында айтып ал алар +алардын алган алуу алып анда андан аны +анын ар + +бар басма баш башка башкы башчысы берген +биз билдирген билдирди бир биринчи бирок +бишкек болгон болот болсо болуп боюнча +буга бул + +гана + +да дагы деген деди деп + +жана жатат жаткан жаңы же жогорку жок жол +жолу + +кабыл калган кандай карата каршы катары +келген керек кийин кол кылмыш кыргыз +күнү көп + +маалымат мамлекеттик мен менен миң +мурдагы мыйзам мындай мүмкүн + +ошол ошондой + +сүрөт сөз + +тарабынан турган тууралуу + +укук учурда + +чейин чек + +экенин эки эл эле эмес эми эч + +үч үчүн + +өз +""".split() +) diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py new file mode 100644 index 000000000..8ec727ac1 --- /dev/null +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -0,0 +1,53 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...symbols import ORTH, NORM +from ...util import 
update_exc + +_exc = {} + +_abbrev_exc = [ + # Weekdays abbreviations + {ORTH: "дүй", NORM: "дүйшөмбү"}, + {ORTH: "шей", NORM: "шейшемби"}, + {ORTH: "шар", NORM: "шаршемби"}, + {ORTH: "бей", NORM: "бейшемби"}, + {ORTH: "жум", NORM: "жума"}, + {ORTH: "ишм", NORM: "ишемби"}, + {ORTH: "жек", NORM: "жекшемби"}, + # Months abbreviations + {ORTH: "янв", NORM: "январь"}, + {ORTH: "фев", NORM: "февраль"}, + {ORTH: "мар", NORM: "март"}, + {ORTH: "апр", NORM: "апрель"}, + {ORTH: "июн", NORM: "июнь"}, + {ORTH: "июл", NORM: "июль"}, + {ORTH: "авг", NORM: "август"}, + {ORTH: "сен", NORM: "сентябрь"}, + {ORTH: "окт", NORM: "октябрь"}, + {ORTH: "ноя", NORM: "ноябрь"}, + {ORTH: "дек", NORM: "декабрь"}, + # Number abbreviations + {ORTH: "млрд", NORM: "миллиард"}, + {ORTH: "млн", NORM: "миллион"}, +] + +for abbr in _abbrev_exc: + for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): + _exc[orth] = [{ORTH: orth, NORM: abbr[NORM]}] + _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbr[NORM]}] + +for exc_data in [ # "etc." abbreviations + {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, + {ORTH: "ж.б.", NORM: "жана башка"}, + {ORTH: "ж.", NORM: "жыл"}, + {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"}, + {ORTH: "б.з.", NORM: "биздин заман"}, + {ORTH: "кк.", NORM: "кылымдар"}, + {ORTH: "жж.", NORM: "жылдар"}, + {ORTH: "к.", NORM: "кылым"}, + {ORTH: "көч.", NORM: "көчөсү"}, + {ORTH: "м-н", NORM: "менен"}, + {ORTH: "б-ча", NORM: "боюнча"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index da6fe55d7..7827e7762 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class LuxembourgishDefaults(Language.Defaults): +class LuxembourgishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py index 5ae280324..b7e11f77e 100644 --- a/spacy/lang/lij/__init__.py +++ b/spacy/lang/lij/__init__.py @@ -1,10 +1,10 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES -from ...language import Language +from ...language import Language, BaseDefaults -class LigurianDefaults(Language.Defaults): +class LigurianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES stop_words = STOP_WORDS diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index e395a8f62..3ae000e5f 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -2,10 +2,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class LithuanianDefaults(Language.Defaults): +class LithuanianDefaults(BaseDefaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = TOKENIZER_EXCEPTIONS diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py index 506aa8f32..22aee0941 100644 --- a/spacy/lang/lt/punctuation.py +++ 
b/spacy/lang/lt/punctuation.py @@ -19,7 +19,7 @@ _infixes = ( ) -_suffixes = ["\."] + list(TOKENIZER_SUFFIXES) +_suffixes = [r"\."] + list(TOKENIZER_SUFFIXES) TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py index 142bc706e..a05e5b939 100644 --- a/spacy/lang/lv/__init__.py +++ b/spacy/lang/lv/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class LatvianDefaults(Language.Defaults): +class LatvianDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py index ef2670b4a..376afb552 100644 --- a/spacy/lang/mk/__init__.py +++ b/spacy/lang/mk/__init__.py @@ -6,13 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...language import Language +from ...language import Language, BaseDefaults from ...attrs import LANG from ...util import update_exc from ...lookups import Lookups -class MacedonianDefaults(Language.Defaults): +class MacedonianDefaults(BaseDefaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "mk" @@ -38,11 +38,13 @@ class Macedonian(Language): @Macedonian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Macedonian"] diff --git a/spacy/lang/mk/lemmatizer.py b/spacy/lang/mk/lemmatizer.py index ce3e73b7a..a792095e7 100644 --- a/spacy/lang/mk/lemmatizer.py +++ b/spacy/lang/mk/lemmatizer.py @@ -9,7 +9,6 @@ class MacedonianLemmatizer(Lemmatizer): def rule_lemmatize(self, token: Token) -> List[str]: string = token.text univ_pos = token.pos_.lower() - morphology = token.morph.to_dict() if univ_pos in ("", "eol", "space"): return [string.lower()] diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py index cfad52261..9f90605f0 100644 --- a/spacy/lang/ml/__init__.py +++ b/spacy/lang/ml/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class MalayalamDefaults(Language.Defaults): +class MalayalamDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py index af0c49878..3e172fa60 100644 --- a/spacy/lang/mr/__init__.py +++ b/spacy/lang/mr/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class MarathiDefaults(Language.Defaults): +class MarathiDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 62d7707f3..e27754e55 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -5,11 +5,11 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from 
.syntax_iterators import SYNTAX_ITERATORS -from ...language import Language +from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer -class NorwegianDefaults(Language.Defaults): +class NorwegianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -26,11 +26,13 @@ class Norwegian(Language): @Norwegian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return Lemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Norwegian"] diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index 9b800029c..8f2933670 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -27,7 +27,7 @@ _infixes = ( + [ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), - r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 68117a54d..d86662693 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" # fmt: off labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py index 68632e9ad..0028d1b0b 100644 --- a/spacy/lang/ne/__init__.py +++ b/spacy/lang/ne/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class NepaliDefaults(Language.Defaults): +class NepaliDefaults(BaseDefaults): stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index a3591f1bf..8f370eaaf 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,21 +1,24 @@ from typing import Optional + from thinc.api import Model -from .stop_words import STOP_WORDS +from .lemmatizer import DutchLemmatizer from .lex_attrs import LEX_ATTRS -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES -from .lemmatizer import DutchLemmatizer -from ...language import Language +from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ...language import Language, BaseDefaults -class DutchDefaults(Language.Defaults): +class DutchDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS stop_words = STOP_WORDS @@ -27,11 +30,13 @@ class Dutch(Language): @Dutch.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return DutchLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Dutch"] diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 6c025dcf6..4f6b2ef30 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer): return forms else: oov_forms.append(form) - forms = list(set(oov_forms)) + forms = list(dict.fromkeys(oov_forms)) # Back-off through remaining return value candidates. if forms: for form in forms: diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py new file mode 100644 index 000000000..1ab5e7cff --- /dev/null +++ b/spacy/lang/nl/syntax_iterators.py @@ -0,0 +1,72 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on Doc and Span. + The definition is inspired by https://www.nltk.org/book/ch07.html + Consider : [Noun + determinant / adjective] and also [Pronoun] + """ + # fmt: off + # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on + doc = doclike.doc # Ensure works on both Doc and Span. 
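Several lemmatizer factories in this diff (Norwegian, Dutch, Polish, Russian, Swedish, Ukrainian, Macedonian) gain an explicit overwrite setting in their default config. A short sketch of how that setting can be supplied from user code; spacy.blank and add_pipe are the standard APIs, and the rule-mode lookup data is assumed to be installed separately:

import spacy

nlp = spacy.blank("nl")
# overwrite=False keeps any lemma already set on a token; overwrite=True
# replaces it. The other values match the factory defaults shown above.
nlp.add_pipe("lemmatizer", config={"mode": "rule", "overwrite": False})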
+ + # Check for dependencies: POS, DEP + if not doc.has_annotation("POS"): + raise ValueError(Errors.E1019) + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + # See UD tags: https://universaldependencies.org/u/dep/index.html + # amod = adjectival modifier + # nmod:poss = possessive nominal modifier + # nummod = numeric modifier + # det = determiner + # det:poss = possessive determiner + noun_deps = [ + doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"] + ] + + # nsubj = nominal subject + # nsubj:pass = passive nominal subject + pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]] + + # Label NP for the Span to identify it as Noun-Phrase + span_label = doc.vocab.strings.add("NP") + + # Only NOUNS and PRONOUNS matter + for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): + # For NOUNS + # Pick children from syntactic parse (only those with certain dependencies) + if word.pos == NOUN: + # Some debugging. It happens that VERBS are POS-TAGGED as NOUNS + # We check if the word has a "nsubj", if it's the case, we eliminate it + nsubjs = filter( + lambda x: x.dep == doc.vocab.strings["nsubj"], word.children + ) + next_word = next(nsubjs, None) + if next_word is not None: + # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN + continue + + children = filter(lambda x: x.dep in noun_deps, word.children) + children_i = [c.i for c in children] + [word.i] + + start_span = min(children_i) + end_span = max(children_i) + 1 + yield start_span, end_span, span_label + + # PRONOUNS only if it is the subject of a verb + elif word.pos == PRON: + if word.dep in pronoun_deps: + start_span = word.i + end_span = word.i + 1 + yield start_span, end_span, span_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 9e7303e83..4b8c88bd7 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -8,7 +8,7 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...language import Language +from ...language import Language, BaseDefaults TOKENIZER_EXCEPTIONS = { @@ -16,7 +16,7 @@ TOKENIZER_EXCEPTIONS = { } -class PolishDefaults(Language.Defaults): +class PolishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -33,11 +33,13 @@ class Polish(Language): @Polish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup"}, + default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return PolishLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Polish"] diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 0447099f0..9ae6501fb 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES -from ...language import Language 
+from ...language import Language, BaseDefaults -class PortugueseDefaults(Language.Defaults): +class PortugueseDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES prefixes = TOKENIZER_PREFIXES diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index f0d8d8d31..50027ffd2 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -3,14 +3,14 @@ from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults # Lemma data note: # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ # Replaced characters using cedillas with the correct ones (ș and ț) -class RomanianDefaults(Language.Defaults): +class RomanianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 2f3965fcc..16ae5eef5 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -5,10 +5,10 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer -from ...language import Language +from ...language import Language, BaseDefaults -class RussianDefaults(Language.Defaults): +class RussianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -22,7 +22,7 @@ class Russian(Language): @Russian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2"}, + default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( @@ -30,7 +30,7 @@ def make_lemmatizer( model: Optional[Model], name: str, mode: str, - overwrite: bool = False, + overwrite: bool, ): return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index b7a2fc8e4..ce5ccf36f 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -12,8 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'} class RussianLemmatizer(Lemmatizer): - _morph = None - def __init__( self, vocab: Vocab, @@ -23,18 +21,18 @@ class RussianLemmatizer(Lemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: + if mode == "pymorphy2": + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library. 
Install it with: pip install pymorphy2" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer() super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian lemmatizer requires the pymorphy2 library: " - 'try to fix it with "pip install pymorphy2"' - ) from None - if RussianLemmatizer._morph is None: - RussianLemmatizer._morph = MorphAnalyzer() - def pymorphy2_lemmatize(self, token: Token) -> List[str]: string = token.text univ_pos = token.pos_ @@ -58,7 +56,9 @@ class RussianLemmatizer(Lemmatizer): if not len(filtered_analyses): return [string.lower()] if morphology is None or (len(morphology) == 1 and POS in morphology): - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list( + dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]) + ) if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): features_to_compare = ["Case", "Number", "Gender"] elif univ_pos == "NUM": @@ -89,14 +89,16 @@ class RussianLemmatizer(Lemmatizer): filtered_analyses.append(analysis) if not len(filtered_analyses): return [string.lower()] - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list( + dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]) + ) - def lookup_lemmatize(self, token: Token) -> List[str]: + def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: string = token.text analyses = self._morph.parse(string) if len(analyses) == 1: - return analyses[0].normal_form - return string + return [analyses[0].normal_form] + return [string] def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]: diff --git a/spacy/lang/sa/__init__.py b/spacy/lang/sa/__init__.py index 345137817..61398af6c 100644 --- a/spacy/lang/sa/__init__.py +++ b/spacy/lang/sa/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class SanskritDefaults(Language.Defaults): +class SanskritDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py index d77e3bb8b..971cee3c6 100644 --- a/spacy/lang/si/__init__.py +++ b/spacy/lang/si/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class SinhalaDefaults(Language.Defaults): +class SinhalaDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py index 4003c7340..da6e3048e 100644 --- a/spacy/lang/sk/__init__.py +++ b/spacy/lang/sk/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class SlovakDefaults(Language.Defaults): +class SlovakDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py index 0330cc4d0..9ddd676bf 100644 --- a/spacy/lang/sl/__init__.py +++ b/spacy/lang/sl/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class SlovenianDefaults(Language.Defaults): +class SlovenianDefaults(BaseDefaults): stop_words = STOP_WORDS diff 
--git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py index a4bacfa49..5e32a0cbe 100644 --- a/spacy/lang/sq/__init__.py +++ b/spacy/lang/sq/__init__.py @@ -1,8 +1,8 @@ from .stop_words import STOP_WORDS -from ...language import Language +from ...language import Language, BaseDefaults -class AlbanianDefaults(Language.Defaults): +class AlbanianDefaults(BaseDefaults): stop_words = STOP_WORDS diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index 165e54975..fd0c8c832 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -1,10 +1,10 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class SerbianDefaults(Language.Defaults): +class SerbianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 2490eb9ec..518ee0db7 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -4,7 +4,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from ...language import Language +from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer @@ -12,7 +12,7 @@ from ...pipeline import Lemmatizer from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES -class SwedishDefaults(Language.Defaults): +class SwedishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES @@ -29,11 +29,13 @@ class Swedish(Language): @Swedish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule"}, + default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return Lemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Swedish"] diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index d5ae47853..06ad016ac 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -1,11 +1,11 @@ -from typing import Union, Iterator +from typing import Union, Iterator, Tuple from ...symbols import NOUN, PROPN, PRON from ...errors import Errors from ...tokens import Doc, Span -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" # fmt: off labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index ac5fc7124..4929a4b97 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class TamilDefaults(Language.Defaults): +class TamilDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py index e6dc80e28..77cc2fe9b 100644 --- a/spacy/lang/te/__init__.py +++ b/spacy/lang/te/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class TeluguDefaults(Language.Defaults): +class TeluguDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 219c50c1a..10d466bd3 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -1,6 +1,6 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str @@ -39,7 +39,7 @@ class ThaiTokenizer(DummyTokenizer): return Doc(self.vocab, words=words, spaces=spaces) -class ThaiDefaults(Language.Defaults): +class ThaiDefaults(BaseDefaults): config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/ti/__init__.py b/spacy/lang/ti/__init__.py index 709fb21cb..c74c081b5 100644 --- a/spacy/lang/ti/__init__.py +++ b/spacy/lang/ti/__init__.py @@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...language import Language +from ...language import Language, BaseDefaults from ...attrs import LANG from ...util import update_exc -class TigrinyaDefaults(Language.Defaults): +class TigrinyaDefaults(BaseDefaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "ti" diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index 61530dc30..30838890a 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -1,10 +1,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class TagalogDefaults(Language.Defaults): +class TagalogDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py new file mode 100644 index 000000000..28e887eea --- /dev/null +++ b/spacy/lang/tn/__init__.py @@ -0,0 +1,18 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from ...language import Language, BaseDefaults + + +class SetswanaDefaults(BaseDefaults): + infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Setswana(Language): + lang = "tn" + Defaults = 
SetswanaDefaults + + +__all__ = ["Setswana"] diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py new file mode 100644 index 000000000..7b33fae5a --- /dev/null +++ b/spacy/lang/tn/examples.py @@ -0,0 +1,15 @@ +""" +Example sentences to test spaCy and its language models. +>>> from spacy.lang.tn.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", + "Johannesburg ke toropo e kgolo mo Afrika Borwa.", + "O ko kae?", + "ke mang presidente ya Afrika Borwa?", + "ke eng toropo kgolo ya Afrika Borwa?", + "Nelson Mandela o belegwe leng?", +] diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py new file mode 100644 index 000000000..c136d0ab2 --- /dev/null +++ b/spacy/lang/tn/lex_attrs.py @@ -0,0 +1,107 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "lefela", + "nngwe", + "pedi", + "tharo", + "nne", + "tlhano", + "thataro", + "supa", + "robedi", + "robongwe", + "lesome", + "lesomenngwe", + "lesomepedi", + "sometharo", + "somenne", + "sometlhano", + "somethataro", + "somesupa", + "somerobedi", + "somerobongwe", + "someamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + + +_ordinal_words = [ + "ntlha", + "bobedi", + "boraro", + "bone", + "botlhano", + "borataro", + "bosupa", + "borobedi ", + "borobongwe", + "bolesome", + "bolesomengwe", + "bolesomepedi", + "bolesometharo", + "bolesomenne", + "bolesometlhano", + "bolesomethataro", + "bolesomesupa", + "bolesomerobedi", + "bolesomerobongwe", + "somamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + + text_lower = text.lower() + if text_lower in _num_words: + return True + + # CHeck ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith("th"): + if text_lower[:-2].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py new file mode 100644 index 000000000..a52755564 --- /dev/null +++ b/spacy/lang/tn/punctuation.py @@ -0,0 +1,19 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py new file mode 100644 index 000000000..f614771dd --- /dev/null +++ b/spacy/lang/tn/stop_words.py @@ -0,0 +1,20 @@ +# Stop words +STOP_WORDS = set( + """ +ke gareng ga selekanyo tlhwatlhwa yo mongwe se +sengwe fa go 
le jalo gongwe ba na mo tikologong +jaaka kwa morago nna gonne ka sa pele nako teng +tlase fela ntle magareng tsona feta bobedi kgabaganya +moo gape kgatlhanong botlhe tsotlhe bokana e esi +setseng mororo dinako golo kgolo nnye wena gago +o ntse ntle tla goreng gangwe mang yotlhe gore +eo yona tseraganyo eng ne sentle re rona thata +godimo fitlha pedi masomamabedi lesomepedi mmogo +tharo tseo boraro tseno yone jaanong bobona bona +lesome tsaya tsamaiso nngwe masomethataro thataro +tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi +bonala e tshwanang bogolo tsenya tsweetswee karolo +sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa +tlhano lesometlhano botlalo lekgolo +""".split() +) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 960302513..e41db911f 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -35,8 +35,8 @@ URL_PATTERN = ( # host & domain names # mods: match is case-sensitive, so include [A-Z] r"(?:" # noqa: E131 - r"(?:" - r"[A-Za-z0-9\u00a1-\uffff]" + r"(?:" # noqa: E131 + r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131 r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" r")?" r"[A-Za-z0-9\u00a1-\uffff]\." diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index 679411acf..02b5c7bf4 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class TurkishDefaults(Language.Defaults): +class TurkishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py index 3fd726fb5..769af1223 100644 --- a/spacy/lang/tr/syntax_iterators.py +++ b/spacy/lang/tr/syntax_iterators.py @@ -1,8 +1,10 @@ +from typing import Union, Iterator, Tuple +from ...tokens import Doc, Span from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(doclike): +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
""" diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py index c8e293f29..d5e1e87ef 100644 --- a/spacy/lang/tt/__init__.py +++ b/spacy/lang/tt/__init__.py @@ -2,10 +2,10 @@ from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from ...language import Language +from ...language import Language, BaseDefaults -class TatarDefaults(Language.Defaults): +class TatarDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 24c88e5a7..1fa568292 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -6,10 +6,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import UkrainianLemmatizer -from ...language import Language +from ...language import Language, BaseDefaults -class UkrainianDefaults(Language.Defaults): +class UkrainianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -23,11 +23,11 @@ class Ukrainian(Language): @Ukrainian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2"}, + default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool ): return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 56d9c75c0..1fb030e06 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -16,14 +16,15 @@ class UkrainianLemmatizer(RussianLemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: + if mode == "pymorphy2": + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Ukrainian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library and dictionaries. 
Install them with: " + "pip install pymorphy2 pymorphy2-dicts-uk" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer(lang="uk") super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Ukrainian lemmatizer requires the pymorphy2 library and " - "dictionaries: try to fix it with " - '"pip install pymorphy2 pymorphy2-dicts-uk"' - ) from None - if UkrainianLemmatizer._morph is None: - UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py index e3dee5805..266c5a73d 100644 --- a/spacy/lang/ur/__init__.py +++ b/spacy/lang/ur/__init__.py @@ -1,10 +1,10 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES -from ...language import Language +from ...language import Language, BaseDefaults -class UrduDefaults(Language.Defaults): +class UrduDefaults(BaseDefaults): suffixes = TOKENIZER_SUFFIXES lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 1328de495..9d5fd8d9d 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,8 +1,15 @@ +from typing import Any, Dict, Union +from pathlib import Path +import re +import srsly +import string + from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str +from ... import util DEFAULT_CONFIG = """ @@ -40,19 +47,110 @@ class VietnameseTokenizer(DummyTokenizer): def __call__(self, text: str) -> Doc: if self.use_pyvi: - words, spaces = self.ViTokenizer.spacy_tokenize(text) + words = self.pyvi_tokenize(text) + words, spaces = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) else: - words = [] - spaces = [] - for token in self.tokenizer(text): - words.extend(list(token.text)) - spaces.extend([False] * len(token.text)) - spaces[-1] = bool(token.whitespace_) + words, spaces = util.get_words_and_spaces(text.split(), text) return Doc(self.vocab, words=words, spaces=spaces) + # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from + # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran. 
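The rewritten Vietnamese tokenizer above hands externally segmented words to util.get_words_and_spaces so the resulting Doc reproduces the original text exactly. A small stand-alone sketch of that alignment step; the example words are hypothetical segmenter output:

from spacy.tokens import Doc
from spacy.util import get_words_and_spaces
from spacy.vocab import Vocab

text = "Xin chào thế giới"
segmented = ["Xin", "chào", "thế", "giới"]  # pretend output of an external segmenter
words, spaces = get_words_and_spaces(segmented, text)
doc = Doc(Vocab(), words=words, spaces=spaces)
assert doc.text == text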
+ # See licenses/3rd_party_licenses.txt + def pyvi_sylabelize_with_ws(self, text): + """Modified from pyvi to preserve whitespace and skip unicode + normalization.""" + specials = [r"==>", r"->", r"\.\.\.", r">>"] + digit = r"\d+([\.,_]\d+)+" + email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)" + web = r"\w+://[^\s]+" + word = r"\w+" + non_word = r"[^\w\s]" + abbreviations = [ + r"[A-ZĐ]+\.", + r"Tp\.", + r"Mr\.", + r"Mrs\.", + r"Ms\.", + r"Dr\.", + r"ThS\.", + ] -class VietnameseDefaults(Language.Defaults): + patterns = [] + patterns.extend(abbreviations) + patterns.extend(specials) + patterns.extend([web, email]) + patterns.extend([digit, non_word, word]) + + patterns = r"(\s+|" + "|".join(patterns) + ")" + tokens = re.findall(patterns, text, re.UNICODE) + + return [token[0] for token in tokens] + + def pyvi_tokenize(self, text): + """Modified from pyvi to preserve text and whitespace.""" + if len(text) == 0: + return [] + elif text.isspace(): + return [text] + segs = self.pyvi_sylabelize_with_ws(text) + words = [] + preceding_ws = [] + for i, token in enumerate(segs): + if not token.isspace(): + words.append(token) + preceding_ws.append( + "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1] + ) + labels = self.ViTokenizer.ViTokenizer.model.predict( + [self.ViTokenizer.ViTokenizer.sent2features(words, False)] + ) + token = words[0] + tokens = [] + for i in range(1, len(labels[0])): + if ( + labels[0][i] == "I_W" + and words[i] not in string.punctuation + and words[i - 1] not in string.punctuation + and not words[i][0].isdigit() + and not words[i - 1][0].isdigit() + and not (words[i][0].istitle() and not words[i - 1][0].istitle()) + ): + token = token + preceding_ws[i] + words[i] + else: + tokens.append(token) + token = words[i] + tokens.append(token) + return tokens + + def _get_config(self) -> Dict[str, Any]: + return {"use_pyvi": self.use_pyvi} + + def _set_config(self, config: Dict[str, Any] = {}) -> None: + self.use_pyvi = config.get("use_pyvi", False) + + def to_bytes(self, **kwargs) -> bytes: + serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())} + return util.to_bytes(serializers, []) + + def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer": + deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))} + util.from_bytes(data, deserializers, []) + return self + + def to_disk(self, path: Union[str, Path], **kwargs) -> None: + path = util.ensure_path(path) + serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())} + util.to_disk(path, serializers, []) + + def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer": + path = util.ensure_path(path) + serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))} + util.from_disk(path, serializers, []) + return self + + +class VietnameseDefaults(BaseDefaults): config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py index df6bb7d4a..6c38ec8af 100644 --- a/spacy/lang/yo/__init__.py +++ b/spacy/lang/yo/__init__.py @@ -1,9 +1,9 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...language import Language +from ...language import Language, BaseDefaults -class YorubaDefaults(Language.Defaults): +class YorubaDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 9a8a21a63..755a294e2 100644 --- 
a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -6,7 +6,7 @@ import warnings from pathlib import Path from ...errors import Warnings, Errors -from ...language import Language +from ...language import Language, BaseDefaults from ...scorer import Scorer from ...tokens import Doc from ...training import validate_examples, Example @@ -56,21 +56,21 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): class ChineseTokenizer(DummyTokenizer): def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char): self.vocab = nlp.vocab - if isinstance(segmenter, Segmenter): - segmenter = segmenter.value - self.segmenter = segmenter + self.segmenter = ( + segmenter.value if isinstance(segmenter, Segmenter) else segmenter + ) self.pkuseg_seg = None self.jieba_seg = None - if segmenter not in Segmenter.values(): + if self.segmenter not in Segmenter.values(): warn_msg = Warnings.W103.format( lang="Chinese", - segmenter=segmenter, + segmenter=self.segmenter, supported=", ".join(Segmenter.values()), default="'char' (character segmentation)", ) warnings.warn(warn_msg) self.segmenter = Segmenter.char - if segmenter == Segmenter.jieba: + if self.segmenter == Segmenter.jieba: self.jieba_seg = try_jieba_import() def initialize( @@ -90,7 +90,7 @@ class ChineseTokenizer(DummyTokenizer): def __call__(self, text: str) -> Doc: if self.segmenter == Segmenter.jieba: - words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) + words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) # type: ignore[union-attr] (words, spaces) = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) elif self.segmenter == Segmenter.pkuseg: @@ -121,7 +121,7 @@ class ChineseTokenizer(DummyTokenizer): try: import spacy_pkuseg - self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None) # type: ignore[attr-defined] except ImportError: msg = ( "spacy_pkuseg not installed: unable to reset pkuseg " @@ -129,7 +129,7 @@ class ChineseTokenizer(DummyTokenizer): ) raise ImportError(msg) from None for word in words: - self.pkuseg_seg.preprocesser.insert(word.strip(), "") + self.pkuseg_seg.preprocesser.insert(word.strip(), "") # type: ignore[attr-defined] else: warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warnings.warn(warn_msg) @@ -282,7 +282,7 @@ class ChineseTokenizer(DummyTokenizer): util.from_disk(path, serializers, []) -class ChineseDefaults(Language.Defaults): +class ChineseDefaults(BaseDefaults): config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -294,7 +294,7 @@ class Chinese(Language): Defaults = ChineseDefaults -def try_jieba_import() -> None: +def try_jieba_import(): try: import jieba @@ -310,7 +310,7 @@ def try_jieba_import() -> None: raise ImportError(msg) from None -def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: +def try_pkuseg_import(pkuseg_model: Optional[str], pkuseg_user_dict: Optional[str]): try: import spacy_pkuseg @@ -318,9 +318,9 @@ def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: msg = "spacy-pkuseg not installed. 
To use pkuseg, " + _PKUSEG_INSTALL_MSG raise ImportError(msg) from None try: - return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) + return spacy_pkuseg.pkuseg(pkuseg_model, user_dict=pkuseg_user_dict) except FileNotFoundError: - msg = "Unable to load pkuseg model from: " + pkuseg_model + msg = "Unable to load pkuseg model from: " + str(pkuseg_model or "") raise FileNotFoundError(msg) from None diff --git a/spacy/language.py b/spacy/language.py index 07e7e4148..80703259d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,5 +1,7 @@ -from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern -from typing import Tuple +from typing import Iterator, Optional, Any, Dict, Callable, Iterable +from typing import Union, Tuple, List, Set, Pattern, Sequence +from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload + from dataclasses import dataclass import random import itertools @@ -8,20 +10,23 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path import warnings -from thinc.api import Model, get_current_ops, Config, Optimizer +from thinc.api import get_current_ops, Config, CupyOps, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle from timeit import default_timer as timer +import traceback +from . import ty from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .training import Example, validate_examples from .training.initialize import init_vocab, init_tok2vec from .scorer import Scorer -from .util import registry, SimpleFrozenList, _pipe +from .util import registry, SimpleFrozenList, _pipe, raise_error from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER +from .util import warn_if_jupyter_cupy from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES @@ -34,6 +39,11 @@ from .git_info import GIT_VERSION from . import util from . import about from .lookups import load_lookups +from .compat import Literal + + +if TYPE_CHECKING: + from .pipeline import Pipe # noqa: F401 # This is the base config will all settings (training etc.) @@ -43,6 +53,9 @@ DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH) # in the main config and only added via the 'init fill-config' command DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" +# Type variable for contexts piped with documents +_AnyContext = TypeVar("_AnyContext") + class BaseDefaults: """Language data defaults, available via Language.Defaults. 
Can be @@ -52,14 +65,14 @@ class BaseDefaults: config: Config = Config(section_order=CONFIG_SECTION_ORDER) tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS - prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES - suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES - infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES - token_match: Optional[Pattern] = None - url_match: Optional[Pattern] = URL_MATCH + prefixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_PREFIXES + suffixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_SUFFIXES + infixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_INFIXES + token_match: Optional[Callable] = None + url_match: Optional[Callable] = URL_MATCH syntax_iterators: Dict[str, Callable] = {} lex_attr_getters: Dict[int, Callable[[str], Any]] = {} - stop_words = set() + stop_words: Set[str] = set() writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} @@ -104,11 +117,11 @@ class Language: object and processing pipeline. lang (str): Two-letter language ID, i.e. ISO code. - DOCS: https://nightly.spacy.io/api/language + DOCS: https://spacy.io/api/language """ Defaults = BaseDefaults - lang: str = None + lang: Optional[str] = None default_config = DEFAULT_CONFIG factories = SimpleFrozenDict(error=Errors.E957) @@ -141,7 +154,7 @@ class Language: returns a tokenizer. batch_size (int): Default batch size for pipe and evaluate. - DOCS: https://nightly.spacy.io/api/language#init + DOCS: https://spacy.io/api/language#init """ # We're only calling this to import all factories provided via entry # points. The factory decorator applied to these functions takes care @@ -151,7 +164,7 @@ class Language: self._config = DEFAULT_CONFIG.merge(self.default_config) self._meta = dict(meta) self._path = None - self._optimizer = None + self._optimizer: Optional[Optimizer] = None # Component meta and configs are only needed on the instance self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component self._pipe_configs: Dict[str, Config] = {} # config by component @@ -167,8 +180,8 @@ class Language: self.vocab: Vocab = vocab if self.lang is None: self.lang = self.vocab.lang - self._components = [] - self._disabled = set() + self._components: List[Tuple[str, "Pipe"]] = [] + self._disabled: Set[str] = set() self.max_length = max_length # Create the default tokenizer from the default config if not create_tokenizer: @@ -176,6 +189,7 @@ class Language: create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"] self.tokenizer = create_tokenizer(self) self.batch_size = batch_size + self.default_error_handler = raise_error def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) @@ -193,9 +207,9 @@ class Language: RETURNS (Dict[str, Any]): The meta. - DOCS: https://nightly.spacy.io/api/language#meta + DOCS: https://spacy.io/api/language#meta """ - spacy_version = util.get_model_version_range(about.__version__) + spacy_version = util.get_minor_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) else: @@ -234,7 +248,7 @@ class Language: RETURNS (thinc.api.Config): The config. 
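Aside: since the defaults are now explicitly typed (`stop_words: Set[str]`, `Sequence` for the tokenizer rule lists) and language subclasses inherit from `BaseDefaults` directly, as the Chinese defaults above now do, a minimal custom-language sketch could look like the following. The language code `"zz"` and both class names are invented for illustration; only `Language`, `BaseDefaults` and the attribute names come from this diff.

```python
from spacy.language import Language, BaseDefaults

class CustomDefaults(BaseDefaults):
    # stop_words is now annotated as Set[str]; prefixes/suffixes/infixes
    # accept any Sequence of strings or compiled patterns.
    stop_words = {"foo", "bar"}

class CustomLanguage(Language):
    lang = "zz"              # hypothetical language code
    Defaults = CustomDefaults

nlp = CustomLanguage()
doc = nlp("foo hello bar")
print([(token.text, token.is_stop) for token in doc])
```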
- DOCS: https://nightly.spacy.io/api/language#config + DOCS: https://spacy.io/api/language#config """ self._config.setdefault("nlp", {}) self._config.setdefault("training", {}) @@ -287,7 +301,7 @@ class Language: return SimpleFrozenList(names) @property - def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]: + def components(self) -> List[Tuple[str, "Pipe"]]: """Get all (name, component) tuples in the pipeline, including the currently disabled components. """ @@ -306,12 +320,12 @@ class Language: return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names")) @property - def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]: + def pipeline(self) -> List[Tuple[str, "Pipe"]]: """The processing pipeline consisting of (name, component) tuples. The components are called on the Doc in order as it passes through the pipeline. - RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline. + RETURNS (List[Tuple[str, Pipe]]): The pipeline. """ pipes = [(n, p) for n, p in self._components if n not in self._disabled] return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline")) @@ -419,7 +433,7 @@ class Language: assigns: Iterable[str] = SimpleFrozenList(), requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, - default_score_weights: Dict[str, float] = SimpleFrozenDict(), + default_score_weights: Dict[str, Optional[float]] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: """Register a new pipeline component factory. Can be used as a decorator @@ -431,19 +445,19 @@ class Language: default_config (Dict[str, Any]): Default configuration, describing the default values of the factory arguments. assigns (Iterable[str]): Doc/Token attributes assigned by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. requires (Iterable[str]): Doc/Token attributes required by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. - default_score_weights (Dict[str, float]): The scores to report during + default_score_weights (Dict[str, Optional[float]]): The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to 1.0 per component and will be combined and normalized for the whole pipeline. If None, the score won't be shown in the logs or be weighted. func (Optional[Callable]): Factory function if not used as a decorator. - DOCS: https://nightly.spacy.io/api/language#factory + DOCS: https://spacy.io/api/language#factory """ if not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="factory")) @@ -501,12 +515,12 @@ class Language: @classmethod def component( cls, - name: Optional[str] = None, + name: str, *, assigns: Iterable[str] = SimpleFrozenList(), requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, - func: Optional[Callable[[Doc], Doc]] = None, + func: Optional["Pipe"] = None, ) -> Callable: """Register a new pipeline component. Can be used for stateless function components that don't require a separate factory. Can be used as a @@ -516,24 +530,24 @@ class Language: name (str): The name of the component factory. assigns (Iterable[str]): Doc/Token attributes assigned by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. 
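To illustrate the widened `default_score_weights` type (`Dict[str, Optional[float]]`) in the `factory()` changes above, here is a hedged sketch of a factory registration. The factory name, config key and score names are all made up, and the component itself is a no-op.

```python
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory(
    "toy_component",  # hypothetical factory name
    default_config={"label": "TOY"},
    # None means the score is reported but not weighted in the final
    # score, which is what the Optional[float] values allow.
    default_score_weights={"toy_acc": 1.0, "toy_debug": None},
)
def create_toy_component(nlp: Language, name: str, label: str):
    def toy_component(doc: Doc) -> Doc:
        # A real component would set annotations on the Doc here.
        return doc
    return toy_component
```

Adding it with `nlp.add_pipe("toy_component", config={"label": "OTHER"})` would then merge the override into the default config, as described in the docstring above.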
requires (Iterable[str]): Doc/Token attributes required by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. func (Optional[Callable]): Factory function if not used as a decorator. - DOCS: https://nightly.spacy.io/api/language#component + DOCS: https://spacy.io/api/language#component """ if name is not None and not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="component")) component_name = name if name is not None else util.get_object_name(func) - def add_component(component_func: Callable[[Doc], Doc]) -> Callable: + def add_component(component_func: "Pipe") -> Callable: if isinstance(func, type): # function is a class raise ValueError(Errors.E965.format(name=component_name)) - def factory_func(nlp: cls, name: str) -> Callable[[Doc], Doc]: + def factory_func(nlp, name: str) -> "Pipe": return component_func internal_name = cls.get_factory_name(name) @@ -583,13 +597,13 @@ class Language: print_pipe_analysis(analysis, keys=keys) return analysis - def get_pipe(self, name: str) -> Callable[[Doc], Doc]: + def get_pipe(self, name: str) -> "Pipe": """Get a pipeline component for a given component name. name (str): Name of pipeline component to get. RETURNS (callable): The pipeline component. - DOCS: https://nightly.spacy.io/api/language#get_pipe + DOCS: https://spacy.io/api/language#get_pipe """ for pipe_name, component in self._components: if pipe_name == name: @@ -601,24 +615,24 @@ class Language: factory_name: str, name: Optional[str] = None, *, - config: Optional[Dict[str, Any]] = SimpleFrozenDict(), + config: Dict[str, Any] = SimpleFrozenDict(), raw_config: Optional[Config] = None, validate: bool = True, - ) -> Callable[[Doc], Doc]: + ) -> "Pipe": """Create a pipeline component. Mostly used internally. To create and add a component to the pipeline, you can use nlp.add_pipe. factory_name (str): Name of component factory. name (Optional[str]): Optional name to assign to component instance. Defaults to factory name if not set. - config (Optional[Dict[str, Any]]): Config parameters to use for this - component. Will be merged with default config, if available. + config (Dict[str, Any]): Config parameters to use for this component. + Will be merged with default config, if available. raw_config (Optional[Config]): Internals: the non-interpolated config. validate (bool): Whether to validate the component config against the arguments and types expected by the factory. - RETURNS (Callable[[Doc], Doc]): The pipeline component. + RETURNS (Pipe): The pipeline component. - DOCS: https://nightly.spacy.io/api/language#create_pipe + DOCS: https://spacy.io/api/language#create_pipe """ name = name if name is not None else factory_name if not isinstance(config, dict): @@ -636,21 +650,20 @@ class Language: ) raise ValueError(err) pipe_meta = self.get_factory_meta(factory_name) - config = config or {} # This is unideal, but the alternative would mean you always need to # specify the full config settings, which is not really viable. if pipe_meta.default_config: config = Config(pipe_meta.default_config).merge(config) - # We need to create a top-level key because Thinc doesn't allow resolving - # top-level references to registered functions. Also gives nicer errors. - # The name allows components to know their pipe name and use it in the - # losses etc. 
(even if multiple instances of the same factory are used) internal_name = self.get_factory_name(factory_name) # If the language-specific factory doesn't exist, try again with the # not-specific name if internal_name not in registry.factories: internal_name = factory_name + # The name allows components to know their pipe name and use it in the + # losses etc. (even if multiple instances of the same factory are used) config = {"nlp": self, "name": name, **config, "@factories": internal_name} + # We need to create a top-level key because Thinc doesn't allow resolving + # top-level references to registered functions. Also gives nicer errors. cfg = {factory_name: config} # We're calling the internal _fill here to avoid constructing the # registered functions twice @@ -672,7 +685,7 @@ class Language: def create_pipe_from_source( self, source_name: str, source: "Language", *, name: str - ) -> Tuple[Callable[[Doc], Doc], str]: + ) -> Tuple["Pipe", str]: """Create a pipeline component by copying it from an existing model. source_name (str): Name of the component in the source pipeline. @@ -680,15 +693,22 @@ class Language: name (str): Optional alternative name to use in current pipeline. RETURNS (Tuple[Callable, str]): The component and its factory name. """ - # TODO: handle errors and mismatches (vectors etc.) - if not isinstance(source, self.__class__): + # Check source type + if not isinstance(source, Language): raise ValueError(Errors.E945.format(name=source_name, source=type(source))) - if not source.has_pipe(source_name): + # Check vectors, with faster checks first + if ( + self.vocab.vectors.shape != source.vocab.vectors.shape + or self.vocab.vectors.key2row != source.vocab.vectors.key2row + or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes() + ): + warnings.warn(Warnings.W113.format(name=source_name)) + if source_name not in source.component_names: raise KeyError( Errors.E944.format( name=source_name, model=f"{source.meta['lang']}_{source.meta['name']}", - opts=", ".join(source.pipe_names), + opts=", ".join(source.component_names), ) ) pipe = source.get_pipe(source_name) @@ -697,8 +717,9 @@ class Language: source_config = source.config.interpolate() pipe_config = util.copy_config(source_config["components"][source_name]) self._pipe_configs[name] = pipe_config - for s in source.vocab.strings: - self.vocab.strings.add(s) + if self.vocab.strings != source.vocab.strings: + for s in source.vocab.strings: + self.vocab.strings.add(s) return pipe, pipe_config["factory"] def add_pipe( @@ -711,10 +732,10 @@ class Language: first: Optional[bool] = None, last: Optional[bool] = None, source: Optional["Language"] = None, - config: Optional[Dict[str, Any]] = SimpleFrozenDict(), + config: Dict[str, Any] = SimpleFrozenDict(), raw_config: Optional[Config] = None, validate: bool = True, - ) -> Callable[[Doc], Doc]: + ) -> "Pipe": """Add a component to the processing pipeline. Valid components are callables that take a `Doc` object, modify it and return it. Only one of before/after/first/last can be set. Default behaviour is "last". @@ -732,14 +753,14 @@ class Language: last (bool): If True, insert component last in the pipeline. source (Language): Optional loaded nlp object to copy the pipeline component from. - config (Optional[Dict[str, Any]]): Config parameters to use for this - component. Will be merged with default config, if available. + config (Dict[str, Any]): Config parameters to use for this component. + Will be merged with default config, if available. 
raw_config (Optional[Config]): Internals: the non-interpolated config. validate (bool): Whether to validate the component config against the arguments and types expected by the factory. - RETURNS (Callable[[Doc], Doc]): The pipeline component. + RETURNS (Pipe): The pipeline component. - DOCS: https://nightly.spacy.io/api/language#add_pipe + DOCS: https://spacy.io/api/language#add_pipe """ if not isinstance(factory_name, str): bad_val = repr(factory_name) @@ -837,7 +858,7 @@ class Language: name (str): Name of the component. RETURNS (bool): Whether a component of the name exists in the pipeline. - DOCS: https://nightly.spacy.io/api/language#has_pipe + DOCS: https://spacy.io/api/language#has_pipe """ return name in self.pipe_names @@ -848,7 +869,7 @@ class Language: *, config: Dict[str, Any] = SimpleFrozenDict(), validate: bool = True, - ) -> Callable[[Doc], Doc]: + ) -> "Pipe": """Replace a component in the pipeline. name (str): Name of the component to replace. @@ -857,18 +878,18 @@ class Language: component. Will be merged with default config, if available. validate (bool): Whether to validate the component config against the arguments and types expected by the factory. - RETURNS (Callable[[Doc], Doc]): The new pipeline component. + RETURNS (Pipe): The new pipeline component. - DOCS: https://nightly.spacy.io/api/language#replace_pipe + DOCS: https://spacy.io/api/language#replace_pipe """ - if name not in self.pipe_names: + if name not in self.component_names: raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) if hasattr(factory_name, "__call__"): err = Errors.E968.format(component=repr(factory_name), name=name) raise ValueError(err) # We need to delegate to Language.add_pipe here instead of just writing # to Language.pipeline to make sure the configs are handled correctly - pipe_index = self.pipe_names.index(name) + pipe_index = self.component_names.index(name) self.remove_pipe(name) if not len(self._components) or pipe_index == len(self._components): # we have no components to insert before/after, or we're replacing the last component @@ -890,7 +911,7 @@ class Language: old_name (str): Name of the component to rename. new_name (str): New name of the component. - DOCS: https://nightly.spacy.io/api/language#rename_pipe + DOCS: https://spacy.io/api/language#rename_pipe """ if old_name not in self.component_names: raise ValueError( @@ -909,13 +930,13 @@ class Language: init_cfg = self._config["initialize"]["components"].pop(old_name) self._config["initialize"]["components"][new_name] = init_cfg - def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]: + def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]: """Remove a component from the pipeline. name (str): Name of the component to remove. RETURNS (tuple): A `(name, component)` tuple of the removed component. - DOCS: https://nightly.spacy.io/api/language#remove_pipe + DOCS: https://spacy.io/api/language#remove_pipe """ if name not in self.component_names: raise ValueError(Errors.E001.format(name=name, opts=self.component_names)) @@ -924,6 +945,7 @@ class Language: # because factory may be used for something else self._pipe_meta.pop(name) self._pipe_configs.pop(name) + self.meta.get("_sourced_vectors_hashes", {}).pop(name, None) # Make sure name is removed from the [initialize] config if name in self._config["initialize"]["components"]: self._config["initialize"]["components"].pop(name) @@ -966,12 +988,12 @@ class Language: is preserved. text (str): The text to be processed. 
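The `source` argument and the vectors check added to `create_pipe_from_source` above can be exercised as in the following sketch, which assumes a trained pipeline such as `en_core_web_sm` is installed.

```python
import spacy

source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
# Copies the component and its config from the source pipeline. If the
# source's vectors differ from the current vocab's vectors, the check
# added above emits warning W113.
nlp.add_pipe("ner", source=source_nlp)
print(nlp.pipe_names)  # ['ner']
```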
- disable (list): Names of the pipeline components to disable. + disable (List[str]): Names of the pipeline components to disable. component_cfg (Dict[str, dict]): An optional dictionary with extra keyword arguments for specific components. RETURNS (Doc): A container for accessing the annotations. - DOCS: https://nightly.spacy.io/api/language#call + DOCS: https://spacy.io/api/language#call """ doc = self.make_doc(text) if component_cfg is None: @@ -981,11 +1003,16 @@ class Language: continue if not hasattr(proc, "__call__"): raise ValueError(Errors.E003.format(component=type(proc), name=name)) + error_handler = self.default_error_handler + if hasattr(proc, "get_error_handler"): + error_handler = proc.get_error_handler() try: - doc = proc(doc, **component_cfg.get(name, {})) + doc = proc(doc, **component_cfg.get(name, {})) # type: ignore[call-arg] except KeyError as e: # This typically happens if a component is not initialized raise ValueError(Errors.E109.format(name=name)) from e + except Exception as e: + error_handler(name, proc, [doc], e) if doc is None: raise ValueError(Errors.E005.format(name=name)) return doc @@ -1000,7 +1027,7 @@ class Language: """ warnings.warn(Warnings.W096, DeprecationWarning) if len(names) == 1 and isinstance(names[0], (list, tuple)): - names = names[0] # support list of names instead of spread + names = names[0] # type: ignore[assignment] # support list of names instead of spread return self.select_pipes(disable=names) def select_pipes( @@ -1017,7 +1044,7 @@ class Language: disable (str or iterable): The name(s) of the pipes to disable enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled - DOCS: https://nightly.spacy.io/api/language#select_pipes + DOCS: https://spacy.io/api/language#select_pipes """ if enable is None and disable is None: raise ValueError(Errors.E991) @@ -1035,6 +1062,7 @@ class Language: ) ) disable = to_disable + assert disable is not None # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude # those pipes that were already disabled. disable = [d for d in disable if d not in self._disabled] @@ -1062,6 +1090,7 @@ class Language: losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), + annotates: Iterable[str] = SimpleFrozenList(), ): """Update the models in the pipeline. @@ -1069,19 +1098,22 @@ class Language: _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (Optimizer): An optimizer. - losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. + losses (Dict[str, float]): Dictionary to update with the loss, keyed by + component. component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. exclude (Iterable[str]): Names of components that shouldn't be updated. + annotates (Iterable[str]): Names of components that should set + annotations on the predicted examples after updating. 
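A small sketch of the new `annotates` argument to `update()`: components listed there run over the predicted docs after the update step, so later components see their annotations during training. The tiny training example is invented; the calls themselves (`Example.from_dict`, `initialize`, `update`) are standard spaCy 3.x API.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
nlp.add_pipe("tagger")

train_data = [("I like cats", {"tags": ["PRON", "VERB", "NOUN"]})]
examples = [Example.from_dict(nlp.make_doc(t), ann) for t, ann in train_data]

optimizer = nlp.initialize(lambda: examples)
# After updating, the tok2vec component re-annotates eg.predicted, which
# is what the `annotates` parameter above controls.
losses = nlp.update(examples, sgd=optimizer, annotates=["tok2vec"])
print(losses)
```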
RETURNS (Dict[str, float]): The updated losses dictionary - DOCS: https://nightly.spacy.io/api/language#update + DOCS: https://spacy.io/api/language#update """ if _ is not None: raise ValueError(Errors.E989) if losses is None: losses = {} - if len(examples) == 0: + if isinstance(examples, list) and len(examples) == 0: return losses validate_examples(examples, "Language.update") examples = _copy_examples(examples) @@ -1091,22 +1123,36 @@ class Language: sgd = self._optimizer if component_cfg is None: component_cfg = {} + pipe_kwargs = {} for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) + pipe_kwargs[name] = deepcopy(component_cfg[name]) component_cfg[name].setdefault("drop", drop) + pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - if name in exclude or not hasattr(proc, "update"): - continue - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd not in (None, False): - for name, proc in self.pipeline: + # ignore statements are used here because mypy ignores hasattr + if name not in exclude and hasattr(proc, "update"): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore + if sgd not in (None, False): if ( name not in exclude - and hasattr(proc, "is_trainable") + and isinstance(proc, ty.TrainableComponent) and proc.is_trainable and proc.model not in (True, False, None) ): proc.finish_update(sgd) + if name in annotates: + for doc, eg in zip( + _pipe( + (eg.predicted for eg in examples), + proc=proc, + name=name, + default_error_handler=self.default_error_handler, + kwargs=pipe_kwargs[name], + ), + examples, + ): + eg.predicted = doc return losses def rehearse( @@ -1138,10 +1184,12 @@ class Language: >>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)] >>> nlp.rehearse(raw_batch) - DOCS: https://nightly.spacy.io/api/language#rehearse + DOCS: https://spacy.io/api/language#rehearse """ - if len(examples) == 0: - return + if losses is None: + losses = {} + if isinstance(examples, list) and len(examples) == 0: + return losses validate_examples(examples, "Language.rehearse") if sgd is None: if self._optimizer is None: @@ -1156,18 +1204,18 @@ class Language: def get_grads(W, dW, key=None): grads[key] = (W, dW) - get_grads.learn_rate = sgd.learn_rate - get_grads.b1 = sgd.b1 - get_grads.b2 = sgd.b2 + get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr] + get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr] + get_grads.b2 = sgd.b2 # type: ignore[attr-defined, union-attr] for name, proc in pipes: if name in exclude or not hasattr(proc, "rehearse"): continue grads = {} - proc.rehearse( + proc.rehearse( # type: ignore[attr-defined] examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {}) ) for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) + sgd(W, dW, key=key) # type: ignore[call-arg, misc] return losses def begin_training( @@ -1193,7 +1241,7 @@ class Language: provided, will be created using the .create_optimizer() method. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/language#initialize + DOCS: https://spacy.io/api/language#initialize """ if get_examples is None: util.logger.debug( @@ -1213,31 +1261,34 @@ class Language: before_init = I["before_init"] if before_init is not None: before_init(self) - init_vocab( - self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"] - ) - pretrain_cfg = config.get("pretraining") - if pretrain_cfg: - P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) - init_tok2vec(self, P, I) + try: + init_vocab( + self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"] + ) + except IOError: + raise IOError(Errors.E884.format(vectors=I["vectors"])) if self.vocab.vectors.data.shape[1] >= 1: ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) if hasattr(self.tokenizer, "initialize"): tok_settings = validate_init_settings( - self.tokenizer.initialize, + self.tokenizer.initialize, # type: ignore[union-attr] I["tokenizer"], section="tokenizer", name="tokenizer", ) - self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) + self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) # type: ignore[union-attr] for name, proc in self.pipeline: - if hasattr(proc, "initialize"): + if isinstance(proc, ty.InitializableComponent): p_settings = I["components"].get(name, {}) p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name ) proc.initialize(get_examples, nlp=self, **p_settings) + pretrain_cfg = config.get("pretraining") + if pretrain_cfg: + P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) + init_tok2vec(self, P, I) self._link_components() self._optimizer = sgd if sgd is not None: @@ -1260,20 +1311,38 @@ class Language: RETURNS (Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/language#resume_training + DOCS: https://spacy.io/api/language#resume_training """ ops = get_current_ops() if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): - proc._rehearsal_model = deepcopy(proc.model) + proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined] if sgd is not None: self._optimizer = sgd elif self._optimizer is None: self._optimizer = self.create_optimizer() return self._optimizer + def set_error_handler( + self, + error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn], + ): + """Set an error handler object for all the components in the pipeline that implement + a set_error_handler function. + + error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]): + Function that deals with a failing batch of documents. This callable function should take in + the component's name, the component itself, the offending batch of documents, and the exception + that was thrown. + DOCS: https://spacy.io/api/language#set_error_handler + """ + self.default_error_handler = error_handler + for name, pipe in self.pipeline: + if hasattr(pipe, "set_error_handler"): + pipe.set_error_handler(error_handler) + def evaluate( self, examples: Iterable[Example], @@ -1282,7 +1351,7 @@ class Language: scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, scorer_cfg: Optional[Dict[str, Any]] = None, - ) -> Dict[str, Union[float, dict]]: + ) -> Dict[str, Any]: """Evaluate a model's pipeline components. examples (Iterable[Example]): `Example` objects. 
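The new `set_error_handler()` can be sketched as follows. The "fragile" component and the handler are invented for illustration; the handler signature matches the type shown above, `(name, component, docs, exception)`.

```python
import spacy
from spacy.language import Language

@Language.component("fragile")
def fragile(doc):
    # Hypothetical component that fails on very short texts.
    if len(doc) < 3:
        raise ValueError("text too short")
    return doc

def log_and_continue(name, component, docs, error):
    # Called instead of raising; if the handler doesn't re-raise, the
    # failing docs are simply not yielded by nlp.pipe() in this sketch.
    print(f"{name} failed on {len(docs)} doc(s): {error}")

nlp = spacy.blank("en")
nlp.add_pipe("fragile")
nlp.set_error_handler(log_and_continue)
docs = list(nlp.pipe(["Hi", "This sentence is long enough to pass"]))
print(len(docs))
```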
@@ -1293,9 +1362,10 @@ class Language: arguments for specific components. scorer_cfg (dict): An optional dictionary with extra keyword arguments for the scorer. + RETURNS (Scorer): The scorer containing the evaluation results. - DOCS: https://nightly.spacy.io/api/language#evaluate + DOCS: https://spacy.io/api/language#evaluate """ examples = list(examples) validate_examples(examples, "Language.evaluate") @@ -1312,12 +1382,22 @@ class Language: scorer = Scorer(**kwargs) # reset annotation in predicted docs and time tokenization start_time = timer() + # this is purely for timing + for eg in examples: + self.make_doc(eg.reference.text) # apply all pipeline components for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) for doc, eg in zip( - _pipe((eg.predicted for eg in examples), pipe, kwargs), examples + _pipe( + (eg.predicted for eg in examples), + proc=pipe, + name=name, + default_error_handler=self.default_error_handler, + kwargs=kwargs, + ), + examples, ): eg.predicted = doc end_time = timer() @@ -1343,13 +1423,13 @@ class Language: >>> with nlp.use_params(optimizer.averages): >>> nlp.to_disk("/tmp/checkpoint") - DOCS: https://nightly.spacy.io/api/language#use_params + DOCS: https://spacy.io/api/language#use_params """ if not params: yield else: contexts = [ - pipe.use_params(params) + pipe.use_params(params) # type: ignore[attr-defined] for name, pipe in self.pipeline if hasattr(pipe, "use_params") and hasattr(pipe, "model") ] @@ -1367,16 +1447,42 @@ class Language: except StopIteration: pass + @overload def pipe( self, texts: Iterable[str], *, + as_tuples: Literal[False] = ..., + batch_size: Optional[int] = ..., + disable: Iterable[str] = ..., + component_cfg: Optional[Dict[str, Dict[str, Any]]] = ..., + n_process: int = ..., + ) -> Iterator[Doc]: + ... + + @overload + def pipe( # noqa: F811 + self, + texts: Iterable[Tuple[str, _AnyContext]], + *, + as_tuples: Literal[True] = ..., + batch_size: Optional[int] = ..., + disable: Iterable[str] = ..., + component_cfg: Optional[Dict[str, Dict[str, Any]]] = ..., + n_process: int = ..., + ) -> Iterator[Tuple[Doc, _AnyContext]]: + ... + + def pipe( # noqa: F811 + self, + texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]], + *, as_tuples: bool = False, batch_size: Optional[int] = None, disable: Iterable[str] = SimpleFrozenList(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, n_process: int = 1, - ): + ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]: """Process texts as a stream, and yield `Doc` objects in order. texts (Iterable[str]): A sequence of texts to process. @@ -1390,11 +1496,11 @@ class Language: n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`. YIELDS (Doc): Documents in the order of the original text. 
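The `@overload` declarations above exist so that type checkers can tell the two calling conventions of `pipe()` apart; at runtime the behaviour is the familiar one:

```python
import spacy

nlp = spacy.blank("en")
data = [("This is a text", {"id": 1}), ("And another one", {"id": 2})]

# With as_tuples=True the return type is Iterator[Tuple[Doc, context]],
# matching the second overload; without it, Iterator[Doc].
for doc, context in nlp.pipe(data, as_tuples=True):
    print(context["id"], len(doc))
```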
- DOCS: https://nightly.spacy.io/api/language#pipe + DOCS: https://spacy.io/api/language#pipe """ - if n_process == -1: - n_process = mp.cpu_count() + # Handle texts with context as tuples if as_tuples: + texts = cast(Iterable[Tuple[str, _AnyContext]], texts) text_context1, text_context2 = itertools.tee(texts) texts = (tc[0] for tc in text_context1) contexts = (tc[1] for tc in text_context2) @@ -1408,6 +1514,13 @@ class Language: for doc, context in zip(docs, contexts): yield (doc, context) return + + # At this point, we know that we're dealing with an iterable of plain texts + texts = cast(Iterable[str], texts) + + # Set argument defaults + if n_process == -1: + n_process = mp.cpu_count() if component_cfg is None: component_cfg = {} if batch_size is None: @@ -1422,10 +1535,19 @@ class Language: kwargs = component_cfg.get(name, {}) # Allow component_cfg to overwrite the top-level kwargs. kwargs.setdefault("batch_size", batch_size) - f = functools.partial(_pipe, proc=proc, kwargs=kwargs) + f = functools.partial( + _pipe, + proc=proc, + name=name, + kwargs=kwargs, + default_error_handler=self.default_error_handler, + ) pipes.append(f) if n_process != 1: + if self._has_gpu_model(disable): + warnings.warn(Warnings.W114) + docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size) else: # if n_process == 1, no processes are forked. @@ -1435,17 +1557,28 @@ class Language: for doc in docs: yield doc + def _has_gpu_model(self, disable: Iterable[str]): + for name, proc in self.pipeline: + is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore + if name in disable or not is_trainable: + continue + + if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore + return True + + return False + def _multiprocessing_pipe( self, texts: Iterable[str], - pipes: Iterable[Callable[[Doc], Doc]], + pipes: Iterable[Callable[..., Iterator[Doc]]], n_process: int, batch_size: int, - ) -> None: + ) -> Iterator[Doc]: # raw_texts is used later to stop iteration. texts, raw_texts = itertools.tee(texts) # for sending texts to worker - texts_q = [mp.Queue() for _ in range(n_process)] + texts_q: List[mp.Queue] = [mp.Queue() for _ in range(n_process)] # for receiving byte-encoded docs from worker bytedocs_recv_ch, bytedocs_send_ch = zip( *[mp.Pipe(False) for _ in range(n_process)] @@ -1472,11 +1605,21 @@ class Language: # Cycle channels not to break the order of docs. # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. - byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch)) - docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs) + byte_tuples = chain.from_iterable( + recv.recv() for recv in cycle(bytedocs_recv_ch) + ) try: - for i, (_, doc) in enumerate(zip(raw_texts, docs), 1): - yield doc + for i, (_, (byte_doc, byte_error)) in enumerate( + zip(raw_texts, byte_tuples), 1 + ): + if byte_doc is not None: + doc = Doc(self.vocab).from_bytes(byte_doc) + yield doc + elif byte_error is not None: + error = srsly.msgpack_loads(byte_error) + self.default_error_handler( + None, None, None, ValueError(Errors.E871.format(error=error)) + ) if i % batch_size == 0: # tell `sender` that one batch was consumed. sender.step() @@ -1488,13 +1631,13 @@ class Language: """Register 'listeners' within pipeline components, to allow them to effectively share weights. """ - # I had though, "Why do we do this inside the Language object? 
Shouldn't + # I had thought, "Why do we do this inside the Language object? Shouldn't # it be the tok2vec/transformer/etc's job? # The problem is we need to do it during deserialization...And the # components don't receive the pipeline then. So this does have to be # here :( for i, (name1, proc1) in enumerate(self.pipeline): - if hasattr(proc1, "find_listeners"): + if isinstance(proc1, ty.ListenedToComponent): for name2, proc2 in self.pipeline[i + 1 :]: proc1.find_listeners(proc2) @@ -1528,7 +1671,7 @@ class Language: the types expected by the factory. RETURNS (Language): The initialized Language class. - DOCS: https://nightly.spacy.io/api/language#from_config + DOCS: https://spacy.io/api/language#from_config """ if auto_fill: config = Config( @@ -1552,6 +1695,7 @@ class Language: # using the nlp.config with all defaults. config = util.copy_config(config) orig_pipeline = config.pop("components", {}) + orig_pretraining = config.pop("pretraining", None) config["components"] = {} if auto_fill: filled = registry.fill(config, validate=validate, schema=ConfigSchema) @@ -1559,6 +1703,9 @@ class Language: filled = config filled["components"] = orig_pipeline config["components"] = orig_pipeline + if orig_pretraining is not None: + filled["pretraining"] = orig_pretraining + config["pretraining"] = orig_pretraining resolved_nlp = registry.resolve( filled["nlp"], validate=validate, schema=ConfigSchemaNlp ) @@ -1575,6 +1722,10 @@ class Language: or lang_cls is not cls ): raise ValueError(Errors.E943.format(value=type(lang_cls))) + + # Warn about require_gpu usage in jupyter notebook + warn_if_jupyter_cupy() + # Note that we don't load vectors here, instead they get loaded explicitly # inside stuff like the spacy train function. If we loaded them here, # then we would load them twice at runtime: once when we make from config, @@ -1589,9 +1740,12 @@ class Language: # Later we replace the component config with the raw config again. interpolated = filled.interpolate() if not filled.is_interpolated else filled pipeline = interpolated.get("components", {}) + sourced = util.get_sourced_components(interpolated) # If components are loaded from a source (existing models), we cache # them here so they're only loaded once source_nlps = {} + source_nlp_vectors_hashes = {} + vocab_b = None for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) @@ -1614,13 +1768,51 @@ class Language: raw_config=raw_config, ) else: + # We need the sourced components to reference the same + # vocab without modifying the current vocab state **AND** + # we still want to load the source model vectors to perform + # the vectors check. Since the source vectors clobber the + # current ones, we save the original vocab state and + # restore after this loop. Existing strings are preserved + # during deserialization, so they do not need any + # additional handling. 
+ if vocab_b is None: + vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"]) model = pipe_cfg["source"] if model not in source_nlps: - # We only need the components here and we need to init - # model with the same vocab as the current nlp object - source_nlps[model] = util.load_model(model, vocab=nlp.vocab) + # Load with the same vocab, adding any strings + source_nlps[model] = util.load_model( + model, vocab=nlp.vocab, exclude=["lookups"] + ) source_name = pipe_cfg.get("component", pipe_name) - nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) + listeners_replaced = False + if "replace_listeners" in pipe_cfg: + for name, proc in source_nlps[model].pipeline: + if source_name in getattr(proc, "listening_components", []): + source_nlps[model].replace_listeners( + name, source_name, pipe_cfg["replace_listeners"] + ) + listeners_replaced = True + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W113\\]") + nlp.add_pipe( + source_name, source=source_nlps[model], name=pipe_name + ) + if model not in source_nlp_vectors_hashes: + source_nlp_vectors_hashes[model] = hash( + source_nlps[model].vocab.vectors.to_bytes() + ) + if "_sourced_vectors_hashes" not in nlp.meta: + nlp.meta["_sourced_vectors_hashes"] = {} + nlp.meta["_sourced_vectors_hashes"][ + pipe_name + ] = source_nlp_vectors_hashes[model] + # Delete from cache if listeners were replaced + if listeners_replaced: + del source_nlps[model] + # Restore the original vocab after sourcing if necessary + if vocab_b is not None: + nlp.vocab.from_bytes(vocab_b) disabled_pipes = [*config["nlp"]["disabled"], *disable] nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp.batch_size = config["nlp"]["batch_size"] @@ -1631,8 +1823,121 @@ class Language: raise ValueError( Errors.E942.format(name="pipeline_creation", value=type(nlp)) ) + # Detect components with listeners that are not frozen consistently + for name, proc in nlp.pipeline: + if isinstance(proc, ty.ListenedToComponent): + # Remove listeners not in the pipeline + listener_names = proc.listening_components + unused_listener_names = [ + ll for ll in listener_names if ll not in nlp.pipe_names + ] + for listener_name in unused_listener_names: + for listener in proc.listener_map.get(listener_name, []): + proc.remove_listener(listener, listener_name) + + for listener_name in proc.listening_components: + # e.g. tok2vec/transformer + # If it's a component sourced from another pipeline, we check if + # the tok2vec listeners should be replaced with standalone tok2vec + # models (e.g. so component can be frozen without its performance + # degrading when other components/tok2vec are updated) + paths = sourced.get(listener_name, {}).get("replace_listeners", []) + if paths: + nlp.replace_listeners(name, listener_name, paths) return nlp + def replace_listeners( + self, + tok2vec_name: str, + pipe_name: str, + listeners: Iterable[str], + ) -> None: + """Find listener layers (connecting to a token-to-vector embedding + component) of a given pipeline component model and replace + them with a standalone copy of the token-to-vector layer. This can be + useful when training a pipeline with components sourced from an existing + pipeline: if multiple components (e.g. tagger, parser, NER) listen to + the same tok2vec component, but some of them are frozen and not updated, + their performance may degrade significally as the tok2vec component is + updated with new data. 
To prevent this, listeners can be replaced with + a standalone tok2vec layer that is owned by the component and doesn't + change if the component isn't updated. + + tok2vec_name (str): Name of the token-to-vector component, typically + "tok2vec" or "transformer". + pipe_name (str): Name of pipeline component to replace listeners for. + listeners (Iterable[str]): The paths to the listeners, relative to the + component config, e.g. ["model.tok2vec"]. Typically, implementations + will only connect to one tok2vec component, [model.tok2vec], but in + theory, custom models can use multiple listeners. The value here can + either be an empty list to not replace any listeners, or a complete + (!) list of the paths to all listener layers used by the model. + + DOCS: https://spacy.io/api/language#replace_listeners + """ + if tok2vec_name not in self.pipe_names: + err = Errors.E889.format( + tok2vec=tok2vec_name, + name=pipe_name, + unknown=tok2vec_name, + opts=", ".join(self.pipe_names), + ) + raise ValueError(err) + if pipe_name not in self.pipe_names: + err = Errors.E889.format( + tok2vec=tok2vec_name, + name=pipe_name, + unknown=pipe_name, + opts=", ".join(self.pipe_names), + ) + raise ValueError(err) + tok2vec = self.get_pipe(tok2vec_name) + tok2vec_cfg = self.get_pipe_config(tok2vec_name) + if not isinstance(tok2vec, ty.ListenedToComponent): + raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec))) + tok2vec_model = tok2vec.model + pipe_listeners = tok2vec.listener_map.get(pipe_name, []) + pipe = self.get_pipe(pipe_name) + pipe_cfg = self._pipe_configs[pipe_name] + if listeners: + util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + if len(list(listeners)) != len(pipe_listeners): + # The number of listeners defined in the component model doesn't + # match the listeners to replace, so we won't be able to update + # the nodes and generate a matching config + err = Errors.E887.format( + name=pipe_name, + tok2vec=tok2vec_name, + paths=listeners, + n_listeners=len(pipe_listeners), + ) + raise ValueError(err) + # Update the config accordingly by copying the tok2vec model to all + # sections defined in the listener paths + for listener_path in listeners: + # Check if the path actually exists in the config + try: + util.dot_to_object(pipe_cfg, listener_path) + except KeyError: + err = Errors.E886.format( + name=pipe_name, tok2vec=tok2vec_name, path=listener_path + ) + raise ValueError(err) + new_config = tok2vec_cfg["model"] + if "replace_listener_cfg" in tok2vec_model.attrs: + replace_func = tok2vec_model.attrs["replace_listener_cfg"] + new_config = replace_func( + tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"] + ) + util.set_dot_to_object(pipe_cfg, listener_path, new_config) + # Go over the listener layers and replace them + for listener in pipe_listeners: + new_model = tok2vec_model.copy() + if "replace_listener" in tok2vec_model.attrs: + new_model = tok2vec_model.attrs["replace_listener"](new_model) + util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined] + tok2vec.remove_listener(listener, pipe_name) + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: @@ -1641,13 +1946,13 @@ class Language: path (str / Path): Path to a directory, which will be created if it doesn't exist. - exclude (list): Names of components or serialization fields to exclude. + exclude (Iterable[str]): Names of components or serialization fields to exclude. 
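A short sketch of the `replace_listeners()` method defined above, assuming `en_core_web_sm` is installed and that its tagger listens to the shared tok2vec component (which is how the stock English pipelines are set up):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
# Before: the tagger's model.tok2vec section is a listener architecture.
print(nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"])
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
# After: the tagger owns a standalone copy of the tok2vec layer, so it can
# be frozen without degrading when the shared tok2vec is updated.
print(nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"])
```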
- DOCS: https://nightly.spacy.io/api/language#to_disk + DOCS: https://spacy.io/api/language#to_disk """ path = util.ensure_path(path) serializers = {} - serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( + serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr] p, exclude=["vocab"] ) serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta) @@ -1657,22 +1962,26 @@ class Language: continue if not hasattr(proc, "to_disk"): continue - serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) - serializers["vocab"] = lambda p: self.vocab.to_disk(p) + serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) # type: ignore[misc] + serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + self, + path: Union[str, Path], + *, + exclude: Iterable[str] = SimpleFrozenList(), + overrides: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the model will be loaded. path (str / Path): A path to a directory. - exclude (list): Names of components or serialization fields to exclude. + exclude (Iterable[str]): Names of components or serialization fields to exclude. RETURNS (Language): The modified `Language` object. - DOCS: https://nightly.spacy.io/api/language#from_disk + DOCS: https://spacy.io/api/language#from_disk """ def deserialize_meta(path: Path) -> None: @@ -1685,17 +1994,17 @@ class Language: def deserialize_vocab(path: Path) -> None: if path.exists(): - self.vocab.from_disk(path) + self.vocab.from_disk(path, exclude=exclude) path = util.ensure_path(path) deserializers = {} - if Path(path / "config.cfg").exists(): + if Path(path / "config.cfg").exists(): # type: ignore[operator] deserializers["config.cfg"] = lambda p: self.config.from_disk( - p, interpolate=False + p, interpolate=False, overrides=overrides ) - deserializers["meta.json"] = deserialize_meta - deserializers["vocab"] = deserialize_vocab - deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( + deserializers["meta.json"] = deserialize_meta # type: ignore[assignment] + deserializers["vocab"] = deserialize_vocab # type: ignore[assignment] + deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr] p, exclude=["vocab"] ) for name, proc in self._components: @@ -1703,28 +2012,28 @@ class Language: continue if not hasattr(proc, "from_disk"): continue - deserializers[name] = lambda p, proc=proc: proc.from_disk( + deserializers[name] = lambda p, proc=proc: proc.from_disk( # type: ignore[misc] p, exclude=["vocab"] ) - if not (path / "vocab").exists() and "vocab" not in exclude: + if not (path / "vocab").exists() and "vocab" not in exclude: # type: ignore[operator] # Convert to list here in case exclude is (default) tuple exclude = list(exclude) + ["vocab"] - util.from_disk(path, deserializers, exclude) - self._path = path + util.from_disk(path, deserializers, exclude) # type: ignore[arg-type] + self._path = path # type: ignore[assignment] self._link_components() return self def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the current state to a binary string. - exclude (list): Names of components or serialization fields to exclude. 
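The new `overrides` argument to `from_disk()` is, as far as this diff suggests, what lets config overrides supplied at load time survive deserialization of the saved config. From user code the usual entry point is the `config` argument of `spacy.load()`; the sketch assumes `en_core_web_sm` is installed.

```python
import spacy

# The override is merged into the saved config when the pipeline is
# deserialized from disk.
nlp = spacy.load("en_core_web_sm", config={"nlp": {"batch_size": 64}})
print(nlp.batch_size)  # -> 64 with the override applied
```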
+ exclude (Iterable[str]): Names of components or serialization fields to exclude. RETURNS (bytes): The serialized form of the `Language` object. - DOCS: https://nightly.spacy.io/api/language#to_bytes + DOCS: https://spacy.io/api/language#to_bytes """ - serializers = {} - serializers["vocab"] = lambda: self.vocab.to_bytes() - serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) + serializers: Dict[str, Callable[[], bytes]] = {} + serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) + serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr] serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) serializers["config.cfg"] = lambda: self.config.to_bytes() for name, proc in self._components: @@ -1732,7 +2041,7 @@ class Language: continue if not hasattr(proc, "to_bytes"): continue - serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) + serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) # type: ignore[misc] return util.to_bytes(serializers, exclude) def from_bytes( @@ -1741,10 +2050,10 @@ class Language: """Load state from a binary string. bytes_data (bytes): The data to load from. - exclude (list): Names of components or serialization fields to exclude. + exclude (Iterable[str]): Names of components or serialization fields to exclude. RETURNS (Language): The `Language` object. - DOCS: https://nightly.spacy.io/api/language#from_bytes + DOCS: https://spacy.io/api/language#from_bytes """ def deserialize_meta(b): @@ -1754,13 +2063,13 @@ class Language: # from self.vocab.vectors, so set the name directly self.vocab.vectors.name = data.get("vectors", {}).get("name") - deserializers = {} + deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers["config.cfg"] = lambda b: self.config.from_bytes( b, interpolate=False ) deserializers["meta.json"] = deserialize_meta - deserializers["vocab"] = self.vocab.from_bytes - deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( + deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) + deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr] b, exclude=["vocab"] ) for name, proc in self._components: @@ -1768,7 +2077,7 @@ class Language: continue if not hasattr(proc, "from_bytes"): continue - deserializers[name] = lambda b, proc=proc: proc.from_bytes( + deserializers[name] = lambda b, proc=proc: proc.from_bytes( # type: ignore[misc] b, exclude=["vocab"] ) util.from_bytes(bytes_data, deserializers, exclude) @@ -1790,7 +2099,7 @@ class FactoryMeta: requires: Iterable[str] = tuple() retokenizes: bool = False scores: Iterable[str] = tuple() - default_score_weights: Optional[Dict[str, float]] = None # noqa: E704 + default_score_weights: Optional[Dict[str, Optional[float]]] = None # noqa: E704 class DisabledPipes(list): @@ -1822,7 +2131,7 @@ class DisabledPipes(list): def _copy_examples(examples: Iterable[Example]) -> List[Example]: """Make a copy of a batch of examples, copying the predicted Doc as well. This is used in contexts where we need to take ownership of the examples - so that they can be mutated, for instance during Language.evaluate and + so that they can be mutated, for instance during Language.evaluate and Language.update. 
""" return [Example(eg.x.copy(), eg.y) for eg in examples] @@ -1830,7 +2139,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]: def _apply_pipes( make_doc: Callable[[str], Doc], - pipes: Iterable[Callable[[Doc], Doc]], + pipes: Iterable[Callable[..., Iterator[Doc]]], receiver, sender, underscore_state: Tuple[dict, dict, dict], @@ -1838,7 +2147,7 @@ def _apply_pipes( """Worker for Language.pipe make_doc (Callable[[str,] Doc]): Function to create Doc from text. - pipes (Iterable[Callable[[Doc], Doc]]): The components to apply. + pipes (Iterable[Pipe]): The components to apply. receiver (multiprocessing.Connection): Pipe to receive text. Usually created by `multiprocessing.Pipe()` sender (multiprocessing.Connection): Pipe to send doc. Usually created by @@ -1848,12 +2157,19 @@ def _apply_pipes( """ Underscore.load_state(underscore_state) while True: - texts = receiver.get() - docs = (make_doc(text) for text in texts) - for pipe in pipes: - docs = pipe(docs) - # Connection does not accept unpickable objects, so send list. - sender.send([doc.to_bytes() for doc in docs]) + try: + texts = receiver.get() + docs = (make_doc(text) for text in texts) + for pipe in pipes: + docs = pipe(docs) # type: ignore[arg-type, assignment] + # Connection does not accept unpickable objects, so send list. + byte_docs = [(doc.to_bytes(), None) for doc in docs] + padding = [(None, None)] * (len(texts) - len(byte_docs)) + sender.send(byte_docs + padding) # type: ignore[operator] + except Exception: + error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] + padding = [(None, None)] * (len(texts) - 1) + sender.send(error_msg + padding) class _Sender: diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi new file mode 100644 index 000000000..4eae6be43 --- /dev/null +++ b/spacy/lexeme.pyi @@ -0,0 +1,61 @@ +from typing import ( + Union, + Any, +) +from thinc.types import Floats1d +from .tokens import Doc, Span, Token +from .vocab import Vocab + +class Lexeme: + def __init__(self, vocab: Vocab, orth: int) -> None: ... + def __richcmp__(self, other: Lexeme, op: int) -> bool: ... + def __hash__(self) -> int: ... + def set_attrs(self, **attrs: Any) -> None: ... + def set_flag(self, flag_id: int, value: bool) -> None: ... + def check_flag(self, flag_id: int) -> bool: ... + def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... + @property + def has_vector(self) -> bool: ... + @property + def vector_norm(self) -> float: ... + vector: Floats1d + rank: str + sentiment: float + @property + def orth_(self) -> str: ... + @property + def text(self) -> str: ... + lower: str + norm: int + shape: int + prefix: int + suffix: int + cluster: int + lang: int + prob: float + lower_: str + norm_: str + shape_: str + prefix_: str + suffix_: str + lang_: str + flags: int + @property + def is_oov(self) -> bool: ... + is_stop: bool + is_alpha: bool + is_ascii: bool + is_digit: bool + is_lower: bool + is_upper: bool + is_title: bool + is_punct: bool + is_space: bool + is_bracket: bool + is_quote: bool + is_left_punct: bool + is_right_punct: bool + is_currency: bool + like_url: bool + like_num: bool + like_email: bool diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 17ce574ce..3564b6e42 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -30,7 +30,7 @@ cdef class Lexeme: tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). 
- DOCS: https://nightly.spacy.io/api/lexeme + DOCS: https://spacy.io/api/lexeme """ def __init__(self, Vocab vocab, attr_t orth): """Create a Lexeme object. @@ -163,7 +163,7 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: - """RETURNS (str): Sequential ID of the lexemes's lexical type, used + """RETURNS (str): Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors.""" def __get__(self): return self.c.id @@ -205,7 +205,7 @@ cdef class Lexeme: self.c.lower = x property norm: - """RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the + """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -288,7 +288,7 @@ cdef class Lexeme: self.c.lower = self.vocab.strings.add(x) property norm_: - """RETURNS (str): The lexemes's norm, i.e. a normalised form of the + """RETURNS (str): The lexeme's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -451,7 +451,7 @@ cdef class Lexeme: Lexeme.c_set_flag(self.c, IS_QUOTE, x) property is_left_punct: - """RETURNS (bool): Whether the lexeme is left punctuation, e.g. ).""" + """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (.""" def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) diff --git a/spacy/lookups.py b/spacy/lookups.py index 133cb0672..b2f3dc15e 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, List, Union, Optional +from typing import Any, List, Union, Optional, Dict from pathlib import Path import srsly from preshed.bloom import BloomFilter @@ -12,18 +12,16 @@ from .strings import get_string_id UNSET = object() -def load_lookups( - lang: str, tables: List[str], strict: bool = True -) -> Optional[Dict[str, Any]]: +def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups": """Load the data from the spacy-lookups-data package for a given language, - if available. Returns an empty dict if there's no data or if the package + if available. Returns an empty `Lookups` container if there's no data or if the package is not installed. lang (str): The language code (corresponds to entry point exposed by the spacy-lookups-data package). tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"] strict (bool): Whether to raise an error if a table doesn't exist. - RETURNS (Dict[str, Any]): The lookups, keyed by table name. + RETURNS (Lookups): The lookups container containing the loaded tables. """ # TODO: import spacy_lookups_data instead of going via entry points here? lookups = Lookups() @@ -36,9 +34,9 @@ def load_lookups( if table not in data: if strict: raise ValueError(Errors.E955.format(table=table, lang=lang)) - language_data = {} + language_data = {} # type: ignore[var-annotated] else: - language_data = load_language_data(data[table]) + language_data = load_language_data(data[table]) # type: ignore[assignment] lookups.add_table(table, language_data) return lookups @@ -57,7 +55,7 @@ class Table(OrderedDict): data (dict): The dictionary. name (str): Optional table name for reference. - DOCS: https://nightly.spacy.io/api/lookups#table.from_dict + DOCS: https://spacy.io/api/lookups#table.from_dict """ self = cls(name=name) self.update(data) @@ -69,7 +67,7 @@ class Table(OrderedDict): name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. 
- DOCS: https://nightly.spacy.io/api/lookups#table.init + DOCS: https://spacy.io/api/lookups#table.init """ OrderedDict.__init__(self) self.name = name @@ -118,7 +116,7 @@ class Table(OrderedDict): key = get_string_id(key) return OrderedDict.get(self, key, default) - def __contains__(self, key: Union[str, int]) -> bool: + def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override] """Check whether a key is in the table. String keys will be hashed. key (str / int): The key to check. @@ -135,7 +133,7 @@ class Table(OrderedDict): RETURNS (bytes): The serialized table. - DOCS: https://nightly.spacy.io/api/lookups#table.to_bytes + DOCS: https://spacy.io/api/lookups#table.to_bytes """ data = { "name": self.name, @@ -150,7 +148,7 @@ class Table(OrderedDict): bytes_data (bytes): The data to load. RETURNS (Table): The loaded table. - DOCS: https://nightly.spacy.io/api/lookups#table.from_bytes + DOCS: https://spacy.io/api/lookups#table.from_bytes """ loaded = srsly.msgpack_loads(bytes_data) data = loaded.get("dict", {}) @@ -172,9 +170,9 @@ class Lookups: def __init__(self) -> None: """Initialize the Lookups object. - DOCS: https://nightly.spacy.io/api/lookups#init + DOCS: https://spacy.io/api/lookups#init """ - self._tables = {} + self._tables: Dict[str, Table] = {} def __contains__(self, name: str) -> bool: """Check if the lookups contain a table of a given name. Delegates to @@ -201,7 +199,7 @@ class Lookups: data (dict): Optional data to add to the table. RETURNS (Table): The newly added table. - DOCS: https://nightly.spacy.io/api/lookups#add_table + DOCS: https://spacy.io/api/lookups#add_table """ if name in self.tables: raise ValueError(Errors.E158.format(name=name)) @@ -215,7 +213,7 @@ class Lookups: name (str): Name of the table to set. table (Table): The Table to set. - DOCS: https://nightly.spacy.io/api/lookups#set_table + DOCS: https://spacy.io/api/lookups#set_table """ self._tables[name] = table @@ -227,7 +225,7 @@ class Lookups: default (Any): Optional default value to return if table doesn't exist. RETURNS (Table): The table. - DOCS: https://nightly.spacy.io/api/lookups#get_table + DOCS: https://spacy.io/api/lookups#get_table """ if name not in self._tables: if default == UNSET: @@ -241,7 +239,7 @@ class Lookups: name (str): Name of the table to remove. RETURNS (Table): The removed table. - DOCS: https://nightly.spacy.io/api/lookups#remove_table + DOCS: https://spacy.io/api/lookups#remove_table """ if name not in self._tables: raise KeyError(Errors.E159.format(name=name, tables=self.tables)) @@ -253,7 +251,7 @@ class Lookups: name (str): Name of the table. RETURNS (bool): Whether a table of that name exists. - DOCS: https://nightly.spacy.io/api/lookups#has_table + DOCS: https://spacy.io/api/lookups#has_table """ return name in self._tables @@ -262,7 +260,7 @@ class Lookups: RETURNS (bytes): The serialized Lookups. - DOCS: https://nightly.spacy.io/api/lookups#to_bytes + DOCS: https://spacy.io/api/lookups#to_bytes """ return srsly.msgpack_dumps(self._tables) @@ -272,7 +270,7 @@ class Lookups: bytes_data (bytes): The data to load. RETURNS (Lookups): The loaded Lookups. - DOCS: https://nightly.spacy.io/api/lookups#from_bytes + DOCS: https://spacy.io/api/lookups#from_bytes """ self._tables = {} for key, value in srsly.msgpack_loads(bytes_data).items(): @@ -287,7 +285,7 @@ class Lookups: path (str / Path): The file path. 
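With `load_lookups` now always returning a `Lookups` container, downstream code can rely on the container API documented above. A sketch using an ad-hoc table (the table name and data are illustrative):

```python
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_exc", {"was": "be"})

if lookups.has_table("lemma_exc"):
    print(lookups.get_table("lemma_exc")["was"])   # be

# serialization round-trips through msgpack
restored = Lookups().from_bytes(lookups.to_bytes())
print(restored.tables)                             # ['lemma_exc']
```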
- DOCS: https://nightly.spacy.io/api/lookups#to_disk + DOCS: https://spacy.io/api/lookups#to_disk """ path = ensure_path(path) if not path.exists(): @@ -305,7 +303,7 @@ class Lookups: path (str / Path): The directory path. RETURNS (Lookups): The loaded lookups. - DOCS: https://nightly.spacy.io/api/lookups#from_disk + DOCS: https://spacy.io/api/lookups#from_disk """ path = ensure_path(path) filepath = path / filename diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 69de57026..b667e6b2f 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -3,13 +3,13 @@ from typing import List from collections import defaultdict from itertools import product -import numpy +import warnings from .matcher cimport Matcher from ..vocab cimport Vocab from ..tokens.doc cimport Doc -from ..errors import Errors +from ..errors import Errors, Warnings from ..tokens import Span @@ -121,13 +121,15 @@ cdef class DependencyMatcher: raise ValueError(Errors.E099.format(key=key)) visited_nodes[relation["RIGHT_ID"]] = True else: - if not( - "RIGHT_ID" in relation - and "RIGHT_ATTRS" in relation - and "REL_OP" in relation - and "LEFT_ID" in relation - ): - raise ValueError(Errors.E100.format(key=key)) + required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"} + relation_keys = set(relation.keys()) + missing = required_keys - relation_keys + if missing: + missing_txt = ", ".join(list(missing)) + raise ValueError(Errors.E100.format( + required=required_keys, + missing=missing_txt + )) if ( relation["RIGHT_ID"] in visited_nodes or relation["LEFT_ID"] not in visited_nodes @@ -137,6 +139,8 @@ cdef class DependencyMatcher: raise ValueError(Errors.E1007.format(op=relation["REL_OP"])) visited_nodes[relation["RIGHT_ID"]] = True visited_nodes[relation["LEFT_ID"]] = True + if relation["RIGHT_ATTRS"].get("OP", "") in ("?", "*", "+"): + raise ValueError(Errors.E1016.format(node=relation)) idx = idx + 1 def _get_matcher_key(self, key, pattern_idx, token_idx): @@ -172,28 +176,23 @@ cdef class DependencyMatcher: self._callbacks[key] = on_match # Add 'RIGHT_ATTRS' to self._patterns[key] - _patterns = [] - for pattern in patterns: - token_patterns = [] - for i in range(len(pattern)): - token_pattern = [pattern[i]["RIGHT_ATTRS"]] - token_patterns.append(token_pattern) - _patterns.append(token_patterns) + _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns] + pattern_offset = len(self._patterns[key]) self._patterns[key].extend(_patterns) # Add each node pattern of all the input patterns individually to the # matcher. This enables only a single instance of Matcher to be used. # Multiple adds are required to track each node pattern. tokens_to_key_list = [] - for i in range(len(_patterns)): + for i, current_patterns in enumerate(_patterns, start=pattern_offset): # Preallocate list space - tokens_to_key = [None]*len(_patterns[i]) + tokens_to_key = [None] * len(current_patterns) # TODO: Better ways to hash edges in pattern? 
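The stricter validation above reports exactly which of `RIGHT_ID`, `RIGHT_ATTRS`, `REL_OP` and `LEFT_ID` are missing, and rejects `OP: "?"/"*"/"+"` inside `RIGHT_ATTRS`. For reference, a minimal well-formed pattern; the example sentence and the `en_core_web_sm` pipeline are assumptions, any pipeline with a parser will do:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")       # assumption: any parsed pipeline
matcher = DependencyMatcher(nlp.vocab)

pattern = [
    # anchor node: only RIGHT_ID and RIGHT_ATTRS
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # every other node needs LEFT_ID, REL_OP, RIGHT_ID and RIGHT_ATTRS
    {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "subject",
     "RIGHT_ATTRS": {"DEP": "nsubj"}},
]
matcher.add("VERB_SUBJECT", [pattern])

doc = nlp("The cat chased the dog.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])   # e.g. ['chased', 'cat']
```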
- for j in range(len(_patterns[i])): + for j, _pattern in enumerate(current_patterns): k = self._get_matcher_key(key, i, j) - self._matcher.add(k, [_patterns[i][j]]) + self._matcher.add(k, [_pattern]) tokens_to_key[j] = k tokens_to_key_list.append(tokens_to_key) @@ -265,7 +264,9 @@ cdef class DependencyMatcher: self._raw_patterns.pop(key) self._tree.pop(key) self._root.pop(key) - self._tokens_to_key.pop(key) + for mklist in self._tokens_to_key.pop(key): + for mkey in mklist: + self._matcher.remove(mkey) def _get_keys_to_position_maps(self, doc): """ @@ -277,7 +278,9 @@ cdef class DependencyMatcher: e.g. keys_to_position_maps[root_index][match_id] = [...] """ keys_to_position_maps = defaultdict(lambda: defaultdict(list)) - for match_id, start, _ in self._matcher(doc): + for match_id, start, end in self._matcher(doc): + if start + 1 != end: + warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) token = doc[start] root = ([token] + list(token.ancestors))[-1] keys_to_position_maps[root.i][match_id].append(start) @@ -294,7 +297,7 @@ cdef class DependencyMatcher: if isinstance(doclike, Doc): doc = doclike elif isinstance(doclike, Span): - doc = doclike.as_doc() + doc = doclike.as_doc(copy_user_data=True) else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) @@ -328,7 +331,7 @@ cdef class DependencyMatcher: # position of the matched tokens for candidate_match in product(*all_positions): - # A potential match is a valid match if all relationhips between the + # A potential match is a valid match if all relationships between the # matched tokens are satisfied. is_valid = True for left_idx in range(len(candidate_match)): @@ -415,18 +418,10 @@ cdef class DependencyMatcher: return [] def _right_sib(self, doc, node): - candidate_children = [] - for child in list(doc[node].head.children): - if child.i > node: - candidate_children.append(doc[child.i]) - return candidate_children + return [doc[child.i] for child in doc[node].head.children if child.i > node] def _left_sib(self, doc, node): - candidate_children = [] - for child in list(doc[node].head.children): - if child.i < node: - candidate_children.append(doc[child.i]) - return candidate_children + return [doc[child.i] for child in doc[node].head.children if child.i < node] def _normalize_key(self, key): if isinstance(key, basestring): diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 52a30d94c..455f978cc 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -46,6 +46,12 @@ cdef struct TokenPatternC: int32_t nr_py quantifier_t quantifier hash_t key + int32_t token_idx + + +cdef struct MatchAlignmentC: + int32_t token_idx + int32_t length cdef struct PatternStateC: diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi new file mode 100644 index 000000000..ec4a88eaf --- /dev/null +++ b/spacy/matcher/matcher.pyi @@ -0,0 +1,42 @@ +from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable +from ..vocab import Vocab +from ..tokens import Doc, Span + +class Matcher: + def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... + def __reduce__(self) -> Any: ... + def __len__(self) -> int: ... + def __contains__(self, key: str) -> bool: ... + def add( + self, + key: Union[str, int], + patterns: List[List[Dict[str, Any]]], + *, + on_match: Optional[ + Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] + ] = ..., + greedy: Optional[str] = ... 
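The new `matcher.pyi` stub pins down `Matcher.add`: patterns go in as a list of token-pattern lists, with `on_match` and `greedy` as keyword arguments. A short sketch of the `greedy` filter (the text is illustrative):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# one key, a list of token patterns, and the optional greedy filter
matcher.add("NUMBERS", [[{"LIKE_NUM": True, "OP": "+"}]], greedy="LONGEST")

doc = nlp("one two three four words")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)       # one two three four
```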
+ ) -> None: ... + def remove(self, key: str) -> None: ... + def has_key(self, key: Union[str, int]) -> bool: ... + def get( + self, key: Union[str, int], default: Optional[Any] = ... + ) -> Tuple[Optional[Callable[[Any], Any]], List[List[Dict[Any, Any]]]]: ... + def pipe( + self, + docs: Iterable[Tuple[Doc, Any]], + batch_size: int = ..., + return_matches: bool = ..., + as_tuples: bool = ..., + ) -> Union[ + Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc] + ]: ... + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: bool = ..., + allow_missing: bool = ..., + with_alignments: bool = ... + ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + def _normalize_key(self, key: Any) -> Any: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 31699bfa1..f8482a1eb 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -32,8 +32,8 @@ DEF PADDING = 5 cdef class Matcher: """Match sequences of tokens, based on pattern rules. - DOCS: https://nightly.spacy.io/api/matcher - USAGE: https://nightly.spacy.io/usage/rule-based-matching + DOCS: https://spacy.io/api/matcher + USAGE: https://spacy.io/usage/rule-based-matching """ def __init__(self, vocab, validate=True): @@ -96,12 +96,10 @@ cdef class Matcher: by returning a non-overlapping set per key, either taking preference to the first greedy match ("FIRST"), or the longest ("LONGEST"). - As of spaCy v2.2.2, Matcher.add supports the future API, which makes - the patterns the second argument and a list (instead of a variable - number of arguments). The on_match callback becomes an optional keyword - argument. + Since spaCy v2.2.2, Matcher.add takes a list of patterns as the second + argument, and the on_match callback is an optional keyword argument. - key (str): The match ID. + key (Union[str, int]): The match ID. patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. greedy (str): Optional filter: "FIRST" or "LONGEST". @@ -138,6 +136,11 @@ cdef class Matcher: self._filter[key] = greedy self._patterns[key].extend(patterns) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name="matcher")) + def remove(self, key): """Remove a rule from the matcher. A KeyError is raised if the key does not exist. @@ -196,17 +199,26 @@ cdef class Matcher: else: yield doc - def __call__(self, object doclike, *, as_spans=False, allow_missing=False): + def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False): """Find all token sequences matching the supplied pattern. doclike (Doc or Span): The document to match over. as_spans (bool): Return Span objects with labels instead of (match_id, start, end) tuples. + allow_missing (bool): Whether to skip checks for missing annotation for + attributes included in patterns. Defaults to False. + with_alignments (bool): Return match alignment information, which is + `List[int]` with length of matched span. Each entry denotes the + corresponding index of token pattern. If as_spans is set to True, + this setting is ignored. RETURNS (list): A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is an integer. If as_spans is set to True, a list of Span objects is returned. 
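`as_spans=True` returns labelled `Span` objects instead of `(match_id, start, end)` tuples, as described in the docstring above; a small sketch:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

doc = nlp("Hello world, hello World!")
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)    # the span label is the match key
```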
+ If with_alignments is set to True and as_spans is set to False, + A list of `(match_id, start, end, alignments)` tuples is returned. """ + self._require_patterns() if isinstance(doclike, Doc): doc = doclike length = len(doc) @@ -215,6 +227,9 @@ cdef class Matcher: length = doclike.end - doclike.start else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) + # Skip alignments calculations if as_spans is set + if as_spans: + with_alignments = False cdef Pool tmp_pool = Pool() if not allow_missing: for attr in (TAG, POS, MORPH, LEMMA, DEP): @@ -222,7 +237,7 @@ cdef class Matcher: if attr == TAG: pipe = "tagger" elif attr in (POS, MORPH): - pipe = "morphologizer" + pipe = "morphologizer or tagger+attribute_ruler" elif attr == LEMMA: pipe = "lemmatizer" elif attr == DEP: @@ -230,18 +245,20 @@ cdef class Matcher: error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) raise ValueError(error_msg) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates) + extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) final_matches = [] pairs_by_id = {} - # For each key, either add all matches, or only the filtered, non-overlapping ones - for (key, start, end) in matches: + # For each key, either add all matches, or only the filtered, + # non-overlapping ones this `match` can be either (start, end) or + # (start, end, alignments) depending on `with_alignments=` option. + for key, *match in matches: span_filter = self._filter.get(key) if span_filter is not None: pairs = pairs_by_id.get(key, []) - pairs.append((start,end)) + pairs.append(match) pairs_by_id[key] = pairs else: - final_matches.append((key, start, end)) + final_matches.append((key, *match)) matched = tmp_pool.alloc(length, sizeof(char)) empty = tmp_pool.alloc(length, sizeof(char)) for key, pairs in pairs_by_id.items(): @@ -253,23 +270,46 @@ cdef class Matcher: sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length else: raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter)) - for (start, end) in sorted_pairs: + for match in sorted_pairs: + start, end = match[:2] assert 0 <= start < end # Defend against segfaults span_len = end-start # If no tokens in the span have matched if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0: - final_matches.append((key, start, end)) + final_matches.append((key, *match)) # Mark tokens that have matched memset(&matched[start], 1, span_len * sizeof(matched[0])) + if as_spans: + final_results = [] + for key, start, end, *_ in final_matches: + if isinstance(doclike, Span): + start += doclike.start + end += doclike.start + final_results.append(Span(doc, start, end, label=key)) + elif with_alignments: + # convert alignments List[Dict[str, int]] --> List[int] + # when multiple alignment (belongs to the same length) is found, + # keeps the alignment that has largest token_idx + final_results = [] + for key, start, end, alignments in final_matches: + sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False) + alignments = [0] * (end-start) + for align in sorted_alignments: + if align['length'] >= end-start: + continue + # Since alignments are sorted in order of (length, token_idx) + # this overwrites smaller token_idx when they have same length. 
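After the conversion above, `with_alignments=True` attaches one list per match that maps every matched token back to the index of the token pattern that produced it. A sketch (the printed values are a guide, not verified output):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# token pattern 0 repeats via OP "+", token pattern 1 matches "nice"
matcher.add("VERY_NICE", [[{"LOWER": "very", "OP": "+"}, {"LOWER": "nice"}]])

doc = nlp("It is very very nice")
for match_id, start, end, alignments in matcher(doc, with_alignments=True):
    # one alignment entry per matched token, pointing at its pattern index
    print(doc[start:end].text, alignments)   # e.g. "very very nice" [0, 0, 1]
```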
+ alignments[align['length']] = align['token_idx'] + final_results.append((key, start, end, alignments)) + final_matches = final_results # for callbacks + else: + final_results = final_matches # perform the callbacks on the filtered set of results - for i, (key, start, end) in enumerate(final_matches): + for i, (key, *_) in enumerate(final_matches): on_match = self._callbacks.get(key, None) if on_match is not None: on_match(self, doc, i, final_matches) - if as_spans: - return [Span(doc, start, end, label=key) for key, start, end in final_matches] - else: - return final_matches + return final_results def _normalize_key(self, key): if isinstance(key, basestring): @@ -286,20 +326,22 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0): """Find matches in a doc, with a compiled array of patterns. Matches are - returned as a list of (id, start, end) tuples. + returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) To augment the compiled patterns, we optionally also take two Python lists. The "predicates" list contains functions that take a Python list and return a boolean value. It's mostly used for regular expressions. - The "extra_getters" list contains functions that take a Python list and return + The "extensions" list contains functions that take a Python list and return an attr ID. It's mostly used for extension attributes. """ cdef vector[PatternStateC] states cdef vector[MatchC] matches + cdef vector[vector[MatchAlignmentC]] align_states + cdef vector[vector[MatchAlignmentC]] align_matches cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() @@ -326,12 +368,14 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e for i in range(length): for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) - transition_states(states, matches, predicate_cache, - doclike[i], extra_attr_values, predicates) + if with_alignments != 0: + align_states.resize(states.size()) + transition_states(states, matches, align_states, align_matches, predicate_cache, + doclike[i], extra_attr_values, predicates, with_alignments) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns - finish_states(matches, states) + finish_states(matches, states, align_matches, align_states, with_alignments) seen = set() for i in range(matches.size()): match = ( @@ -342,17 +386,24 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e # We need to deduplicate, because we could otherwise arrive at the same # match through two paths, e.g. .?.? matching 'a'. Are we matching the # first .?, or the second .? -- it doesn't matter, it's just one match. - if match not in seen: - output.append(match) + # Skip 0-length matches. 
(TODO: fix algorithm) + if match not in seen and matches[i].length > 0: + if with_alignments != 0: + # since the length of align_matches equals to that of match, we can share same 'i' + output.append(match + (align_matches[i],)) + else: + output.append(match) seen.add(match) return output cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, + vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: cdef int q = 0 cdef vector[PatternStateC] new_states + cdef vector[vector[MatchAlignmentC]] align_new_states cdef int nr_predicate = len(py_predicates) for i in range(states.size()): if states[i].pattern.nr_py >= 1: @@ -367,23 +418,39 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match # it in the states list, because q doesn't advance. state = states[i] states[q] = state + # Separate from states, performance is guaranteed for users who only need basic options (without alignments). + # `align_states` always corresponds to `states` 1:1. + if with_alignments != 0: + align_state = align_states[i] + align_states[q] = align_state while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND): + # Update alignment before the transition of current state + # 'MatchAlignmentC' maps 'original token index of current pattern' to 'current matching length' + if with_alignments != 0: + align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) if action == RETRY_EXTEND: # This handles the 'extend' new_states.push_back( PatternStateC(pattern=states[q].pattern, start=state.start, length=state.length+1)) + if with_alignments != 0: + align_new_states.push_back(align_states[q]) if action == RETRY_ADVANCE: # This handles the 'advance' new_states.push_back( PatternStateC(pattern=states[q].pattern+1, start=state.start, length=state.length+1)) + if with_alignments != 0: + align_new_states.push_back(align_states[q]) states[q].pattern += 1 if states[q].pattern.nr_py != 0: update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) action = get_action(states[q], token.c, extra_attrs, cached_py_predicates) + # Update alignment before the transition of current state + if with_alignments != 0: + align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) if action == REJECT: pass elif action == ADVANCE: @@ -396,29 +463,50 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match matches.push_back( MatchC(pattern_id=ent_id, start=state.start, length=state.length+1)) + # `align_matches` always corresponds to `matches` 1:1 + if with_alignments != 0: + align_matches.push_back(align_states[q]) elif action == MATCH_DOUBLE: # push match without last token if length > 0 if state.length > 0: matches.push_back( MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + # MATCH_DOUBLE emits matches twice, + # add one more to align_matches in order to keep 1:1 relationship + if with_alignments != 0: + align_matches.push_back(align_states[q]) # push match with last token matches.push_back( MatchC(pattern_id=ent_id, start=state.start, length=state.length+1)) + # `align_matches` always corresponds to `matches` 1:1 + if with_alignments != 0: + align_matches.push_back(align_states[q]) elif action == MATCH_REJECT: matches.push_back( 
MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + # `align_matches` always corresponds to `matches` 1:1 + if with_alignments != 0: + align_matches.push_back(align_states[q]) elif action == MATCH_EXTEND: matches.push_back( MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + # `align_matches` always corresponds to `matches` 1:1 + if with_alignments != 0: + align_matches.push_back(align_states[q]) states[q].length += 1 q += 1 states.resize(q) for i in range(new_states.size()): states.push_back(new_states[i]) + # `align_states` always corresponds to `states` 1:1 + if with_alignments != 0: + align_states.resize(q) + for i in range(align_new_states.size()): + align_states.push_back(align_new_states[i]) cdef int update_predicate_cache(int8_t* cache, @@ -441,15 +529,27 @@ cdef int update_predicate_cache(int8_t* cache, raise ValueError(Errors.E125.format(value=result)) -cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *: +cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, + vector[vector[MatchAlignmentC]]& align_matches, + vector[vector[MatchAlignmentC]]& align_states, + bint with_alignments) except *: """Handle states that end in zero-width patterns.""" cdef PatternStateC state + cdef vector[MatchAlignmentC] align_state for i in range(states.size()): state = states[i] + if with_alignments != 0: + align_state = align_states[i] while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): + # Update alignment before the transition of current state + if with_alignments != 0: + align_state.push_back(MatchAlignmentC(state.pattern.token_idx, state.length)) is_final = get_is_final(state) if is_final: ent_id = get_ent_id(state.pattern) + # `align_matches` always corresponds to `matches` 1:1 + if with_alignments != 0: + align_matches.push_back(align_state) matches.push_back( MatchC(pattern_id=ent_id, start=state.start, length=state.length)) break @@ -604,7 +704,7 @@ cdef int8_t get_quantifier(PatternStateC state) nogil: cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL: pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) cdef int i, index - for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs): + for i, (quantifier, spec, extensions, predicates, token_idx) in enumerate(token_specs): pattern[i].quantifier = quantifier # Ensure attrs refers to a null pointer if nr_attr == 0 if len(spec) > 0: @@ -625,6 +725,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) pattern[i].py_predicates[j] = index pattern[i].nr_py = len(predicates) pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) + pattern[i].token_idx = token_idx i = len(token_specs) # Use quantifier to identify final ID pattern node (rather than previous # uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs) @@ -635,6 +736,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) pattern[i].nr_attr = 1 pattern[i].nr_extra_attr = 0 pattern[i].nr_py = 0 + pattern[i].token_idx = -1 return pattern @@ -652,7 +754,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. 
- We need to split the pattern up into three parts: + We need to split the pattern up into four parts: * Normal attribute/value pairs, which are stored on either the token or lexeme, can be handled directly. * Extension attributes are handled specially, as we need to prefetch the @@ -661,13 +763,14 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): functions and store them. So we store these specially as well. * Extension attributes that have extra predicates are stored within the extra_predicates. + * Token index that this pattern belongs to. """ tokens = [] string_store = vocab.strings - for spec in token_specs: + for token_idx, spec in enumerate(token_specs): if not spec: # Signifier for 'any token' - tokens.append((ONE, [(NULL_ATTR, 0)], [], [])) + tokens.append((ONE, [(NULL_ATTR, 0)], [], [], token_idx)) continue if not isinstance(spec, dict): raise ValueError(Errors.E154.format()) @@ -676,7 +779,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): extensions = _get_extensions(spec, string_store, extensions_table) predicates = _get_extra_predicates(spec, extra_predicates, vocab) for op in ops: - tokens.append((op, list(attr_values), list(extensions), list(predicates))) + tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -737,7 +840,7 @@ class _RegexPredicate: class _SetPredicate: - operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET") + operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): self.i = i @@ -760,14 +863,16 @@ class _SetPredicate: else: value = get_token_attr_for_matcher(token.c, self.attr) - if self.predicate in ("IS_SUBSET", "IS_SUPERSET"): + if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"): if self.attr == MORPH: # break up MORPH into individual Feat=Val values value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value)) else: - # IS_SUBSET for other attrs will be equivalent to "IN" - # IS_SUPERSET will only match for other attrs with 0 or 1 values - value = set([value]) + # treat a single value as a list + if isinstance(value, (str, int)): + value = set([get_string_id(value)]) + else: + value = set(get_string_id(v) for v in value) if self.predicate == "IN": return value in self.value elif self.predicate == "NOT_IN": @@ -776,6 +881,8 @@ class _SetPredicate: return value <= self.value elif self.predicate == "IS_SUPERSET": return value >= self.value + elif self.predicate == "INTERSECTS": + return bool(value & self.value) def __repr__(self): return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate)) @@ -820,6 +927,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): "NOT_IN": _SetPredicate, "IS_SUBSET": _SetPredicate, "IS_SUPERSET": _SetPredicate, + "INTERSECTS": _SetPredicate, "==": _ComparisonPredicate, "!=": _ComparisonPredicate, ">=": _ComparisonPredicate, diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index 3b42f3fab..1bdc19012 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -18,4 +18,4 @@ cdef class PhraseMatcher: cdef Pool mem cdef key_t _terminal_hash - cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil + cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi new file mode 100644 index 
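`_SetPredicate` above gains an `INTERSECTS` operator: the pattern matches when the token's value set and the pattern's value set share at least one member, which is mainly useful for list-valued attributes such as `MORPH`. A sketch; the `en_core_web_sm` pipeline is an assumption, any pipeline that sets morphological features will do:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")   # assumption: sets MORPH features
matcher = Matcher(nlp.vocab)

# match tokens whose morph features overlap with the given set
pattern = [{"MORPH": {"INTERSECTS": ["Number=Plur", "Number=Sing"]}}]
matcher.add("HAS_NUMBER", [pattern])

doc = nlp("These cats sleep")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text, doc[start].morph)
```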
000000000..d73633ec0 --- /dev/null +++ b/spacy/matcher/phrasematcher.pyi @@ -0,0 +1,25 @@ +from typing import List, Tuple, Union, Optional, Callable, Any, Dict + +from . import Matcher +from ..vocab import Vocab +from ..tokens import Doc, Span + +class PhraseMatcher: + def __init__( + self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ... + ) -> None: ... + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: bool = ..., + ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + def add( + self, + key: str, + docs: List[List[Dict[str, Any]]], + *, + on_match: Optional[ + Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] + ] = ..., + ) -> None: ... diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 7e99859b5..2ff5105ad 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -5,6 +5,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs import IDS from ..structs cimport TokenC from ..tokens.token cimport Token from ..tokens.span cimport Span @@ -19,8 +20,8 @@ cdef class PhraseMatcher: sequences based on lists of token descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc` objects. - DOCS: https://nightly.spacy.io/api/phrasematcher - USAGE: https://nightly.spacy.io/usage/rule-based-matching#phrasematcher + DOCS: https://spacy.io/api/phrasematcher + USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher Adapted from FlashText: https://github.com/vi3k6i5/flashtext MIT License (see `LICENSE`) @@ -34,7 +35,7 @@ cdef class PhraseMatcher: attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. - DOCS: https://nightly.spacy.io/api/phrasematcher#init + DOCS: https://spacy.io/api/phrasematcher#init """ self.vocab = vocab self._callbacks = {} @@ -49,19 +50,23 @@ cdef class PhraseMatcher: if isinstance(attr, (int, long)): self.attr = attr else: + if attr is None: + attr = "ORTH" attr = attr.upper() if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr.lower() not in TokenPattern().dict(): raise ValueError(Errors.E152.format(attr=attr)) - self.attr = self.vocab.strings[attr] + self.attr = IDS.get(attr) def __len__(self): """Get the number of match IDs added to the matcher. RETURNS (int): The number of rules. - DOCS: https://nightly.spacy.io/api/phrasematcher#len + DOCS: https://spacy.io/api/phrasematcher#len """ return len(self._callbacks) @@ -71,7 +76,7 @@ cdef class PhraseMatcher: key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. - DOCS: https://nightly.spacy.io/api/phrasematcher#contains + DOCS: https://spacy.io/api/phrasematcher#contains """ return key in self._callbacks @@ -85,7 +90,7 @@ cdef class PhraseMatcher: key (str): The match ID. - DOCS: https://nightly.spacy.io/api/phrasematcher#remove + DOCS: https://spacy.io/api/phrasematcher#remove """ if key not in self._docs: raise KeyError(key) @@ -152,9 +157,8 @@ cdef class PhraseMatcher: """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID key, an on_match callback, and one or more patterns. - As of spaCy v2.2.2, PhraseMatcher.add supports the future API, which - makes the patterns the second argument and a list (instead of a variable - number of arguments). 
The on_match callback becomes an optional keyword + Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the + second argument, with the on_match callback as an optional keyword argument. key (str): The match ID. @@ -164,7 +168,7 @@ cdef class PhraseMatcher: as variable arguments. Will be ignored if a list of patterns is provided as the second argument. - DOCS: https://nightly.spacy.io/api/phrasematcher#add + DOCS: https://spacy.io/api/phrasematcher#add """ if docs is None or hasattr(docs, "__call__"): # old API on_match = docs @@ -191,7 +195,7 @@ cdef class PhraseMatcher: if attr == TAG: pipe = "tagger" elif attr in (POS, MORPH): - pipe = "morphologizer" + pipe = "morphologizer or tagger+attribute_ruler" elif attr == LEMMA: pipe = "lemmatizer" elif attr == DEP: @@ -227,10 +231,10 @@ cdef class PhraseMatcher: result = internal_node map_set(self.mem, result, self.vocab.strings[key], NULL) - def __call__(self, doc, *, as_spans=False): + def __call__(self, object doclike, *, as_spans=False): """Find all sequences matching the supplied patterns on the `Doc`. - doc (Doc): The document to match over. + doclike (Doc or Span): The document to match over. as_spans (bool): Return Span objects with labels instead of (match_id, start, end) tuples. RETURNS (list): A list of `(match_id, start, end)` tuples, @@ -238,15 +242,25 @@ cdef class PhraseMatcher: `doc[start:end]`. The `match_id` is an integer. If as_spans is set to True, a list of Span objects is returned. - DOCS: https://nightly.spacy.io/api/phrasematcher#call + DOCS: https://spacy.io/api/phrasematcher#call """ matches = [] - if doc is None or len(doc) == 0: + if doclike is None or len(doclike) == 0: # if doc is empty or None just return empty list return matches + if isinstance(doclike, Doc): + doc = doclike + start_idx = 0 + end_idx = len(doc) + elif isinstance(doclike, Span): + doc = doclike.doc + start_idx = doclike.start + end_idx = doclike.end + else: + raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef vector[SpanC] c_matches - self.find_matches(doc, &c_matches) + self.find_matches(doc, start_idx, end_idx, &c_matches) for i in range(c_matches.size()): matches.append((c_matches[i].label, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): @@ -258,17 +272,17 @@ cdef class PhraseMatcher: else: return matches - cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil: + cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil: cdef MapStruct* current_node = self.c_map cdef int start = 0 - cdef int idx = 0 - cdef int idy = 0 + cdef int idx = start_idx + cdef int idy = start_idx cdef key_t key cdef void* value cdef int i = 0 cdef SpanC ms cdef void* result - while idx < doc.length: + while idx < end_idx: start = idx token = Token.get_struct_attr(&doc.c[idx], self.attr) # look for sequences from this position @@ -276,7 +290,7 @@ cdef class PhraseMatcher: if result: current_node = result idy = idx + 1 - while idy < doc.length: + while idy < end_idx: result = map_get(current_node, self._terminal_hash) if result: i = 0 diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index c382d915b..fce8ae5af 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1 +1,2 @@ +from .callbacks import create_models_with_nvtx_range # noqa: F401 from .models import * # noqa: F401, F403 diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index f5c539c42..e46735102 100644 --- 
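`PhraseMatcher.__call__` above now also accepts a `Span`, restricting matching to that window via `start_idx`/`end_idx`. A sketch (texts are illustrative; note that the returned offsets index into the underlying `Doc`):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("CITY", [nlp.make_doc("New York"), nlp.make_doc("Berlin")])

doc = nlp("She moved from Berlin to New York last year")
window = doc[3:7]                    # "Berlin to New York"

# only tokens inside the Span are considered; offsets refer to the Doc
for match_id, start, end in matcher(window):
    print(doc[start:end].text)       # Berlin / New York
```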
a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -3,8 +3,10 @@ from thinc.api import Model from thinc.types import Floats2d from ..tokens import Doc +from ..util import registry +@registry.layers("spacy.CharEmbed.v1") def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: # nM: Number of dimensions per character. nC: Number of characters. return Model( @@ -42,7 +44,7 @@ def forward(model: Model, docs: List[Doc], is_train: bool): # Let's say I have a 2d array of indices, and a 3d table of data. What numpy # incantation do I chant to get # output[i, j, k] == data[j, ids[i, j], k]? - doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]] + doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]] # type: ignore[call-overload, index] output.append(doc_vectors.reshape((len(doc), nO))) ids.append(doc_ids) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py new file mode 100644 index 000000000..b0d088182 --- /dev/null +++ b/spacy/ml/callbacks.py @@ -0,0 +1,39 @@ +from functools import partial +from typing import Type, Callable, TYPE_CHECKING + +from thinc.layers import with_nvtx_range +from thinc.model import Model, wrap_model_recursive + +from ..util import registry + +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. without causing circular imports + from ..language import Language # noqa: F401 + + +@registry.callbacks("spacy.models_with_nvtx_range.v1") +def create_models_with_nvtx_range( + forward_color: int = -1, backprop_color: int = -1 +) -> Callable[["Language"], "Language"]: + def models_with_nvtx_range(nlp): + pipes = [ + pipe + for _, pipe in nlp.components + if hasattr(pipe, "is_trainable") and pipe.is_trainable + ] + + # We need process all models jointly to avoid wrapping callbacks twice. 
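The `spacy.models_with_nvtx_range.v1` callback wraps every trainable component's model with NVTX ranges so forward and backward passes show up in GPU profiler traces; in a config it would typically be wired up through the `nlp` callbacks. A hand-rolled sketch, assuming a CUDA/NVTX-capable environment and an installed pipeline (the color values are arbitrary):

```python
import spacy
from spacy.util import registry

# resolve the callback registered above and build it with arbitrary colors
create_callback = registry.callbacks.get("spacy.models_with_nvtx_range.v1")
add_nvtx_ranges = create_callback(forward_color=-1, backprop_color=-1)

nlp = spacy.load("en_core_web_sm")   # assumption: any pipeline with trainable pipes
nlp = add_nvtx_ranges(nlp)           # models are now annotated for nsys/nvprof traces
```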
+ models = Model( + "wrap_with_nvtx_range", + forward=lambda model, X, is_train: ..., + layers=[pipe.model for pipe in pipes], + ) + + for node in models.walk(): + with_nvtx_range( + node, forward_color=forward_color, backprop_color=backprop_color + ) + + return nlp + + return models_with_nvtx_range diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index 7e1cce884..c9c82f369 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -1,10 +1,12 @@ from thinc.api import Model +from ..util import registry from ..attrs import LOWER +@registry.layers("spacy.extract_ngrams.v1") def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: - model = Model("extract_ngrams", forward) + model: Model = Model("extract_ngrams", forward) model.attrs["ngram_size"] = ngram_size model.attrs["attr"] = attr return model @@ -17,7 +19,7 @@ def forward(model: Model, docs, is_train: bool): unigrams = model.ops.asarray(doc.to_array([model.attrs["attr"]])) ngrams = [unigrams] for n in range(2, model.attrs["ngram_size"] + 1): - ngrams.append(model.ops.ngrams(n, unigrams)) + ngrams.append(model.ops.ngrams(n, unigrams)) # type: ignore[arg-type] keys = model.ops.xp.concatenate(ngrams) keys, vals = model.ops.xp.unique(keys, return_counts=True) batch_keys.append(keys) diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py new file mode 100644 index 000000000..9bc972032 --- /dev/null +++ b/spacy/ml/extract_spans.py @@ -0,0 +1,60 @@ +from typing import Tuple, Callable +from thinc.api import Model, to_numpy +from thinc.types import Ragged, Ints1d + +from ..util import registry + + +@registry.layers("spacy.extract_spans.v1") +def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]: + """Extract spans from a sequence of source arrays, as specified by an array + of (start, end) indices. The output is a ragged array of the + extracted spans. + """ + return Model( + "extract_spans", forward, layers=[], refs={}, attrs={}, dims={}, init=init + ) + + +def init(model, X=None, Y=None): + pass + + +def forward( + model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool +) -> Tuple[Ragged, Callable]: + """Get subsequences from source vectors.""" + ops = model.ops + X, spans = source_spans + assert spans.dataXd.ndim == 2 + indices = _get_span_indices(ops, spans, X.lengths) + Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index] + x_shape = X.dataXd.shape + x_lengths = X.lengths + + def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]: + dX = Ragged(ops.alloc2f(*x_shape), x_lengths) + ops.scatter_add(dX.dataXd, indices, dY.dataXd) # type: ignore[arg-type] + return (dX, spans) + + return Y, backprop_windows + + +def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: + """Construct a flat array that has the indices we want to extract from the + source data. For instance, if we want the spans (5, 9), (8, 10) the + indices will be [5, 6, 7, 8, 8, 9]. 
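`extract_spans` slices a ragged batch of token vectors by `(start, end)` indices, yielding one ragged row per span; a tiny numeric sketch (shapes and values chosen purely for illustration):

```python
import numpy
from thinc.api import Ragged
from spacy.ml.extract_spans import extract_spans

# one "document" of 6 two-dimensional token vectors
X = Ragged(numpy.arange(12, dtype="f").reshape(6, 2),
           numpy.asarray([6], dtype="i"))
# two spans over that document: tokens 1..3 and 2..5
spans = Ragged(numpy.asarray([[1, 3], [2, 5]], dtype="i"),
               numpy.asarray([2], dtype="i"))

Y = extract_spans().predict((X, spans))
print(Y.lengths)        # [2 3] -- one length per extracted span
print(Y.dataXd.shape)   # (5, 2)
```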
+ """ + spans, lengths = _ensure_cpu(spans, lengths) + indices = [] + offset = 0 + for i, length in enumerate(lengths): + spans_i = spans[i].dataXd + offset + for j in range(spans_i.shape[0]): + indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index] + offset += length + return ops.flatten(indices) + + +def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: + return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)) diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index f03237019..9b7628f0e 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,7 @@ from .entity_linker import * # noqa from .multi_task import * # noqa from .parser import * # noqa +from .spancat import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index f37203b1b..831fee90f 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,23 +1,27 @@ from pathlib import Path -from typing import Optional, Callable, Iterable +from typing import Optional, Callable, Iterable, List +from thinc.types import Floats2d from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear from ...util import registry from ...kb import KnowledgeBase, Candidate, get_candidates from ...vocab import Vocab +from ...tokens import Span, Doc -@registry.architectures.register("spacy.EntityLinker.v1") -def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: +@registry.architectures("spacy.EntityLinker.v1") +def build_nel_encoder( + tok2vec: Model, nO: Optional[int] = None +) -> Model[List[Doc], Floats2d]: with Model.define_operators({">>": chain, "**": clone}): - token_width = tok2vec.get_dim("nO") + token_width = tok2vec.maybe_get_dim("nO") output_layer = Linear(nO=nO, nI=token_width) model = ( tok2vec >> list2ragged() >> reduce_mean() - >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) + >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type] >> output_layer ) model.set_ref("output_layer", output_layer) @@ -25,7 +29,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: return model -@registry.misc.register("spacy.KBFromFile.v1") +@registry.misc("spacy.KBFromFile.v1") def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) @@ -35,7 +39,7 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: return kb_from_file -@registry.misc.register("spacy.EmptyKB.v1") +@registry.misc("spacy.EmptyKB.v1") def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: def empty_kb_factory(vocab): return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length) @@ -43,6 +47,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: return empty_kb_factory -@registry.misc.register("spacy.CandidateGenerator.v1") -def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: +@registry.misc("spacy.CandidateGenerator.v1") +def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 7c0589bff..37473b7f4 100644 --- 
a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,9 +1,11 @@ -from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING +from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast +from thinc.types import Floats2d from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import MultiSoftmax, list2array from thinc.api import to_categorical, CosineDistance, L2Distance +from thinc.loss import Loss -from ...util import registry +from ...util import registry, OOV_RANK from ...errors import Errors from ...attrs import ID @@ -13,14 +15,16 @@ from functools import partial if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from ...vocab import Vocab # noqa: F401 - from ...tokens import Doc # noqa: F401 + from ...tokens.doc import Doc # noqa: F401 -@registry.architectures.register("spacy.PretrainVectors.v1") +@registry.architectures("spacy.PretrainVectors.v1") def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: + if vocab.vectors.data.shape[1] == 0: + raise ValueError(Errors.E875) model = build_cloze_multi_task_model( vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces ) @@ -28,6 +32,7 @@ def create_pretrain_vectors( return model def create_vectors_loss() -> Callable: + distance: Loss if loss == "cosine": distance = CosineDistance(normalize=True, ignore_zeros=True) return partial(get_vectors_loss, distance=distance) @@ -40,7 +45,7 @@ def create_pretrain_vectors( return create_vectors_objective -@registry.architectures.register("spacy.PretrainCharacters.v1") +@registry.architectures("spacy.PretrainCharacters.v1") def create_pretrain_characters( maxout_pieces: int, hidden_size: int, n_characters: int ) -> Callable[["Vocab", Model], Model]: @@ -68,6 +73,7 @@ def get_vectors_loss(ops, docs, prediction, distance): # and look them up all at once. This prevents data copying. 
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] + target[ids == OOV_RANK] = 0 d_target, loss = distance(prediction, target) return loss, d_target @@ -112,7 +118,7 @@ def build_cloze_multi_task_model( ) -> Model: nO = vocab.vectors.data.shape[1] output_layer = chain( - list2array(), + cast(Model[List["Floats2d"], Floats2d], list2array()), Maxout( nO=hidden_size, nI=tok2vec.get_dim("nO"), @@ -133,10 +139,10 @@ def build_cloze_characters_multi_task_model( vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int ) -> Model: output_layer = chain( - list2array(), - Maxout(hidden_size, nP=maxout_pieces), + cast(Model[List["Floats2d"], Floats2d], list2array()), + Maxout(nO=hidden_size, nP=maxout_pieces), LayerNorm(nI=hidden_size), - MultiSoftmax([256] * nr_char, nI=hidden_size), + MultiSoftmax([256] * nr_char, nI=hidden_size), # type: ignore[arg-type] ) model = build_masked_language_model(vocab, chain(tok2vec, output_layer)) model.set_ref("tok2vec", tok2vec) @@ -168,7 +174,7 @@ def build_masked_language_model( if wrapped.has_dim(dim): model.set_dim(dim, wrapped.get_dim(dim)) - mlm_model = Model( + mlm_model: Model = Model( "masked-language-model", mlm_forward, layers=[wrapped_model], @@ -182,13 +188,19 @@ def build_masked_language_model( class _RandomWords: def __init__(self, vocab: "Vocab") -> None: + # Extract lexeme representations self.words = [lex.text for lex in vocab if lex.prob != 0.0] - self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] self.words = self.words[:10000] - self.probs = self.probs[:10000] - self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) - self.probs /= self.probs.sum() - self._cache = [] + + # Compute normalized lexeme probabilities + probs = [lex.prob for lex in vocab if lex.prob != 0.0] + probs = probs[:10000] + probs: numpy.ndarray = numpy.exp(numpy.array(probs, dtype="f")) + probs /= probs.sum() + self.probs = probs + + # Initialize cache + self._cache: List[int] = [] def next(self) -> str: if not self._cache: @@ -203,7 +215,7 @@ def _apply_mask( docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15 ) -> Tuple[numpy.ndarray, List["Doc"]]: # This needs to be here to avoid circular imports - from ...tokens import Doc # noqa: F811 + from ...tokens.doc import Doc # noqa: F811 N = sum(len(doc) for doc in docs) mask = numpy.random.uniform(0.0, 1.0, (N,)) diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py new file mode 100644 index 000000000..893db2e6d --- /dev/null +++ b/spacy/ml/models/spancat.py @@ -0,0 +1,64 @@ +from typing import List, Tuple, cast +from thinc.api import Model, with_getitem, chain, list2ragged, Logistic +from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init +from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last +from thinc.types import Ragged, Floats2d + +from ...util import registry +from ...tokens import Doc +from ..extract_spans import extract_spans + + +@registry.layers("spacy.LinearLogistic.v1") +def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]: + """An output layer for multi-label classification. It uses a linear layer + followed by a logistic activation. 
+ """ + return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic()) + + +@registry.layers("spacy.mean_max_reducer.v1") +def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]: + """Reduce sequences by concatenating their mean and max pooled vectors, + and then combine the concatenated vectors with a hidden layer. + """ + return chain( + concatenate( + cast(Model[Ragged, Floats2d], reduce_last()), + cast(Model[Ragged, Floats2d], reduce_first()), + reduce_mean(), + reduce_max(), + ), + Maxout(nO=hidden_size, normalize=True, dropout=0.0), + ) + + +@registry.architectures("spacy.SpanCategorizer.v1") +def build_spancat_model( + tok2vec: Model[List[Doc], List[Floats2d]], + reducer: Model[Ragged, Floats2d], + scorer: Model[Floats2d, Floats2d], +) -> Model[Tuple[List[Doc], Ragged], Floats2d]: + """Build a span categorizer model, given a token-to-vector model, a + reducer model to map the sequence of vectors for each span down to a single + vector, and a scorer model to map the vectors to probabilities. + + tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model. + reducer (Model[Ragged, Floats2d]): The reducer model. + scorer (Model[Floats2d, Floats2d]): The scorer model. + """ + model = chain( + cast( + Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]], + with_getitem( + 0, chain(tok2vec, cast(Model[List[Floats2d], Ragged], list2ragged())) + ), + ), + extract_spans(), + reducer, + scorer, + ) + model.set_ref("tok2vec", tok2vec) + model.set_ref("reducer", reducer) + model.set_ref("scorer", scorer) + return model diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 09405214c..9c7fe042d 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -6,7 +6,7 @@ from ...util import registry from ...tokens import Doc -@registry.architectures.register("spacy.Tagger.v1") +@registry.architectures("spacy.Tagger.v1") def build_tagger_model( tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None ) -> Model[List[Doc], List[Floats2d]]: @@ -20,7 +20,7 @@ def build_tagger_model( # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! 
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None output_layer = Softmax(nO, t2v_width, init_W=zero_init) - softmax = with_array(output_layer) + softmax = with_array(output_layer) # type: ignore model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) model.set_ref("softmax", output_layer) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index e0c11ed99..c8c146f02 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,11 +1,13 @@ +from functools import partial from typing import Optional, List from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum -from thinc.api import with_cpu, Relu, residual, LayerNorm +from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable from thinc.layers.chain import init as init_chain +from thinc.layers.resizable import resize_model, resize_linear_weighted from ...attrs import ORTH from ...util import registry @@ -15,7 +17,10 @@ from ...tokens import Doc from .tok2vec import get_tok2vec_width -@registry.architectures.register("spacy.TextCatCNN.v1") +NEG_VALUE = -5000 + + +@registry.architectures("spacy.TextCatCNN.v2") def build_simple_cnn_text_classifier( tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None ) -> Model[List[Doc], Floats2d]: @@ -25,42 +30,79 @@ def build_simple_cnn_text_classifier( outputs sum to 1. If exclusive_classes=False, a logistic non-linearity is applied instead, so that outputs are in the range [0, 1]. """ + fill_defaults = {"b": 0, "W": 0} with Model.define_operators({">>": chain}): cnn = tok2vec >> list2ragged() >> reduce_mean() + nI = tok2vec.maybe_get_dim("nO") if exclusive_classes: - output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = cnn >> output_layer - model.set_ref("output_layer", output_layer) + output_layer = Softmax(nO=nO, nI=nI) + fill_defaults["b"] = NEG_VALUE + resizable_layer: Model = resizable( + output_layer, + resize_layer=partial( + resize_linear_weighted, fill_defaults=fill_defaults + ), + ) + model = cnn >> resizable_layer else: - linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = cnn >> linear_layer >> Logistic() - model.set_ref("output_layer", linear_layer) + output_layer = Linear(nO=nO, nI=nI) + resizable_layer = resizable( + output_layer, + resize_layer=partial( + resize_linear_weighted, fill_defaults=fill_defaults + ), + ) + model = cnn >> resizable_layer >> Logistic() + model.set_ref("output_layer", output_layer) + model.attrs["resize_output"] = partial( + resize_and_set_ref, + resizable_layer=resizable_layer, + ) model.set_ref("tok2vec", tok2vec) - model.set_dim("nO", nO) + model.set_dim("nO", nO) # type: ignore # TODO: remove type ignore once Thinc has been updated model.attrs["multi_label"] = not exclusive_classes return model -@registry.architectures.register("spacy.TextCatBOW.v1") +def resize_and_set_ref(model, new_nO, resizable_layer): + resizable_layer = resize_model(resizable_layer, new_nO) + model.set_ref("output_layer", resizable_layer.layers[0]) + model.set_dim("nO", new_nO, force=True) + return model + + +@registry.architectures("spacy.TextCatBOW.v2") def build_bow_text_classifier( exclusive_classes: bool, ngram_size: int, no_output_layer: bool, nO: Optional[int] = None, ) -> Model[List[Doc], Floats2d]: + fill_defaults = {"b": 0, "W": 0} with 
Model.define_operators({">>": chain}): - sparse_linear = SparseLinear(nO) - model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear - model = with_cpu(model, model.ops) + sparse_linear = SparseLinear(nO=nO) + output_layer = None if not no_output_layer: + fill_defaults["b"] = NEG_VALUE output_layer = softmax_activation() if exclusive_classes else Logistic() + resizable_layer = resizable( # type: ignore[var-annotated] + sparse_linear, + resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults), + ) + model = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer + model = with_cpu(model, model.ops) + if output_layer: model = model >> with_cpu(output_layer, output_layer.ops) + model.set_dim("nO", nO) # type: ignore[arg-type] model.set_ref("output_layer", sparse_linear) model.attrs["multi_label"] = not exclusive_classes + model.attrs["resize_output"] = partial( + resize_and_set_ref, resizable_layer=resizable_layer + ) return model -@registry.architectures.register("spacy.TextCatEnsemble.v2") +@registry.architectures("spacy.TextCatEnsemble.v2") def build_text_classifier_v2( tok2vec: Model[List[Doc], List[Floats2d]], linear_model: Model[List[Doc], Floats2d], @@ -69,9 +111,7 @@ def build_text_classifier_v2( exclusive_classes = not linear_model.attrs["multi_label"] with Model.define_operators({">>": chain, "|": concatenate}): width = tok2vec.maybe_get_dim("nO") - attention_layer = ParametricAttention( - width - ) # TODO: benchmark performance difference of this layer + attention_layer = ParametricAttention(width) maxout_layer = Maxout(nO=width, nI=width) norm_layer = LayerNorm(nI=width) cnn_model = ( @@ -90,14 +130,14 @@ def build_text_classifier_v2( model = (linear_model | cnn_model) >> output_layer model.set_ref("tok2vec", tok2vec) if model.has_dim("nO") is not False: - model.set_dim("nO", nO) + model.set_dim("nO", nO) # type: ignore[arg-type] model.set_ref("output_layer", linear_model.get_ref("output_layer")) model.set_ref("attention_layer", attention_layer) model.set_ref("maxout_layer", maxout_layer) model.set_ref("norm_layer", norm_layer) model.attrs["multi_label"] = not exclusive_classes - model.init = init_ensemble_textcat + model.init = init_ensemble_textcat # type: ignore[assignment] return model @@ -107,11 +147,12 @@ def init_ensemble_textcat(model, X, Y) -> Model: model.get_ref("maxout_layer").set_dim("nO", tok2vec_width) model.get_ref("maxout_layer").set_dim("nI", tok2vec_width) model.get_ref("norm_layer").set_dim("nI", tok2vec_width) + model.get_ref("norm_layer").set_dim("nO", tok2vec_width) init_chain(model, X, Y) return model -@registry.architectures.register("spacy.TextCatLowData.v1") +@registry.architectures("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( width: int, dropout: Optional[float], nO: Optional[int] = None ) -> Model[List[Doc], Floats2d]: @@ -123,7 +164,7 @@ def build_text_classifier_lowdata( >> list2ragged() >> ParametricAttention(width) >> reduce_sum() - >> residual(Relu(width, width)) ** 2 + >> residual(Relu(width, width)) ** 2 # type: ignore[arg-type] >> Linear(nO, width) ) if dropout: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index c4bd6b0d7..8d78e418f 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,5 +1,5 @@ -from typing import Optional, List, Union -from thinc.types import Floats2d +from typing import Optional, List, Union, cast +from thinc.types import Floats2d, Ints2d, Ragged from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api 
import Model, noop, list2ragged, ragged2list, HashEmbed from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM @@ -14,7 +14,7 @@ from ...pipeline.tok2vec import Tok2VecListener from ...attrs import intify_attr -@registry.architectures.register("spacy.Tok2VecListener.v1") +@registry.architectures("spacy.Tok2VecListener.v1") def tok2vec_listener_v1(width: int, upstream: str = "*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec @@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model): return nO -@registry.architectures.register("spacy.HashEmbedCNN.v1") +@registry.architectures("spacy.HashEmbedCNN.v2") def build_hash_embed_cnn_tok2vec( *, width: int, @@ -87,7 +87,7 @@ def build_hash_embed_cnn_tok2vec( ) -@registry.architectures.register("spacy.Tok2Vec.v2") +@registry.architectures("spacy.Tok2Vec.v2") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], encode: Model[List[Floats2d], List[Floats2d]], @@ -108,7 +108,7 @@ def build_Tok2Vec_model( return tok2vec -@registry.architectures.register("spacy.MultiHashEmbed.v1") +@registry.architectures("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, attrs: List[Union[str, int]], @@ -158,31 +158,35 @@ def MultiHashEmbed( embeddings = [make_hash_embed(i) for i in range(len(attrs))] concat_size = width * (len(embeddings) + include_static_vectors) + max_out: Model[Ragged, Ragged] = with_array( + Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) # type: ignore + ) if include_static_vectors: + feature_extractor: Model[List[Doc], Ragged] = chain( + FeatureExtractor(attrs), + cast(Model[List[Ints2d], Ragged], list2ragged()), + with_array(concatenate(*embeddings)), + ) model = chain( concatenate( - chain( - FeatureExtractor(attrs), - list2ragged(), - with_array(concatenate(*embeddings)), - ), + feature_extractor, StaticVectors(width, dropout=0.0), ), - with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), - ragged2list(), + max_out, + cast(Model[Ragged, List[Floats2d]], ragged2list()), ) else: model = chain( FeatureExtractor(list(attrs)), - list2ragged(), + cast(Model[List[Ints2d], Ragged], list2ragged()), with_array(concatenate(*embeddings)), - with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), - ragged2list(), + max_out, + cast(Model[Ragged, List[Floats2d]], ragged2list()), ) return model -@registry.architectures.register("spacy.CharacterEmbed.v1") +@registry.architectures("spacy.CharacterEmbed.v2") def CharacterEmbed( width: int, rows: int, @@ -220,42 +224,46 @@ def CharacterEmbed( """ feature = intify_attr(feature) if feature is None: - raise ValueError(Errors.E911(feat=feature)) + raise ValueError(Errors.E911.format(feat=feature)) + char_embed = chain( + _character_embed.CharacterEmbed(nM=nM, nC=nC), + cast(Model[List[Floats2d], Ragged], list2ragged()), + ) + feature_extractor: Model[List[Doc], Ragged] = chain( + FeatureExtractor([feature]), + cast(Model[List[Ints2d], Ragged], list2ragged()), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore + ) + max_out: Model[Ragged, Ragged] if include_static_vectors: + max_out = with_array( + Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) # type: ignore + ) model = chain( concatenate( - chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), - chain( - FeatureExtractor([feature]), - list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), - ), + char_embed, + feature_extractor, StaticVectors(width, dropout=0.0), 
), - with_array( - Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) - ), - ragged2list(), + max_out, + cast(Model[Ragged, List[Floats2d]], ragged2list()), ) else: + max_out = with_array( + Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) # type: ignore + ) model = chain( concatenate( - chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), - chain( - FeatureExtractor([feature]), - list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), - ), + char_embed, + feature_extractor, ), - with_array( - Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) - ), - ragged2list(), + max_out, + cast(Model[Ragged, List[Floats2d]], ragged2list()), ) return model -@registry.architectures.register("spacy.MaxoutWindowEncoder.v2") +@registry.architectures("spacy.MaxoutWindowEncoder.v2") def MaxoutWindowEncoder( width: int, window_size: int, maxout_pieces: int, depth: int ) -> Model[List[Floats2d], List[Floats2d]]: @@ -281,13 +289,13 @@ def MaxoutWindowEncoder( normalize=True, ), ) - model = clone(residual(cnn), depth) + model = clone(residual(cnn), depth) # type: ignore[arg-type] model.set_dim("nO", width) receptive_field = window_size * depth - return with_array(model, pad=receptive_field) + return with_array(model, pad=receptive_field) # type: ignore[arg-type] -@registry.architectures.register("spacy.MishWindowEncoder.v2") +@registry.architectures("spacy.MishWindowEncoder.v2") def MishWindowEncoder( width: int, window_size: int, depth: int ) -> Model[List[Floats2d], List[Floats2d]]: @@ -305,12 +313,12 @@ def MishWindowEncoder( expand_window(window_size=window_size), Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), ) - model = clone(residual(cnn), depth) + model = clone(residual(cnn), depth) # type: ignore[arg-type] model.set_dim("nO", width) - return with_array(model) + return with_array(model) # type: ignore[arg-type] -@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") +@registry.architectures("spacy.TorchBiLSTMEncoder.v1") def BiLSTMEncoder( width: int, depth: int, dropout: float ) -> Model[List[Floats2d], List[Floats2d]]: diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index ea4c7fb77..53ef01906 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -8,7 +8,7 @@ from ..tokens import Doc from ..errors import Errors -@registry.layers("spacy.StaticVectors.v1") +@registry.layers("spacy.StaticVectors.v2") def StaticVectors( nO: Optional[int] = None, nM: Optional[int] = None, @@ -38,7 +38,7 @@ def forward( return _handle_empty(model.ops, model.get_dim("nO")) key_attr = model.attrs["key_attr"] W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) - V = cast(Floats2d, docs[0].vocab.vectors.data) + V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data)) rows = model.ops.flatten( [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] ) @@ -46,8 +46,10 @@ def forward( vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True) except ValueError: raise RuntimeError(Errors.E896) + # Convert negative indices to 0-vectors (TODO: more options for UNK tokens) + vectors_data[rows < 0] = 0 output = Ragged( - vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") + vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore ) mask = None if is_train: @@ -60,7 +62,9 @@ def forward( d_output.data *= mask model.inc_grad( "W", - model.ops.gemm(d_output.data, 
model.ops.as_contig(V[rows]), trans1=True), + model.ops.gemm( + cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True + ), ) return [] @@ -95,4 +99,7 @@ def _handle_empty(ops: Ops, nO: int): def _get_drop_mask(ops: Ops, nO: int, rate: Optional[float]) -> Optional[Floats1d]: - return ops.get_dropout_mask((nO,), rate) if rate is not None else None + if rate is not None: + mask = ops.get_dropout_mask((nO,), rate) + return mask # type: ignore + return None diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 207f4bd5d..10d263851 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -3,12 +3,14 @@ from thinc.api import Ops, Model, normal_init, chain, list2array, Linear from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d import numpy from ..tokens.doc import Doc +from ..util import registry TransitionSystem = Any # TODO State = Any # TODO +@registry.layers("spacy.TransitionModel.v2") def TransitionModel( *, tok2vec: Model[List[Doc], List[Floats2d]], diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 4fe8f7428..8d449d065 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,14 +1,11 @@ from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap, PreshMapArray -from libc.stdint cimport uint64_t -from murmurhash cimport mrmr +from preshed.maps cimport PreshMap cimport numpy as np +from libc.stdint cimport uint64_t -from .structs cimport TokenC, MorphAnalysisC +from .structs cimport MorphAnalysisC from .strings cimport StringStore -from .typedefs cimport hash_t, attr_t, flags_t -from .parts_of_speech cimport univ_pos_t -from . cimport symbols +from .typedefs cimport attr_t, hash_t cdef class Morphology: @@ -16,14 +13,6 @@ cdef class Morphology: cdef readonly StringStore strings cdef PreshMap tags # Keyed by hash, value is pointer to tag - cdef public object lemmatizer - cdef readonly object tag_map - cdef readonly object tag_names - cdef readonly object reverse_index - cdef readonly object _exc - cdef readonly PreshMapArray _cache - cdef readonly int n_tags - cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * cdef int insert(self, MorphAnalysisC tag) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index e8469223a..c3ffc46a1 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,20 +1,11 @@ # cython: infer_types -from libc.string cimport memset - -import srsly -from collections import Counter import numpy import warnings -from .attrs cimport POS, IS_SPACE -from .parts_of_speech cimport SPACE -from .lexeme cimport Lexeme +from .attrs cimport POS -from .strings import get_string_id -from .attrs import LEMMA, intify_attrs from .parts_of_speech import IDS as POS_IDS -from .errors import Errors, Warnings -from .util import ensure_path +from .errors import Warnings from . import symbols diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index d0362e7e1..245747061 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING +from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING from wasabi import msg from .tokens import Doc, Token, Span @@ -67,7 +67,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]: RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires", mapped to a list of component names. 
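The pipe_analysis changes above mostly tighten the type annotations; to make the shape of the returned dict concrete, a small usage sketch (the chosen components are illustrative):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")
analysis = nlp.analyze_pipes(pretty=False)
# "summary" holds each component's declared meta, "problems" maps a component
# to the attributes it requires that no earlier component assigns (the entity
# linker's need for doc.ents would typically show up here), and "attrs" is the
# reverse index produced by get_attr_info above.
print(analysis["problems"])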
""" - result = {"assigns": [], "requires": []} + result: Dict[str, List[str]] = {"assigns": [], "requires": []} for pipe_name in nlp.pipe_names: meta = nlp.get_pipe_meta(pipe_name) if attr in meta.assigns: @@ -79,7 +79,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]: def analyze_pipes( nlp: "Language", *, keys: List[str] = DEFAULT_KEYS -) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: +) -> Dict[str, Dict[str, Union[List[str], Dict]]]: """Print a formatted summary for the current nlp object's pipeline. Shows a table with the pipeline components and why they assign and require, as well as any problems if available. @@ -88,8 +88,11 @@ def analyze_pipes( keys (List[str]): The meta keys to show in the table. RETURNS (dict): A dict with "summary" and "problems". """ - result = {"summary": {}, "problems": {}} - all_attrs = set() + result: Dict[str, Dict[str, Union[List[str], Dict]]] = { + "summary": {}, + "problems": {}, + } + all_attrs: Set[str] = set() for i, name in enumerate(nlp.pipe_names): meta = nlp.get_pipe_meta(name) all_attrs.update(meta.assigns) @@ -102,19 +105,18 @@ def analyze_pipes( prev_meta = nlp.get_pipe_meta(prev_name) for annot in prev_meta.assigns: requires[annot] = True - result["problems"][name] = [] - for annot, fulfilled in requires.items(): - if not fulfilled: - result["problems"][name].append(annot) + result["problems"][name] = [ + annot for annot, fulfilled in requires.items() if not fulfilled + ] result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs} return result def print_pipe_analysis( - analysis: Dict[str, Union[List[str], Dict[str, List[str]]]], + analysis: Dict[str, Dict[str, Union[List[str], Dict]]], *, keys: List[str] = DEFAULT_KEYS, -) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: +) -> None: """Print a formatted version of the pipe analysis produced by analyze_pipes. analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis. 
@@ -122,7 +124,7 @@ def print_pipe_analysis( """ msg.divider("Pipeline Overview") header = ["#", "Component", *[key.capitalize() for key in keys]] - summary = analysis["summary"].items() + summary: ItemsView = analysis["summary"].items() body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)] msg.table(body, header=header, divider=True, multiline=True) n_problems = sum(len(p) for p in analysis["problems"].values()) diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 1fa53a556..7b483724c 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -11,6 +11,7 @@ from .senter import SentenceRecognizer from .sentencizer import Sentencizer from .tagger import Tagger from .textcat import TextCategorizer +from .spancat import SpanCategorizer from .textcat_multilabel import MultiLabel_TextCategorizer from .tok2vec import Tok2Vec from .functions import merge_entities, merge_noun_chunks, merge_subtokens @@ -27,6 +28,7 @@ __all__ = [ "Pipe", "SentenceRecognizer", "Sentencizer", + "SpanCategorizer", "Tagger", "TextCategorizer", "Tok2Vec", diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 7f644a151..45202cb67 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -279,7 +279,7 @@ cdef cppclass StateC: return this._stack.size() int buffer_length() nogil const: - return this.length - this._b_i + return (this.length - this._b_i) + this._rebuffer.size() void push() nogil: b0 = this.B(0) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index b477891f8..cba77dfde 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -614,10 +614,22 @@ cdef class ArcEager(TransitionSystem): actions[LEFT].setdefault('dep', 0) return actions + @property + def builtin_labels(self): + return ["ROOT", "dep"] + @property def action_types(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) + def get_doc_labels(self, doc): + """Get the labels required for a given Doc.""" + labels = set(self.builtin_labels) + for token in doc: + if token.dep_: + labels.add(token.dep_) + return labels + def transition(self, StateClass state, action): cdef Transition t = self.lookup_transition(action) t.do(state.c, t.label) diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index e4e95695c..c88fd35f0 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -1,3 +1,5 @@ +import os +import random from libc.stdint cimport int32_t from cymem.cymem cimport Pool @@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam from ...tokens.doc cimport Doc from ...tokens.span import Span +from ...tokens.span cimport Span from ...typedefs cimport weight_t, attr_t from ...lexeme cimport Lexeme from ...attrs cimport IS_SPACE -from ...structs cimport TokenC +from ...structs cimport TokenC, SpanC from ...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC @@ -25,7 +28,6 @@ cdef enum: LAST UNIT OUT - ISNT N_MOVES @@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I' MOVE_NAMES[LAST] = 'L' MOVE_NAMES[UNIT] = 'U' MOVE_NAMES[OUT] = 'O' -MOVE_NAMES[ISNT] = 'x' cdef struct GoldNERStateC: Transition* ner + SpanC* negs int32_t length + int32_t nr_neg cdef class BiluoGold: cdef Pool mem cdef GoldNERStateC c - def __init__(self, BiluoPushDown moves, StateClass stcls, 
Example example): + def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key): self.mem = Pool() - self.c = create_gold_state(self.mem, moves, stcls.c, example) + self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key) def update(self, StateClass stcls): update_gold_state(&self.c, stcls.c) - cdef GoldNERStateC create_gold_state( Pool mem, BiluoPushDown moves, const StateC* stcls, - Example example + Example example, + neg_key ) except *: cdef GoldNERStateC gs + cdef Span neg + if neg_key is not None: + negs = example.get_aligned_spans_y2x( + example.y.spans.get(neg_key, []), + allow_overlap=True + ) + else: + negs = [] assert example.x.length > 0 gs.ner = mem.alloc(example.x.length, sizeof(Transition)) - ner_tags = example.get_aligned_ner() + gs.negs = mem.alloc(len(negs), sizeof(SpanC)) + gs.nr_neg = len(negs) + ner_ents, ner_tags = example.get_aligned_ents_and_ner() for i, ner_tag in enumerate(ner_tags): gs.ner[i] = moves.lookup_transition(ner_tag) + + # Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label. + neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs} + for pos_span in ner_ents: + if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples: + raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_))) + + # In order to handle negative samples, we need to maintain the full + # (start, end, label) triple. If we break it down to the 'isnt B-LOC' + # thing, we'll get blocked if there's an incorrect prefix. + for i, neg in enumerate(negs): + gs.negs[i] = neg.c return gs @@ -126,6 +151,13 @@ cdef class BiluoPushDown(TransitionSystem): def action_types(self): return (BEGIN, IN, LAST, UNIT, OUT) + def get_doc_labels(self, doc): + labels = set() + for token in doc: + if token.ent_type: + labels.add(token.ent_type_) + return labels + def move_name(self, int move, attr_t label): if move == OUT: return 'O' @@ -149,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem): cdef attr_t label if name == '-' or name == '' or name is None: return Transition(clas=0, move=MISSING, label=0, score=0) - elif name == '!O': - return Transition(clas=0, move=ISNT, label=0, score=0) elif '-' in name: move_str, label_str = name.split('-', 1) - # Hacky way to denote 'not this entity' + # Deprecated, hacky way to denote 'not this entity' if label_str.startswith('!'): - label_str = label_str[1:] - move_str = 'x' + raise ValueError(Errors.E869.format(label=name)) label = self.strings.add(label_str) else: move_str = name label = 0 move = MOVE_NAMES.index(move_str) - if move == ISNT: - return Transition(clas=0, move=ISNT, label=label, score=0) for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] @@ -213,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem): label_id = label_name if action == OUT and label_id != 0: return None - if action == MISSING or action == ISNT: + if action == MISSING: return None # Check we're not creating a move we already have, so that this is # idempotent @@ -240,7 +267,7 @@ cdef class BiluoPushDown(TransitionSystem): for i in range(state.c._ents.size()): ent = state.c._ents.at(i) if ent.start != -1 and ent.end != -1: - ents.append(Span(doc, ent.start, ent.end, label=ent.label)) + ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id)) doc.set_ents(ents, default="unmodified") # Set non-blocked tokens to O 
for i in range(doc.length): @@ -263,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem): return parses def init_gold(self, StateClass state, Example example): - return BiluoGold(self, state, example) + return BiluoGold(self, state, example, self.neg_key) def has_gold(self, Example eg, start=0, end=None): + # We get x and y referring to X, we want to check relative to Y, + # the reference + y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]]) + if not y_spans: + y_spans = [eg.y[:]] + y_span = y_spans[0] + start = y_span.start + end = y_span.end + neg_key = self.neg_key + if neg_key is not None: + # If we have any negative samples, count that as having annotation. + for span in eg.y.spans.get(neg_key, []): + if span.start >= start and span.end <= end: + return True if end is not None and end < 0: end = None for word in eg.y[start:end]: @@ -301,8 +342,6 @@ cdef class BiluoPushDown(TransitionSystem): n_gold += costs[i] <= 0 else: costs[i] = 9000 - if n_gold < 1: - raise ValueError cdef class Missing: @@ -368,23 +407,33 @@ cdef class Begin: @staticmethod cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: gold = _gold - cdef int g_act = gold.ner[s.B(0)].move - cdef attr_t g_tag = gold.ner[s.B(0)].label + b0 = s.B(0) + cdef int cost = 0 + cdef int g_act = gold.ner[b0].move + cdef attr_t g_tag = gold.ner[b0].label if g_act == MISSING: - return 0 + pass elif g_act == BEGIN: # B, Gold B --> Label match - return label != g_tag - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return label == g_tag + cost += label != g_tag else: # B, Gold I --> False (P) # B, Gold L --> False (P) # B, Gold O --> False (P) # B, Gold U --> False (P) - return 1 + cost += 1 + if s.buffer_length() < 3: + # Handle negatives. In general we can't really do much to block + # B, because we don't know whether the whole entity is going to + # be correct or not. However, we can at least tell whether we're + # going to be opening an entity where there's only one possible + # L. + for span in gold.negs[:gold.nr_neg]: + if span.label == label and span.start == b0: + cost += 1 + break + return cost cdef class In: @@ -457,9 +506,6 @@ cdef class In: elif g_act == UNIT: # I, Gold U --> True iff next tag == O return next_act != OUT - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return 0 else: return 1 @@ -499,32 +545,41 @@ cdef class Last: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: gold = _gold move = LAST + b0 = s.B(0) + ent_start = s.E(0) - cdef int g_act = gold.ner[s.B(0)].move - cdef attr_t g_tag = gold.ner[s.B(0)].label + cdef int g_act = gold.ner[b0].move + cdef attr_t g_tag = gold.ner[b0].label + + cdef int cost = 0 if g_act == MISSING: - return 0 + pass elif g_act == BEGIN: # L, Gold B --> True - return 0 + pass elif g_act == IN: # L, Gold I --> True iff this entity sunk - return not _entity_is_sunk(s, gold.ner) + cost += not _entity_is_sunk(s, gold.ner) elif g_act == LAST: # L, Gold L --> True - return 0 + pass elif g_act == OUT: # L, Gold O --> True - return 0 + pass elif g_act == UNIT: # L, Gold U --> True - return 0 - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return 0 + pass else: - return 1 + cost += 1 + # If we have negative-example entities, integrate them into the objective, + # by marking actions that close an entity that we know is incorrect + # as costly. 
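A sketch of how the negative-span costs above get their data during training, assuming the NER factory exposes the incorrect_spans_key setting that this diff threads through the transition system; the key name and example sentence are made up:

import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})
ner.add_label("ORG")

predicted = nlp.make_doc("Apple is opening a store in Paris")
reference = predicted.copy()
# We only know that "Paris" is *not* an ORG. Recording it as a negative span
# under the configured key lets Begin/Last/Unit above penalise any action
# that would close exactly this span with this label.
reference.spans["incorrect_spans"] = [Span(reference, 6, 7, label="ORG")]
example = Example(predicted, reference)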
+ for span in gold.negs[:gold.nr_neg]: + if span.label == label and (span.end-1) == b0 and span.start == ent_start: + cost += 1 + break + return cost cdef class Unit: @@ -568,21 +623,29 @@ cdef class Unit: gold = _gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label + cdef int cost = 0 if g_act == MISSING: - return 0 + pass elif g_act == UNIT: # U, Gold U --> True iff tag match - return label != g_tag - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return label == g_tag + cost += label != g_tag else: # U, Gold B --> False # U, Gold I --> False # U, Gold L --> False # U, Gold O --> False - return 1 + cost += 1 + # If we have negative-example entities, integrate them into the objective. + # This is fairly straight-forward for U- entities, as we have a single + # action + cdef int b0 = s.B(0) + for span in gold.negs[:gold.nr_neg]: + if span.label == label and span.start == b0 and span.end == (b0+1): + cost += 1 + break + return cost + cdef class Out: @@ -608,25 +671,24 @@ cdef class Out: gold = _gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label - - if g_act == ISNT and g_tag == 0: - return 1 - elif g_act == MISSING or g_act == ISNT: - return 0 + cdef weight_t cost = 0 + if g_act == MISSING: + pass elif g_act == BEGIN: # O, Gold B --> False - return 1 + cost += 1 elif g_act == IN: # O, Gold I --> True - return 0 + pass elif g_act == LAST: # O, Gold L --> True - return 0 + pass elif g_act == OUT: # O, Gold O --> True - return 0 + pass elif g_act == UNIT: # O, Gold U --> False - return 1 + cost += 1 else: - return 1 + cost += 1 + return cost diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd index eed347b98..52ebd2b8e 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -41,6 +41,7 @@ cdef class TransitionSystem: cdef public attr_t root_label cdef public freqs cdef public object labels + cdef public object cfg cdef init_state_t init_beam_state cdef del_state_t del_beam_state diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 7632a1993..201128283 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -35,7 +35,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1: cdef class TransitionSystem: - def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None): + def __init__( + self, + StringStore string_table, + labels_by_action=None, + min_freq=None, + incorrect_spans_key=None + ): + self.cfg = {"neg_key": incorrect_spans_key} self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -51,8 +58,13 @@ cdef class TransitionSystem: self.del_beam_state = _del_state def __reduce__(self): + # TODO: This loses the 'cfg' return (self.__class__, (self.strings, self.labels), None, None) + @property + def neg_key(self): + return self.cfg.get("neg_key") + def init_batch(self, docs): cdef StateClass state states = [] @@ -161,8 +173,6 @@ cdef class TransitionSystem: def is_valid(self, StateClass stcls, move_name): action = self.lookup_transition(move_name) - if action.move == 0: - return False return action.is_valid(stcls.c, action.label) cdef int set_valid(self, int* is_valid, const StateC* st) nogil: @@ -249,17 +259,22 @@ cdef class TransitionSystem: 
transitions = [] serializers = { 'moves': lambda: srsly.json_dumps(self.labels), - 'strings': lambda: self.strings.to_bytes() + 'strings': lambda: self.strings.to_bytes(), + 'cfg': lambda: self.cfg } return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, exclude=tuple()): + # We're adding a new field, 'cfg', here and we don't want to break + # previous models that don't have it. + msg = srsly.msgpack_loads(bytes_data) labels = {} - deserializers = { - 'moves': lambda b: labels.update(srsly.json_loads(b)), - 'strings': lambda b: self.strings.from_bytes(b) - } - msg = util.from_bytes(bytes_data, deserializers, exclude) + if 'moves' not in exclude: + labels.update(srsly.json_loads(msg['moves'])) + if 'strings' not in exclude: + self.strings.from_bytes(msg['strings']) + if 'cfg' not in exclude and 'cfg' in msg: + self.cfg.update(msg['cfg']) self.initialize_actions(labels) return self diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 0a34d712a..331eaa4d8 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -32,7 +32,7 @@ class AttributeRuler(Pipe): """Set token-level attributes for tokens matched by Matcher patterns. Additionally supports importing patterns from tag maps and morph rules. - DOCS: https://nightly.spacy.io/api/attributeruler + DOCS: https://spacy.io/api/attributeruler """ def __init__( @@ -48,15 +48,15 @@ class AttributeRuler(Pipe): RETURNS (AttributeRuler): The AttributeRuler component. - DOCS: https://nightly.spacy.io/api/attributeruler#init + DOCS: https://spacy.io/api/attributeruler#init """ self.name = name self.vocab = vocab self.matcher = Matcher(self.vocab, validate=validate) self.validate = validate - self.attrs = [] - self._attrs_unnormed = [] # store for reference - self.indices = [] + self.attrs: List[Dict] = [] + self._attrs_unnormed: List[Dict] = [] # store for reference + self.indices: List[int] = [] def clear(self) -> None: """Reset all patterns.""" @@ -94,14 +94,27 @@ class AttributeRuler(Pipe): doc (Doc): The document to process. RETURNS (Doc): The processed Doc. - DOCS: https://nightly.spacy.io/api/attributeruler#call + DOCS: https://spacy.io/api/attributeruler#call """ - matches = self.matcher(doc, allow_missing=True) - # Sort by the attribute ID, so that later rules have precendence + error_handler = self.get_error_handler() + try: + matches = self.match(doc) + self.set_annotations(doc, matches) + return doc + except Exception as e: + return error_handler(self.name, self, [doc], e) + + def match(self, doc: Doc): + matches = self.matcher(doc, allow_missing=True, as_spans=False) + # Sort by the attribute ID, so that later rules have precedence matches = [ - (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches + (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches # type: ignore ] matches.sort() + return matches + + def set_annotations(self, doc, matches): + """Modify the document in place""" for attr_id, match_id, start, end in matches: span = Span(doc, start, end, label=match_id) attrs = self.attrs[attr_id] @@ -121,7 +134,6 @@ class AttributeRuler(Pipe): ) ) from None set_token_attrs(span[index], attrs) - return doc def load_from_tag_map( self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]] @@ -131,7 +143,7 @@ class AttributeRuler(Pipe): tag_map (dict): The tag map that maps fine-grained tags to coarse-grained tags and morphological features. 
- DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules + DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules """ for tag, attrs in tag_map.items(): pattern = [{"TAG": tag}] @@ -142,7 +154,7 @@ class AttributeRuler(Pipe): else: morph = self.vocab.morphology.add(attrs["MORPH"]) attrs["MORPH"] = self.vocab.strings[morph] - self.add([pattern], attrs) + self.add([pattern], attrs) # type: ignore[list-item] def load_from_morph_rules( self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] @@ -153,7 +165,7 @@ class AttributeRuler(Pipe): fine-grained tags to coarse-grained tags, lemmas and morphological features. - DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules + DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules """ for tag in morph_rules: for word in morph_rules[tag]: @@ -166,7 +178,7 @@ class AttributeRuler(Pipe): elif morph_attrs: morph = self.vocab.morphology.add(morph_attrs) attrs["MORPH"] = self.vocab.strings[morph] - self.add([pattern], attrs) + self.add([pattern], attrs) # type: ignore[list-item] def add( self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0 @@ -181,12 +193,12 @@ class AttributeRuler(Pipe): index (int): The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. - DOCS: https://nightly.spacy.io/api/attributeruler#add + DOCS: https://spacy.io/api/attributeruler#add """ # We need to make a string here, because otherwise the ID we pass back # will be interpreted as the hash of a string, rather than an ordinal. key = str(len(self.attrs)) - self.matcher.add(self.vocab.strings.add(key), patterns) + self.matcher.add(self.vocab.strings.add(key), patterns) # type: ignore[arg-type] self._attrs_unnormed.append(attrs) attrs = normalize_token_attrs(self.vocab, attrs) self.attrs.append(attrs) @@ -199,10 +211,10 @@ class AttributeRuler(Pipe): as the arguments to AttributeRuler.add (patterns/attrs/index) to add as patterns. - DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns + DOCS: https://spacy.io/api/attributeruler#add_patterns """ for p in patterns: - self.add(**p) + self.add(**p) # type: ignore[arg-type] @property def patterns(self) -> List[AttributeRulerPatternType]: @@ -211,10 +223,10 @@ class AttributeRuler(Pipe): for i in range(len(self.attrs)): p = {} p["patterns"] = self.matcher.get(str(i))[1] - p["attrs"] = self._attrs_unnormed[i] - p["index"] = self.indices[i] + p["attrs"] = self._attrs_unnormed[i] # type: ignore + p["index"] = self.indices[i] # type: ignore all_patterns.append(p) - return all_patterns + return all_patterns # type: ignore[return-value] def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. @@ -224,7 +236,7 @@ class AttributeRuler(Pipe): Scorer.score_token_attr for the attributes "tag", "pos", "morph" and "lemma" for the target token attributes. - DOCS: https://nightly.spacy.io/api/tagger#score + DOCS: https://spacy.io/api/tagger#score """ def morph_key_getter(token, attr): @@ -232,7 +244,7 @@ class AttributeRuler(Pipe): validate_examples(examples, "AttributeRuler.score") results = {} - attrs = set() + attrs = set() # type: ignore for token_attrs in self.attrs: attrs.update(token_attrs) for attr in attrs: @@ -261,10 +273,10 @@ class AttributeRuler(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (bytes): The serialized object. 
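For context on the add/add_patterns signatures being annotated above, a short usage sketch (the pattern and attributes are illustrative):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")
# Match "who" followed by "'s" and force the lemma of the second token in the
# matched span (index=1) to "be"; the same rule could be loaded in bulk with
# ruler.add_patterns([{"patterns": ..., "attrs": ..., "index": 1}]).
patterns = [[{"ORTH": "who"}, {"ORTH": "'s"}]]
ruler.add(patterns=patterns, attrs={"LEMMA": "be"}, index=1)
doc = nlp("who's there?")
assert doc[1].lemma_ == "be"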
- DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes + DOCS: https://spacy.io/api/attributeruler#to_bytes """ serialize = {} - serialize["vocab"] = self.vocab.to_bytes + serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns) return util.to_bytes(serialize, exclude) @@ -277,14 +289,14 @@ class AttributeRuler(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. returns (AttributeRuler): The loaded object. - DOCS: https://nightly.spacy.io/api/attributeruler#from_bytes + DOCS: https://spacy.io/api/attributeruler#from_bytes """ def load_patterns(b): self.add_patterns(srsly.msgpack_loads(b)) deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), + "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude), "patterns": load_patterns, } util.from_bytes(bytes_data, deserialize, exclude) @@ -298,10 +310,10 @@ class AttributeRuler(Pipe): path (Union[Path, str]): A path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/attributeruler#to_disk + DOCS: https://spacy.io/api/attributeruler#to_disk """ serialize = { - "vocab": lambda p: self.vocab.to_disk(p), + "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude), "patterns": lambda p: srsly.write_msgpack(p, self.patterns), } util.to_disk(path, serialize, exclude) @@ -315,14 +327,14 @@ class AttributeRuler(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (AttributeRuler): The loaded object. - DOCS: https://nightly.spacy.io/api/attributeruler#from_disk + DOCS: https://spacy.io/api/attributeruler#from_disk """ def load_patterns(p): self.add_patterns(srsly.read_msgpack(p)) deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), + "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude), "patterns": load_patterns, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py index 02ae63925..f9d9d4840 100644 --- a/spacy/pipeline/dep_parser.py +++ b/spacy/pipeline/dep_parser.py @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config +from ._parser_internals.transition_system import TransitionSystem from .transition_parser import Parser from ._parser_internals.arc_eager import ArcEager @@ -23,7 +24,7 @@ hidden_width = 64 maxout_pieces = 2 [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -58,7 +59,7 @@ def make_parser( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, @@ -84,13 +85,13 @@ def make_parser( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (List[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. 
If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. @@ -111,6 +112,9 @@ def make_parser( beam_width=1, beam_density=0.0, beam_update_prob=0.0, + # At some point in the future we can try to implement support for + # partial annotations, perhaps only in the beam objective. + incorrect_spans_key=None ) @@ -140,7 +144,7 @@ def make_beam_parser( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, @@ -165,8 +169,13 @@ def make_beam_parser( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (List[str]): A list of transition names. Inferred from the data if not - provided. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring @@ -196,17 +205,53 @@ def make_beam_parser( multitasks=[], learn_tokens=learn_tokens, min_action_freq=min_action_freq, + # At some point in the future we can try to implement support for + # partial annotations, perhaps only in the beam objective. + incorrect_spans_key=None ) class DependencyParser(Parser): """Pipeline component for dependency parsing. - DOCS: https://nightly.spacy.io/api/dependencyparser + DOCS: https://spacy.io/api/dependencyparser """ TransitionSystem = ArcEager + def __init__( + self, + vocab, + model, + name="parser", + moves=None, + *, + update_with_oracle_cut_size=100, + min_action_freq=30, + learn_tokens=False, + beam_width=1, + beam_density=0.0, + beam_update_prob=0.0, + multitasks=tuple(), + incorrect_spans_key=None, + ): + """Create a DependencyParser. + """ + super().__init__( + vocab, + model, + name, + moves, + update_with_oracle_cut_size=update_with_oracle_cut_size, + min_action_freq=min_action_freq, + learn_tokens=learn_tokens, + beam_width=beam_width, + beam_density=beam_density, + beam_update_prob=beam_update_prob, + multitasks=multitasks, + incorrect_spans_key=incorrect_spans_key, + ) + @property def postprocesses(self): output = [nonproj.deprojectivize] @@ -244,7 +289,7 @@ class DependencyParser(Parser): RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans and Scorer.score_deps. 
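The DependencyParser constructor above now spells out its settings as keyword arguments; through the factory interface the equivalent configuration looks roughly like this (the values mirror the documented defaults):

import spacy

nlp = spacy.blank("en")
parser = nlp.add_pipe(
    "parser",
    config={
        "update_with_oracle_cut_size": 100,
        "learn_tokens": False,
        "min_action_freq": 30,
    },
)
parser.add_label("nsubj")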
- DOCS: https://nightly.spacy.io/api/dependencyparser#score + DOCS: https://spacy.io/api/dependencyparser#score """ def has_sents(doc): @@ -283,3 +328,10 @@ class DependencyParser(Parser): head_scores.append(score_head_dict) label_scores.append(score_label_dict) return head_scores, label_scores + + def _ensure_labels_are_added(self, docs): + # This gives the parser a chance to add labels it's missing for a batch + # of documents. However, this isn't desirable for the dependency parser, + # because we instead have a label frequency cut-off and back off rare + # labels to 'dep'. + pass diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index f1ba8637b..4a0902444 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,6 +1,7 @@ -from itertools import islice -from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List +from typing import Optional, Iterable, Callable, Dict, Union, List, Any +from thinc.types import Floats2d from pathlib import Path +from itertools import islice import srsly import random from thinc.api import CosineDistance, Model, Optimizer, Config @@ -9,7 +10,7 @@ import warnings from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb -from ..tokens import Doc +from ..tokens import Doc, Span from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language @@ -26,7 +27,7 @@ default_model_config = """ @architectures = "spacy.EntityLinker.v1" [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 2 @@ -45,6 +46,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] default_config={ "model": DEFAULT_NEL_MODEL, "labels_discard": [], + "n_sents": 0, "incl_prior": True, "incl_context": True, "entity_vector_length": 64, @@ -62,10 +64,11 @@ def make_entity_linker( model: Model, *, labels_discard: Iterable[str], + n_sents: int, incl_prior: bool, incl_context: bool, entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]], + get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], ): """Construct an EntityLinker component. @@ -73,6 +76,7 @@ def make_entity_linker( representations. Given a batch of Doc objects, it should return a single array, with one row per item in the batch. labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + n_sents (int): The number of neighbouring sentences to take into account. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. @@ -84,6 +88,7 @@ def make_entity_linker( model, name, labels_discard=labels_discard, + n_sents=n_sents, incl_prior=incl_prior, incl_context=incl_context, entity_vector_length=entity_vector_length, @@ -94,7 +99,7 @@ def make_entity_linker( class EntityLinker(TrainablePipe): """Pipeline component for named entity linking. 
- DOCS: https://nightly.spacy.io/api/entitylinker + DOCS: https://spacy.io/api/entitylinker """ NIL = "NIL" # string used to refer to a non-existing link @@ -106,10 +111,11 @@ class EntityLinker(TrainablePipe): name: str = "entity_linker", *, labels_discard: Iterable[str], + n_sents: int, incl_prior: bool, incl_context: bool, entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]], + get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], ) -> None: """Initialize an entity linker. @@ -118,39 +124,41 @@ class EntityLinker(TrainablePipe): name (str): The component instance name, used to add entries to the losses during training. labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + n_sents (int): The number of neighbouring sentences to take into account. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that + get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. - DOCS: https://nightly.spacy.io/api/entitylinker#init + DOCS: https://spacy.io/api/entitylinker#init """ self.vocab = vocab self.model = model self.name = name - cfg = { - "labels_discard": list(labels_discard), - "incl_prior": incl_prior, - "incl_context": incl_context, - "entity_vector_length": entity_vector_length, - } + self.labels_discard = list(labels_discard) + self.n_sents = n_sents + self.incl_prior = incl_prior + self.incl_context = incl_context self.get_candidates = get_candidates - self.cfg = dict(cfg) + self.cfg: Dict[str, Any] = {} self.distance = CosineDistance(normalize=False) - # how many neightbour sentences to take into account - self.n_sents = cfg.get("n_sents", 0) + # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will create it using this object's vocab.""" + if not callable(kb_loader): + raise ValueError(Errors.E885.format(arg_type=type(kb_loader))) + self.kb = kb_loader(self.vocab) - self.cfg["entity_vector_length"] = self.kb.entity_vector_length def validate_kb(self) -> None: # Raise an error if the knowledge base is not initialized. + if self.kb is None: + raise ValueError(Errors.E1018.format(name=self.name)) if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) @@ -159,7 +167,7 @@ class EntityLinker(TrainablePipe): get_examples: Callable[[], Iterable[Example]], *, nlp: Optional[Language] = None, - kb_loader: Callable[[Vocab], KnowledgeBase] = None, + kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None, ): """Initialize the pipe for training, using a representative set of data examples. @@ -171,7 +179,7 @@ class EntityLinker(TrainablePipe): Note that providing this argument, will overwrite all data accumulated in the current KB. Use this only when loading a KB as-such from file. 
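A sketch of wiring up the entity linker settings introduced above (n_sents, labels_discard) together with a KB loader for set_kb; the entity ID, alias and sizes are illustrative:

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
linker = nlp.add_pipe(
    "entity_linker",
    config={"n_sents": 2, "labels_discard": ["CARDINAL"], "incl_prior": True},
)

def create_kb(vocab):
    # The vector length here must match the component's entity_vector_length.
    kb = KnowledgeBase(vocab, entity_vector_length=64)
    kb.add_entity(entity="Q42", freq=100, entity_vector=[0.0] * 64)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
    return kb

linker.set_kb(create_kb)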
- DOCS: https://nightly.spacy.io/api/entitylinker#initialize + DOCS: https://spacy.io/api/entitylinker#initialize """ validate_get_examples(get_examples, "EntityLinker.initialize") if kb_loader is not None: @@ -198,8 +206,7 @@ class EntityLinker(TrainablePipe): losses: Optional[Dict[str, float]] = None, ) -> Dict[str, float]: """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict, get_loss and - set_annotations. + updating the pipe's model. Delegates to predict and get_loss. examples (Iterable[Example]): A batch of Example objects. drop (float): The dropout rate. @@ -208,7 +215,7 @@ class EntityLinker(TrainablePipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. - DOCS: https://nightly.spacy.io/api/entitylinker#update + DOCS: https://spacy.io/api/entitylinker#update """ self.validate_kb() if losses is None: @@ -218,13 +225,6 @@ class EntityLinker(TrainablePipe): return losses validate_examples(examples, "EntityLinker.update") sentence_docs = [] - docs = [] - for eg in examples: - eg.predicted.ents = eg.reference.ents - docs.append(eg.predicted) - # This seems simpler than other ways to get that exact output -- but - # it does run the model twice :( - predictions = self.predict(docs) for eg in examples: sentences = [s for s in eg.reference.sents] kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) @@ -260,10 +260,9 @@ class EntityLinker(TrainablePipe): if sgd is not None: self.finish_update(sgd) losses[self.name] += loss - self.set_annotations(docs, predictions) return losses - def get_loss(self, examples: Iterable[Example], sentence_encodings): + def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] for eg in examples: @@ -279,38 +278,11 @@ class EntityLinker(TrainablePipe): method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) - loss = self.distance.get_loss(sentence_encodings, entity_encodings) + # TODO: fix typing issue here + gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore + loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore loss = loss / len(entity_encodings) - return loss, gradients - - def __call__(self, doc: Doc) -> Doc: - """Apply the pipe to a Doc. - - doc (Doc): The document to process. - RETURNS (Doc): The processed Doc. - - DOCS: https://nightly.spacy.io/api/entitylinker#call - """ - kb_ids = self.predict([doc]) - self.set_annotations([doc], kb_ids) - return doc - - def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: - """Apply the pipe to a stream of documents. This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://nightly.spacy.io/api/entitylinker#pipe - """ - for docs in util.minibatch(stream, size=batch_size): - kb_ids = self.predict(docs) - self.set_annotations(docs, kb_ids) - yield from docs + return float(loss), gradients def predict(self, docs: Iterable[Doc]) -> List[str]: """Apply the pipeline's model to a batch of docs, without modifying them. @@ -318,13 +290,13 @@ class EntityLinker(TrainablePipe): no prediction. 
docs (Iterable[Doc]): The documents to predict. - RETURNS (List[int]): The models prediction for each document. + RETURNS (List[str]): The models prediction for each document. - DOCS: https://nightly.spacy.io/api/entitylinker#predict + DOCS: https://spacy.io/api/entitylinker#predict """ self.validate_kb() entity_count = 0 - final_kb_ids = [] + final_kb_ids: List[str] = [] if not docs: return final_kb_ids if isinstance(docs, Doc): @@ -332,78 +304,67 @@ class EntityLinker(TrainablePipe): for i, doc in enumerate(docs): sentences = [s for s in doc.sents] if len(doc) > 0: - # Looping through each sentence and each entity - # This may go wrong if there are entities across sentences - which shouldn't happen normally. - for sent_index, sent in enumerate(sentences): - if sent.ents: - # get n_neightbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents - ) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - xp = self.model.ops.xp - if self.cfg.get("incl_context"): - sentence_encoding = self.model.predict([sent_doc])[0] - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - for ent in sent.ents: - entity_count += 1 - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - else: - candidates = self.get_candidates(self.kb, ent) - if not candidates: - # no prediction possible for this entity - setting to NIL - final_kb_ids.append(self.NIL) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - else: - random.shuffle(candidates) - # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray( - [c.prior_prob for c in candidates] + # Looping through each entity (TODO: rewrite) + for ent in doc.ents: + sent = ent.sent + sent_index = sentences.index(sent) + assert sent_index >= 0 + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + xp = self.model.ops.xp + if self.incl_context: + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + else: + candidates = list(self.get_candidates(self.kb, ent)) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + else: + random.shuffle(candidates) + # set all prior probabilities to 0 if incl_prior=False + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.incl_prior: + prior_probs 
= xp.asarray([0.0 for _ in candidates]) + scores = prior_probs + # add in similarity from the context + if self.incl_context: + entity_encodings = xp.asarray( + [c.entity_vector for c in candidates] + ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError( + Errors.E147.format( + method="predict", + msg="vectors not of equal length", + ) ) - if not self.cfg.get("incl_prior"): - prior_probs = xp.asarray( - [0.0 for _ in candidates] - ) - scores = prior_probs - # add in similarity from the context - if self.cfg.get("incl_context"): - entity_encodings = xp.asarray( - [c.entity_vector for c in candidates] - ) - entity_norm = xp.linalg.norm( - entity_encodings, axis=1 - ) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError( - Errors.E147.format( - method="predict", - msg="vectors not of equal length", - ) - ) - # cosine similarity - sims = xp.dot( - entity_encodings, sentence_encoding_t - ) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = ( - prior_probs + sims - (prior_probs * sims) - ) - # TODO: thresholding - best_index = scores.argmax().item() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm + ) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs * sims) + # TODO: thresholding + best_index = scores.argmax().item() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) if not (len(final_kb_ids) == entity_count): err = Errors.E147.format( method="predict", msg="result variables not of equal length" @@ -417,7 +378,7 @@ class EntityLinker(TrainablePipe): docs (Iterable[Doc]): The documents to modify. kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. - DOCS: https://nightly.spacy.io/api/entitylinker#set_annotations + DOCS: https://spacy.io/api/entitylinker#set_annotations """ count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): @@ -436,11 +397,53 @@ class EntityLinker(TrainablePipe): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores. - DOCS TODO: https://nightly.spacy.io/api/entity_linker#score + DOCS TODO: https://spacy.io/api/entity_linker#score """ validate_examples(examples, "EntityLinker.score") return Scorer.score_links(examples, negative_labels=[self.NIL]) + def to_bytes(self, *, exclude=tuple()): + """Serialize the pipe to a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/entitylinker#to_bytes + """ + self._validate_serialization_attrs() + serialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) + serialize["kb"] = self.kb.to_bytes + serialize["model"] = self.model.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load the pipe from a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. 
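A tiny numeric sketch of how the candidate score is combined in the predict() logic above (`prior_probs + sims - prior_probs * sims`); the probability values are made up for illustration.

```python
# Illustrative numbers only: combining a KB prior probability with the
# cosine similarity between candidate and context, as in predict() above.
prior_prob = 0.6
cosine_sim = 0.8
score = prior_prob + cosine_sim - (prior_prob * cosine_sim)
print(score)  # 0.92 -- higher than either signal alone, never exceeding 1.0
```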
+ + DOCS: https://spacy.io/api/entitylinker#from_bytes + """ + self._validate_serialization_attrs() + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: @@ -449,10 +452,10 @@ class EntityLinker(TrainablePipe): path (str / Path): Path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/entitylinker#to_disk + DOCS: https://spacy.io/api/entitylinker#to_disk """ serialize = {} - serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) @@ -467,17 +470,19 @@ class EntityLinker(TrainablePipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (EntityLinker): The modified EntityLinker object. - DOCS: https://nightly.spacy.io/api/entitylinker#from_disk + DOCS: https://spacy.io/api/entitylinker#from_disk """ def load_model(p): try: - self.model.from_bytes(p.open("rb").read()) + with p.open("rb") as infile: + self.model.from_bytes(infile.read()) except AttributeError: raise ValueError(Errors.E149) from None - deserialize = {} + deserialize: Dict[str, Callable[[Any], Any]] = {} deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) deserialize["kb"] = lambda p: self.kb.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index c3d983dec..b8f32b4d3 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,4 +1,6 @@ +import warnings from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence +from typing import cast from collections import defaultdict from pathlib import Path import srsly @@ -6,7 +8,7 @@ import srsly from .pipe import Pipe from ..training import Example from ..language import Language -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher @@ -59,8 +61,8 @@ class EntityRuler(Pipe): purely rule-based entity recognition system. After initialization, the component is typically added to the pipeline using `nlp.add_pipe`. - DOCS: https://nightly.spacy.io/api/entityruler - USAGE: https://nightly.spacy.io/usage/rule-based-matching#entityruler + DOCS: https://spacy.io/api/entityruler + USAGE: https://spacy.io/usage/rule-based-matching#entityruler """ def __init__( @@ -94,26 +96,21 @@ class EntityRuler(Pipe): added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. 
- DOCS: https://nightly.spacy.io/api/entityruler#init + DOCS: https://spacy.io/api/entityruler#init """ self.nlp = nlp self.name = name self.overwrite = overwrite_ents - self.token_patterns = defaultdict(list) - self.phrase_patterns = defaultdict(list) + self.token_patterns = defaultdict(list) # type: ignore + self.phrase_patterns = defaultdict(list) # type: ignore + self._validate = validate self.matcher = Matcher(nlp.vocab, validate=validate) - if phrase_matcher_attr is not None: - if phrase_matcher_attr.upper() == "TEXT": - phrase_matcher_attr = "ORTH" - self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr, validate=validate - ) - else: - self.phrase_matcher_attr = None - self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate) + self.phrase_matcher_attr = phrase_matcher_attr + self.phrase_matcher = PhraseMatcher( + nlp.vocab, attr=self.phrase_matcher_attr, validate=validate + ) self.ent_id_sep = ent_id_sep - self._ent_ids = defaultdict(dict) + self._ent_ids = defaultdict(tuple) # type: ignore if patterns is not None: self.add_patterns(patterns) @@ -133,14 +130,33 @@ class EntityRuler(Pipe): doc (Doc): The Doc object in the pipeline. RETURNS (Doc): The Doc with added entities, if available. - DOCS: https://nightly.spacy.io/api/entityruler#call + DOCS: https://spacy.io/api/entityruler#call """ - matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) - matches = set( + error_handler = self.get_error_handler() + try: + matches = self.match(doc) + self.set_annotations(doc, matches) + return doc + except Exception as e: + return error_handler(self.name, self, [doc], e) + + def match(self, doc: Doc): + self._require_patterns() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W036") + matches = cast( + List[Tuple[int, int, int]], + list(self.matcher(doc)) + list(self.phrase_matcher(doc)), + ) + final_matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] ) get_sort_key = lambda m: (m[2] - m[1], -m[1]) - matches = sorted(matches, key=get_sort_key, reverse=True) + final_matches = sorted(final_matches, key=get_sort_key, reverse=True) + return final_matches + + def set_annotations(self, doc, matches): + """Modify the document in place""" entities = list(doc.ents) new_entities = [] seen_tokens = set() @@ -163,7 +179,6 @@ class EntityRuler(Pipe): ] seen_tokens.update(range(start, end)) doc.ents = entities + new_entities - return doc @property def labels(self) -> Tuple[str, ...]: @@ -171,7 +186,7 @@ class EntityRuler(Pipe): RETURNS (set): The string labels. - DOCS: https://nightly.spacy.io/api/entityruler#labels + DOCS: https://spacy.io/api/entityruler#labels """ keys = set(self.token_patterns.keys()) keys.update(self.phrase_patterns.keys()) @@ -183,7 +198,7 @@ class EntityRuler(Pipe): all_labels.add(label) else: all_labels.add(l) - return tuple(all_labels) + return tuple(sorted(all_labels)) def initialize( self, @@ -199,19 +214,19 @@ class EntityRuler(Pipe): nlp (Language): The current nlp object the component is part of. patterns Optional[Iterable[PatternType]]: The list of patterns. 
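A brief usage sketch of the pattern format handled by add_patterns above; the labels, patterns and the "sf" id are illustrative, not taken from this diff.

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([
    # A phrase pattern and a token pattern with an optional "id".
    {"label": "ORG", "pattern": "Explosion"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "sf"},
])
doc = nlp("Explosion is based in San Francisco")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents])
```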
- DOCS: https://nightly.spacy.io/api/entityruler#initialize + DOCS: https://spacy.io/api/entityruler#initialize """ self.clear() if patterns: - self.add_patterns(patterns) + self.add_patterns(patterns) # type: ignore[arg-type] @property - def ent_ids(self) -> Tuple[str, ...]: + def ent_ids(self) -> Tuple[Optional[str], ...]: """All entity ids present in the match patterns `id` properties RETURNS (set): The string entity ids. - DOCS: https://nightly.spacy.io/api/entityruler#ent_ids + DOCS: https://spacy.io/api/entityruler#ent_ids """ keys = set(self.token_patterns.keys()) keys.update(self.phrase_patterns.keys()) @@ -229,7 +244,7 @@ class EntityRuler(Pipe): RETURNS (list): The original patterns, one dictionary per pattern. - DOCS: https://nightly.spacy.io/api/entityruler#patterns + DOCS: https://spacy.io/api/entityruler#patterns """ all_patterns = [] for label, patterns in self.token_patterns.items(): @@ -256,7 +271,7 @@ class EntityRuler(Pipe): patterns (list): The patterns to add. - DOCS: https://nightly.spacy.io/api/entityruler#add_patterns + DOCS: https://spacy.io/api/entityruler#add_patterns """ # disable the nlp components after this one in case they hadn't been initialized / deserialised yet @@ -266,9 +281,7 @@ class EntityRuler(Pipe): if self == pipe: current_index = i break - subsequent_pipes = [ - pipe for pipe in self.nlp.pipe_names[current_index + 1 :] - ] + subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]] except ValueError: subsequent_pipes = [] with self.nlp.select_pipes(disable=subsequent_pipes): @@ -289,36 +302,43 @@ class EntityRuler(Pipe): self.nlp.pipe(phrase_pattern_texts), phrase_pattern_ids, ): - phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id} + phrase_pattern = {"label": label, "pattern": pattern} if ent_id: phrase_pattern["id"] = ent_id phrase_patterns.append(phrase_pattern) - for entry in token_patterns + phrase_patterns: + for entry in token_patterns + phrase_patterns: # type: ignore[operator] label = entry["label"] if "id" in entry: ent_label = label label = self._create_label(label, entry["id"]) key = self.matcher._normalize_key(label) self._ent_ids[key] = (ent_label, entry["id"]) - pattern = entry["pattern"] + pattern = entry["pattern"] # type: ignore if isinstance(pattern, Doc): self.phrase_patterns[label].append(pattern) + self.phrase_matcher.add(label, [pattern]) # type: ignore elif isinstance(pattern, list): self.token_patterns[label].append(pattern) + self.matcher.add(label, [pattern]) else: raise ValueError(Errors.E097.format(pattern=pattern)) - for label, patterns in self.token_patterns.items(): - self.matcher.add(label, patterns) - for label, patterns in self.phrase_patterns.items(): - self.phrase_matcher.add(label, patterns) def clear(self) -> None: """Reset all patterns.""" self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) - self._ent_ids = defaultdict(dict) + self._ent_ids = defaultdict(tuple) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate + ) - def _split_label(self, label: str) -> Tuple[str, str]: + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name=self.name)) + + def _split_label(self, label: str) -> Tuple[str, Optional[str]]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep label (str): The value of 
label in a pattern entry @@ -328,11 +348,12 @@ class EntityRuler(Pipe): ent_label, ent_id = label.rsplit(self.ent_id_sep, 1) else: ent_label = label - ent_id = None + ent_id = None # type: ignore return ent_label, ent_id - def _create_label(self, label: str, ent_id: str) -> str: + def _create_label(self, label: Any, ent_id: Any) -> str: """Join Entity label with ent_id if the pattern has an `id` attribute + If ent_id is not a string, the label is returned as is. label (str): The label to set for ent.label_ ent_id (str): The label @@ -354,7 +375,7 @@ class EntityRuler(Pipe): patterns_bytes (bytes): The bytestring to load. RETURNS (EntityRuler): The loaded entity ruler. - DOCS: https://nightly.spacy.io/api/entityruler#from_bytes + DOCS: https://spacy.io/api/entityruler#from_bytes """ cfg = srsly.msgpack_loads(patterns_bytes) self.clear() @@ -362,10 +383,9 @@ class EntityRuler(Pipe): self.add_patterns(cfg.get("patterns", cfg)) self.overwrite = cfg.get("overwrite", False) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) - if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: self.add_patterns(cfg) @@ -376,7 +396,7 @@ class EntityRuler(Pipe): RETURNS (bytes): The serialized patterns. - DOCS: https://nightly.spacy.io/api/entityruler#to_bytes + DOCS: https://spacy.io/api/entityruler#to_bytes """ serial = { "overwrite": self.overwrite, @@ -395,7 +415,7 @@ class EntityRuler(Pipe): path (str / Path): The JSONL file to load. RETURNS (EntityRuler): The loaded entity ruler. - DOCS: https://nightly.spacy.io/api/entityruler#from_disk + DOCS: https://spacy.io/api/entityruler#from_disk """ path = ensure_path(path) self.clear() @@ -416,10 +436,9 @@ class EntityRuler(Pipe): self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) from_disk(path, deserializers_patterns, {}) return self @@ -431,7 +450,7 @@ class EntityRuler(Pipe): path (str / Path): The JSONL file to save. - DOCS: https://nightly.spacy.io/api/entityruler#to_disk + DOCS: https://spacy.io/api/entityruler#to_disk """ path = ensure_path(path) cfg = { diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index d955e970d..f0a75dc2c 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,10 +1,9 @@ -import srsly -from thinc.api import Config from typing import Dict, Any +import srsly + from ..language import Language from ..matcher import Matcher from ..tokens import Doc -from ..util import filter_spans from .. import util @@ -19,14 +18,14 @@ def merge_noun_chunks(doc: Doc) -> Doc: doc (Doc): The Doc object. RETURNS (Doc): The Doc object with merged noun chunks. 
- DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks + DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks """ if not doc.has_annotation("DEP"): return doc with doc.retokenize() as retokenizer: for np in doc.noun_chunks: attrs = {"tag": np.root.tag, "dep": np.root.dep} - retokenizer.merge(np, attrs=attrs) + retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type] return doc @@ -41,12 +40,12 @@ def merge_entities(doc: Doc): doc (Doc): The Doc object. RETURNS (Doc): The Doc object with merged entities. - DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_entities + DOCS: https://spacy.io/api/pipeline-functions#merge_entities """ with doc.retokenize() as retokenizer: for ent in doc.ents: attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label} - retokenizer.merge(ent, attrs=attrs) + retokenizer.merge(ent, attrs=attrs) # type: ignore[arg-type] return doc @@ -58,13 +57,13 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc: label (str): The subtoken dependency label. RETURNS (Doc): The Doc object with merged subtokens. - DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_subtokens + DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens """ # TODO: make stateful component with "label" config merger = Matcher(doc.vocab) merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]]) matches = merger(doc) - spans = filter_spans([doc[start : end + 1] for _, start, end in matches]) + spans = util.filter_spans([doc[start : end + 1] for _, start, end in matches]) # type: ignore[misc, operator] with doc.retokenize() as retokenizer: for span in spans: retokenizer.merge(span) @@ -77,15 +76,9 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc: retokenizes=True, ) def make_token_splitter( - nlp: Language, - name: str, - *, - min_length=0, - split_length=0, + nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0 ): - return TokenSplitter( - min_length=min_length, split_length=split_length - ) + return TokenSplitter(min_length=min_length, split_length=split_length) class TokenSplitter: @@ -100,11 +93,11 @@ class TokenSplitter: if len(t.text) >= self.min_length: orths = [] heads = [] - attrs = {} + attrs = {} # type: ignore[var-annotated] for i in range(0, len(t.text), self.split_length): orths.append(t.text[i : i + self.split_length]) heads.append((t, i / self.split_length)) - retokenizer.split(t, orths, heads, attrs) + retokenizer.split(t, orths, heads, attrs) # type: ignore[arg-type] return doc def _get_config(self) -> Dict[str, Any]: diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 29a139f1a..ad227d240 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union -from typing import Tuple +from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple from thinc.api import Model from pathlib import Path +import warnings + from .pipe import Pipe from ..errors import Errors, Warnings from ..language import Language @@ -23,11 +24,7 @@ from .. 
import util default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - overwrite: bool = False, + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False ): return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) @@ -37,7 +34,7 @@ class Lemmatizer(Pipe): The Lemmatizer supports simple part-of-speech-sensitive suffix rules and lookup tables. - DOCS: https://nightly.spacy.io/api/lemmatizer + DOCS: https://spacy.io/api/lemmatizer """ @classmethod @@ -73,7 +70,7 @@ class Lemmatizer(Pipe): overwrite (bool): Whether to overwrite existing lemmas. Defaults to `False`. - DOCS: https://nightly.spacy.io/api/lemmatizer#init + DOCS: https://spacy.io/api/lemmatizer#init """ self.vocab = vocab self.model = model @@ -91,7 +88,7 @@ class Lemmatizer(Pipe): if not hasattr(self, mode_attr): raise ValueError(Errors.E1003.format(mode=mode)) self.lemmatize = getattr(self, mode_attr) - self.cache = {} + self.cache = {} # type: ignore[var-annotated] @property def mode(self): @@ -103,14 +100,18 @@ class Lemmatizer(Pipe): doc (Doc): The Doc to process. RETURNS (Doc): The processed Doc. - DOCS: https://nightly.spacy.io/api/lemmatizer#call + DOCS: https://spacy.io/api/lemmatizer#call """ if not self._validated: self._validate_tables(Errors.E1004) - for token in doc: - if self.overwrite or token.lemma == 0: - token.lemma_ = self.lemmatize(token)[0] - return doc + error_handler = self.get_error_handler() + try: + for token in doc: + if self.overwrite or token.lemma == 0: + token.lemma_ = self.lemmatize(token)[0] + return doc + except Exception as e: + error_handler(self.name, self, [doc], e) def initialize( self, @@ -154,28 +155,13 @@ class Lemmatizer(Pipe): ) self._validated = True - def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: - """Apply the pipe to a stream of documents. This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://nightly.spacy.io/api/lemmatizer#pipe - """ - for doc in stream: - doc = self(doc) - yield doc - def lookup_lemmatize(self, token: Token) -> List[str]: """Lemmatize using a lookup-based approach. token (Token): The token to lemmatize. RETURNS (list): The available lemmas for the string. - DOCS: https://nightly.spacy.io/api/lemmatizer#lookup_lemmatize + DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize """ lookup_table = self.lookups.get_table("lemma_lookup", {}) result = lookup_table.get(token.text, token.text) @@ -189,16 +175,16 @@ class Lemmatizer(Pipe): token (Token): The token to lemmatize. RETURNS (list): The available lemmas for the string. - DOCS: https://nightly.spacy.io/api/lemmatizer#rule_lemmatize + DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize """ - cache_key = (token.orth, token.pos, token.morph) + cache_key = (token.orth, token.pos, token.morph.key) # type: ignore[attr-defined] if cache_key in self.cache: return self.cache[cache_key] string = token.text univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): if univ_pos == "": - logger.warning(Warnings.W108.format(text=string)) + warnings.warn(Warnings.W108) return [string.lower()] # See Issue #435 for example of where this logic is requied. 
if self.is_base_form(token): @@ -257,7 +243,7 @@ class Lemmatizer(Pipe): token (Token): The token. RETURNS (bool): Whether the token is a base form. - DOCS: https://nightly.spacy.io/api/lemmatizer#is_base_form + DOCS: https://spacy.io/api/lemmatizer#is_base_form """ return False @@ -267,7 +253,7 @@ class Lemmatizer(Pipe): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores. - DOCS: https://nightly.spacy.io/api/lemmatizer#score + DOCS: https://spacy.io/api/lemmatizer#score """ validate_examples(examples, "Lemmatizer.score") return Scorer.score_token_attr(examples, "lemma", **kwargs) @@ -280,10 +266,10 @@ class Lemmatizer(Pipe): path (str / Path): Path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk + DOCS: https://spacy.io/api/lemmatizer#to_disk """ serialize = {} - serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) serialize["lookups"] = lambda p: self.lookups.to_disk(p) util.to_disk(path, serialize, exclude) @@ -296,10 +282,10 @@ class Lemmatizer(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (Lemmatizer): The modified Lemmatizer object. - DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk + DOCS: https://spacy.io/api/lemmatizer#from_disk """ - deserialize = {} - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize: Dict[str, Callable[[Any], Any]] = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) deserialize["lookups"] = lambda p: self.lookups.from_disk(p) util.from_disk(path, deserialize, exclude) self._validate_tables() @@ -311,10 +297,10 @@ class Lemmatizer(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (bytes): The serialized object. - DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes + DOCS: https://spacy.io/api/lemmatizer#to_bytes """ serialize = {} - serialize["vocab"] = self.vocab.to_bytes + serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) serialize["lookups"] = self.lookups.to_bytes return util.to_bytes(serialize, exclude) @@ -327,10 +313,10 @@ class Lemmatizer(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (Lemmatizer): The loaded Lemmatizer. - DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes + DOCS: https://spacy.io/api/lemmatizer#from_bytes """ - deserialize = {} - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize: Dict[str, Callable[[Any], Any]] = {} + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) util.from_bytes(bytes_data, deserialize, exclude) self._validate_tables() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7751f398a..3ba05e616 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -27,7 +27,7 @@ default_model_config = """ @architectures = "spacy.Tok2Vec.v2" [model.tok2vec.embed] -@architectures = "spacy.CharacterEmbed.v1" +@architectures = "spacy.CharacterEmbed.v2" width = 128 rows = 7000 nM = 64 @@ -75,7 +75,7 @@ class Morphologizer(Tagger): name (str): The component instance name, used to add entries to the losses during training. 
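A minimal sketch of the lookup-mode lemmatizer described above, initialized with an in-memory table so it runs without extra lookup packages; the single "geese" -> "goose" entry is illustrative.

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
# Provide the "lemma_lookup" table required by lookup mode directly.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"geese": "goose"})
lemmatizer.initialize(lookups=lookups)
print([t.lemma_ for t in nlp("geese")])  # ['goose']
```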
- DOCS: https://nightly.spacy.io/api/morphologizer#init + DOCS: https://spacy.io/api/morphologizer#init """ self.vocab = vocab self.model = model @@ -104,7 +104,7 @@ class Morphologizer(Tagger): label (str): The label to add. RETURNS (int): 0 if label is already present, otherwise 1. - DOCS: https://nightly.spacy.io/api/morphologizer#add_label + DOCS: https://spacy.io/api/morphologizer#add_label """ if not isinstance(label, str): raise ValueError(Errors.E187) @@ -134,9 +134,10 @@ class Morphologizer(Tagger): returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/morphologizer#initialize + DOCS: https://spacy.io/api/morphologizer#initialize """ validate_get_examples(get_examples, "Morphologizer.initialize") + util.check_lexeme_norms(self.vocab, "morphologizer") if labels is not None: self.cfg["labels_morph"] = labels["morph"] self.cfg["labels_pos"] = labels["pos"] @@ -185,7 +186,7 @@ class Morphologizer(Tagger): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by Morphologizer.predict. - DOCS: https://nightly.spacy.io/api/morphologizer#set_annotations + DOCS: https://spacy.io/api/morphologizer#set_annotations """ if isinstance(docs, Doc): docs = [docs] @@ -208,7 +209,7 @@ class Morphologizer(Tagger): scores: Scores representing the model's predictions. RETURNS (Tuple[float, float]): The loss and the gradient. - DOCS: https://nightly.spacy.io/api/morphologizer#get_loss + DOCS: https://spacy.io/api/morphologizer#get_loss """ validate_examples(examples, "Morphologizer.get_loss") loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) @@ -254,7 +255,7 @@ class Morphologizer(Tagger): Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". 
- DOCS: https://nightly.spacy.io/api/morphologizer#score + DOCS: https://spacy.io/api/morphologizer#score """ def morph_key_getter(token, attr): return getattr(token, attr).key diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index cfb492612..8c44061e2 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -22,7 +22,7 @@ maxout_pieces = 3 token_vector_width = 96 [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -197,7 +197,7 @@ class ClozeMultitask(TrainablePipe): target = vectors[ids] gradient = self.distance.get_grad(prediction, target) loss = self.distance.get_loss(prediction, target) - return loss, gradient + return float(loss), gradient def update(self, examples, *, drop=0., sgd=None, losses=None): pass diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py index 474dec9bd..830f1aacd 100644 --- a/spacy/pipeline/ner.py +++ b/spacy/pipeline/ner.py @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config +from ._parser_internals.transition_system import TransitionSystem from .transition_parser import Parser from ._parser_internals.ner import BiluoPushDown @@ -20,7 +21,7 @@ hidden_width = 64 maxout_pieces = 2 [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -39,6 +40,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, + "incorrect_spans_key": None }, default_score_weights={ "ents_f": 1.0, @@ -51,8 +53,9 @@ def make_ner( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, + incorrect_spans_key: Optional[str]=None ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -70,13 +73,16 @@ def make_ner( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (list[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. + incorrect_spans_key (Optional[str]): Identifies spans that are known + to be incorrect entity annotations. The incorrect entity annotations + can be stored in the span group, under this key. 
""" return EntityRecognizer( nlp.vocab, @@ -84,9 +90,8 @@ def make_ner( name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, + incorrect_spans_key=incorrect_spans_key, multitasks=[], - min_action_freq=1, - learn_tokens=False, beam_width=1, beam_density=0.0, beam_update_prob=0.0, @@ -103,6 +108,7 @@ def make_ner( "beam_density": 0.01, "beam_update_prob": 0.5, "beam_width": 32, + "incorrect_spans_key": None, }, default_score_weights={ "ents_f": 1.0, @@ -115,11 +121,12 @@ def make_beam_ner( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, beam_width: int, beam_density: float, beam_update_prob: float, + incorrect_spans_key: Optional[str]=None ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -137,13 +144,13 @@ def make_beam_ner( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (list[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring @@ -153,6 +160,8 @@ def make_beam_ner( beam_update_prob (float): The chance of making a beam update, instead of a greedy update. Greedy updates are an approximation for the beam updates, and are faster to compute. + incorrect_spans_key (Optional[str]): Optional key into span groups of + entities known to be non-entities. """ return EntityRecognizer( nlp.vocab, @@ -161,22 +170,52 @@ def make_beam_ner( moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, multitasks=[], - min_action_freq=1, - learn_tokens=False, beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, + incorrect_spans_key=incorrect_spans_key ) class EntityRecognizer(Parser): """Pipeline component for named entity recognition. - DOCS: https://nightly.spacy.io/api/entityrecognizer + DOCS: https://spacy.io/api/entityrecognizer """ TransitionSystem = BiluoPushDown + def __init__( + self, + vocab, + model, + name="ner", + moves=None, + *, + update_with_oracle_cut_size=100, + beam_width=1, + beam_density=0.0, + beam_update_prob=0.0, + multitasks=tuple(), + incorrect_spans_key=None, + ): + """Create an EntityRecognizer. 
+ """ + super().__init__( + vocab, + model, + name, + moves, + update_with_oracle_cut_size=update_with_oracle_cut_size, + min_action_freq=1, # not relevant for NER + learn_tokens=False, # not relevant for NER + beam_width=beam_width, + beam_density=beam_density, + beam_update_prob=beam_update_prob, + multitasks=multitasks, + incorrect_spans_key=incorrect_spans_key, + ) + def add_multitask_objective(self, mt_component): """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) @@ -207,7 +246,7 @@ class EntityRecognizer(Parser): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The NER precision, recall and f-scores. - DOCS: https://nightly.spacy.io/api/entityrecognizer#score + DOCS: https://spacy.io/api/entityrecognizer#score """ validate_examples(examples, "EntityRecognizer.score") return get_ner_prf(examples) diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi new file mode 100644 index 000000000..c7c0568f9 --- /dev/null +++ b/spacy/pipeline/pipe.pyi @@ -0,0 +1,38 @@ +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, Iterator, List +from typing import NoReturn, Optional, Tuple, Union + +from ..tokens.doc import Doc + +from ..training import Example +from ..language import Language + +class Pipe: + def __call__(self, doc: Doc) -> Doc: ... + def pipe( + self, stream: Iterable[Doc], *, batch_size: int = ... + ) -> Iterator[Doc]: ... + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Language = ..., + ) -> None: ... + def score( + self, examples: Iterable[Example], **kwargs: Any + ) -> Dict[str, Union[float, Dict[str, float]]]: ... + @property + def is_trainable(self) -> bool: ... + @property + def labels(self) -> Tuple[str, ...]: ... + @property + def label_data(self) -> Any: ... + def _require_labels(self) -> None: ... + def set_error_handler( + self, error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn] + ) -> None: ... + def get_error_handler( + self, + ) -> Callable[[str, "Pipe", List[Doc], Exception], NoReturn]: ... + +def deserialize_config(path: Path) -> Any: ... diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index afb59fdb3..4372645af 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,13 +1,14 @@ # cython: infer_types=True, profile=True -import warnings from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict import srsly +import warnings from ..tokens.doc cimport Doc from ..training import Example from ..errors import Errors, Warnings from ..language import Language +from ..util import raise_error cdef class Pipe: """This class is a base class and not instantiated directly. It provides @@ -15,7 +16,7 @@ cdef class Pipe: Trainable pipeline components like the EntityRecognizer or TextCategorizer should inherit from the subclass 'TrainablePipe'. - DOCS: https://nightly.spacy.io/api/pipe + DOCS: https://spacy.io/api/pipe """ @classmethod @@ -33,7 +34,7 @@ cdef class Pipe: docs (Doc): The Doc to process. RETURNS (Doc): The processed Doc. - DOCS: https://nightly.spacy.io/api/pipe#call + DOCS: https://spacy.io/api/pipe#call """ raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name)) @@ -46,11 +47,15 @@ cdef class Pipe: batch_size (int): The number of documents to buffer. YIELDS (Doc): Processed documents in order. 
- DOCS: https://nightly.spacy.io/api/pipe#pipe + DOCS: https://spacy.io/api/pipe#pipe """ + error_handler = self.get_error_handler() for doc in stream: - doc = self(doc) - yield doc + try: + doc = self(doc) + yield doc + except Exception as e: + error_handler(self.name, self, [doc], e) def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): """Initialize the pipe. For non-trainable components, this method @@ -64,7 +69,7 @@ cdef class Pipe: returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/pipe#initialize + DOCS: https://spacy.io/api/pipe#initialize """ pass @@ -74,7 +79,7 @@ cdef class Pipe: examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores. - DOCS: https://nightly.spacy.io/api/pipe#score + DOCS: https://spacy.io/api/pipe#score """ return {} @@ -83,7 +88,7 @@ cdef class Pipe: return False @property - def labels(self) -> Optional[Tuple[str]]: + def labels(self) -> Tuple[str, ...]: return tuple() @property @@ -98,6 +103,30 @@ cdef class Pipe: if not self.labels or list(self.labels) == [""]: raise ValueError(Errors.E143.format(name=self.name)) + def set_error_handler(self, error_handler: Callable) -> None: + """Set an error handler function. + + error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]): + Function that deals with a failing batch of documents. This callable function should take in + the component's name, the component itself, the offending batch of documents, and the exception + that was thrown. + + DOCS: https://spacy.io/api/pipe#set_error_handler + """ + self.error_handler = error_handler + + def get_error_handler(self) -> Callable: + """Retrieve the error handler function. + + RETURNS (Callable): The error handler, or if it's not set a default function that just reraises. + + DOCS: https://spacy.io/api/pipe#get_error_handler + """ + if hasattr(self, "error_handler"): + return self.error_handler + return raise_error + + def deserialize_config(path): if path.exists(): return srsly.read_json(path) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 6e8b1c324..60102efcb 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -1,16 +1,14 @@ # cython: infer_types=True, profile=True, binding=True -import srsly from typing import Optional, List +import srsly from ..tokens.doc cimport Doc - from .pipe import Pipe from ..language import Language from ..scorer import Scorer from ..training import validate_examples from .. import util - @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], @@ -28,7 +26,7 @@ def make_sentencizer( class Sentencizer(Pipe): """Segment the Doc into sentences using a rule-based strategy. - DOCS: https://nightly.spacy.io/api/sentencizer + DOCS: https://spacy.io/api/sentencizer """ default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', @@ -50,7 +48,7 @@ class Sentencizer(Pipe): serialized with the nlp object. RETURNS (Sentencizer): The sentencizer component. - DOCS: https://nightly.spacy.io/api/sentencizer#init + DOCS: https://spacy.io/api/sentencizer#init """ self.name = name if punct_chars: @@ -64,38 +62,15 @@ class Sentencizer(Pipe): doc (Doc): The document to process. RETURNS (Doc): The processed Doc. 
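A combined sketch of the sentencizer factory and the error-handler hook shown above; the warn-and-skip handler is illustrative (the default handler simply re-raises).

```python
import warnings
import spacy

def warn_and_skip(name, component, docs, e):
    # Matches the handler signature described in set_error_handler above.
    warnings.warn(f"{name} failed on {len(docs)} doc(s): {e}")

nlp = spacy.blank("en")
sentencizer = nlp.add_pipe("sentencizer", config={"punct_chars": ["!", ".", "?"]})
sentencizer.set_error_handler(warn_and_skip)
doc = nlp("Hello world. This is spaCy!")
print([sent.text for sent in doc.sents])  # ['Hello world.', 'This is spaCy!']
```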
- DOCS: https://nightly.spacy.io/api/sentencizer#call + DOCS: https://spacy.io/api/sentencizer#call """ - start = 0 - seen_period = False - for i, token in enumerate(doc): - is_in_punct_chars = token.text in self.punct_chars - token.is_sent_start = i == 0 - if seen_period and not token.is_punct and not is_in_punct_chars: - doc[start].is_sent_start = True - start = token.i - seen_period = False - elif is_in_punct_chars: - seen_period = True - if start < len(doc): - doc[start].is_sent_start = True - return doc - - def pipe(self, stream, batch_size=128): - """Apply the pipe to a stream of documents. This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://nightly.spacy.io/api/sentencizer#pipe - """ - for docs in util.minibatch(stream, size=batch_size): - predictions = self.predict(docs) - self.set_annotations(docs, predictions) - yield from docs + error_handler = self.get_error_handler() + try: + tags = self.predict([doc]) + self.set_annotations([doc], tags) + return doc + except Exception as e: + error_handler(self.name, self, [doc], e) def predict(self, docs): """Apply the pipe to a batch of docs, without modifying them. @@ -153,7 +128,7 @@ class Sentencizer(Pipe): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. - DOCS: https://nightly.spacy.io/api/sentencizer#score + DOCS: https://spacy.io/api/sentencizer#score """ def has_sents(doc): return doc.has_annotation("SENT_START") @@ -168,7 +143,7 @@ class Sentencizer(Pipe): RETURNS (bytes): The serialized object. - DOCS: https://nightly.spacy.io/api/sentencizer#to_bytes + DOCS: https://spacy.io/api/sentencizer#to_bytes """ return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) @@ -178,7 +153,7 @@ class Sentencizer(Pipe): bytes_data (bytes): The data to load. returns (Sentencizer): The loaded object. - DOCS: https://nightly.spacy.io/api/sentencizer#from_bytes + DOCS: https://spacy.io/api/sentencizer#from_bytes """ cfg = srsly.msgpack_loads(bytes_data) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) @@ -187,7 +162,7 @@ class Sentencizer(Pipe): def to_disk(self, path, *, exclude=tuple()): """Serialize the sentencizer to disk. - DOCS: https://nightly.spacy.io/api/sentencizer#to_disk + DOCS: https://spacy.io/api/sentencizer#to_disk """ path = util.ensure_path(path) path = path.with_suffix(".json") @@ -197,7 +172,7 @@ class Sentencizer(Pipe): def from_disk(self, path, *, exclude=tuple()): """Load the sentencizer from disk. - DOCS: https://nightly.spacy.io/api/sentencizer#from_disk + DOCS: https://spacy.io/api/sentencizer#from_disk """ path = util.ensure_path(path) path = path.with_suffix(".json") diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index ad777ea58..f9472abf5 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -19,7 +19,7 @@ default_model_config = """ @architectures = "spacy.Tagger.v1" [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 12 depth = 1 @@ -44,7 +44,7 @@ def make_senter(nlp: Language, name: str, model: Model): class SentenceRecognizer(Tagger): """Pipeline component for sentence segmentation. 
- DOCS: https://nightly.spacy.io/api/sentencerecognizer + DOCS: https://spacy.io/api/sentencerecognizer """ def __init__(self, vocab, model, name="senter"): """Initialize a sentence recognizer. @@ -54,7 +54,7 @@ class SentenceRecognizer(Tagger): name (str): The component instance name, used to add entries to the losses during training. - DOCS: https://nightly.spacy.io/api/sentencerecognizer#init + DOCS: https://spacy.io/api/sentencerecognizer#init """ self.vocab = vocab self.model = model @@ -80,7 +80,7 @@ class SentenceRecognizer(Tagger): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict. - DOCS: https://nightly.spacy.io/api/sentencerecognizer#set_annotations + DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ if isinstance(docs, Doc): docs = [docs] @@ -105,7 +105,7 @@ class SentenceRecognizer(Tagger): scores: Scores representing the model's predictions. RETURNS (Tuple[float, float]): The loss and the gradient. - DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss + DOCS: https://spacy.io/api/sentencerecognizer#get_loss """ validate_examples(examples, "SentenceRecognizer.get_loss") labels = self.labels @@ -135,9 +135,10 @@ class SentenceRecognizer(Tagger): returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize + DOCS: https://spacy.io/api/sentencerecognizer#initialize """ validate_get_examples(get_examples, "SentenceRecognizer.initialize") + util.check_lexeme_norms(self.vocab, "senter") doc_sample = [] label_sample = [] assert self.labels, Errors.E924.format(name=self.name) @@ -158,7 +159,7 @@ class SentenceRecognizer(Tagger): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. 
- DOCS: https://nightly.spacy.io/api/sentencerecognizer#score + DOCS: https://spacy.io/api/sentencerecognizer#score """ def has_sents(doc): return doc.has_annotation("SENT_START") diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py new file mode 100644 index 000000000..84a9b69cc --- /dev/null +++ b/spacy/pipeline/spancat.py @@ -0,0 +1,438 @@ +import numpy +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops +from thinc.api import Optimizer +from thinc.types import Ragged, Ints2d, Floats2d, Ints1d + +from ..compat import Protocol, runtime_checkable +from ..scorer import Scorer +from ..language import Language +from .trainable_pipe import TrainablePipe +from ..tokens import Doc, SpanGroup, Span +from ..vocab import Vocab +from ..training import Example, validate_examples +from ..errors import Errors +from ..util import registry + + +spancat_default_config = """ +[model] +@architectures = "spacy.SpanCategorizer.v1" +scorer = {"@layers": "spacy.LinearLogistic.v1"} + +[model.reducer] +@layers = spacy.mean_max_reducer.v1 +hidden_size = 128 + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 96 +rows = [5000, 2000, 1000, 1000] +attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 4 +""" + +DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"] + + +@runtime_checkable +class Suggester(Protocol): + def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged: + ... + + +@registry.misc("spacy.ngram_suggester.v1") +def build_ngram_suggester(sizes: List[int]) -> Suggester: + """Suggest all spans of the given lengths. Spans are returned as a ragged + array of integers. The array has two columns, indicating the start and end + position.""" + + def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged: + if ops is None: + ops = get_current_ops() + spans = [] + lengths = [] + for doc in docs: + starts = ops.xp.arange(len(doc), dtype="i") + starts = starts.reshape((-1, 1)) + length = 0 + for size in sizes: + if size <= len(doc): + starts_size = starts[: len(doc) - (size - 1)] + spans.append(ops.xp.hstack((starts_size, starts_size + size))) + length += spans[-1].shape[0] + if spans: + assert spans[-1].ndim == 2, spans[-1].shape + lengths.append(length) + lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i")) + if len(spans) > 0: + output = Ragged(ops.xp.vstack(spans), lengths_array) + else: + output = Ragged(ops.xp.zeros((0, 0)), lengths_array) + + assert output.dataXd.ndim == 2 + return output + + return ngram_suggester + + +@registry.misc("spacy.ngram_range_suggester.v1") +def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: + """Suggest all spans of the given lengths between a given min and max value - both inclusive. + Spans are returned as a ragged array of integers. 
The array has two columns, + indicating the start and end position.""" + sizes = list(range(min_size, max_size + 1)) + return build_ngram_suggester(sizes) + + +@Language.factory( + "spancat", + assigns=["doc.spans"], + default_config={ + "threshold": 0.5, + "spans_key": "sc", + "max_positive": None, + "model": DEFAULT_SPANCAT_MODEL, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, +) +def make_spancat( + nlp: Language, + name: str, + suggester: Suggester, + model: Model[Tuple[List[Doc], Ragged], Floats2d], + spans_key: str, + threshold: float = 0.5, + max_positive: Optional[int] = None, +) -> "SpanCategorizer": + """Create a SpanCategorizer component. The span categorizer consists of two + parts: a suggester function that proposes candidate spans, and a labeller + model that predicts one or more labels for each span. + + suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. + Spans are returned as a ragged array with two integer columns, for the + start and end positions. + model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that + is given a list of documents and (start, end) indices representing + candidate span offsets. The model predicts a probability for each category + for each span. + spans_key (str): Key of the doc.spans dict to save the spans under. During + initialization and training, the component will look for spans on the + reference document under the same key. + threshold (float): Minimum probability to consider a prediction positive. + Spans with a positive prediction will be saved on the Doc. Defaults to + 0.5. + max_positive (Optional[int]): Maximum number of labels to consider positive + per span. Defaults to None, indicating no limit. + """ + return SpanCategorizer( + nlp.vocab, + suggester=suggester, + model=model, + spans_key=spans_key, + threshold=threshold, + max_positive=max_positive, + name=name, + ) + + +class SpanCategorizer(TrainablePipe): + """Pipeline component to label spans of text. + + DOCS: https://spacy.io/api/spancategorizer + """ + + def __init__( + self, + vocab: Vocab, + model: Model[Tuple[List[Doc], Ragged], Floats2d], + suggester: Suggester, + name: str = "spancat", + *, + spans_key: str = "spans", + threshold: float = 0.5, + max_positive: Optional[int] = None, + ) -> None: + """Initialize the span categorizer. + + DOCS: https://spacy.io/api/spancategorizer#init + """ + self.cfg = { + "labels": [], + "spans_key": spans_key, + "threshold": threshold, + "max_positive": max_positive, + } + self.vocab = vocab + self.suggester = suggester + self.model = model + self.name = name + + @property + def key(self) -> str: + """Key of the doc.spans dict to save the spans under. During + initialization and training, the component will look for spans on the + reference document under the same key. + """ + return str(self.cfg["spans_key"]) + + def add_label(self, label: str) -> int: + """Add a new label to the pipe. + + label (str): The label to add. + RETURNS (int): 0 if label is already present, otherwise 1. 
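A short sketch of what the ngram suggester registered above produces for a four-token doc; sizes [1, 2] are chosen for illustration.

```python
import spacy
from spacy.util import registry

nlp = spacy.blank("en")
doc = nlp("New York is big")
# Resolve the suggester registered above and build it for sizes 1 and 2.
suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
ragged = suggester([doc])
print(ragged.dataXd)   # (start, end) rows: [0 1] [1 2] [2 3] [3 4] [0 2] [1 3] [2 4]
print(ragged.lengths)  # [7] candidate spans for this doc
```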
+ + DOCS: https://spacy.io/api/spancategorizer#add_label + """ + if not isinstance(label, str): + raise ValueError(Errors.E187) + if label in self.labels: + return 0 + self._allow_extra_label() + self.cfg["labels"].append(label) # type: ignore + self.vocab.strings.add(label) + return 1 + + @property + def labels(self) -> Tuple[str]: + """RETURNS (Tuple[str]): The labels currently added to the component. + + DOCS: https://spacy.io/api/spancategorizer#labels + """ + return tuple(self.cfg["labels"]) # type: ignore + + @property + def label_data(self) -> List[str]: + """RETURNS (List[str]): Information about the component's labels. + + DOCS: https://spacy.io/api/spancategorizer#label_data + """ + return list(self.labels) + + def predict(self, docs: Iterable[Doc]): + """Apply the pipeline's model to a batch of docs, without modifying them. + + docs (Iterable[Doc]): The documents to predict. + RETURNS: The models prediction for each document. + + DOCS: https://spacy.io/api/spancategorizer#predict + """ + indices = self.suggester(docs, ops=self.model.ops) + scores = self.model.predict((docs, indices)) # type: ignore + return indices, scores + + def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: + """Modify a batch of Doc objects, using pre-computed scores. + + docs (Iterable[Doc]): The documents to modify. + scores: The scores to set, produced by SpanCategorizer.predict. + + DOCS: https://spacy.io/api/spancategorizer#set_annotations + """ + labels = self.labels + indices, scores = indices_scores + offset = 0 + for i, doc in enumerate(docs): + indices_i = indices[i].dataXd + doc.spans[self.key] = self._make_span_group( + doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] + ) + offset += indices.lengths[i] + + def update( + self, + examples: Iterable[Example], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Learn from a batch of documents and gold-standard information, + updating the pipe's model. Delegates to predict and get_loss. + + examples (Iterable[Example]): A batch of Example objects. + drop (float): The dropout rate. + sgd (thinc.api.Optimizer): The optimizer. + losses (Dict[str, float]): Optional record of the loss during training. + Updated using the component name as the key. + RETURNS (Dict[str, float]): The updated losses dictionary. + + DOCS: https://spacy.io/api/spancategorizer#update + """ + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + validate_examples(examples, "SpanCategorizer.update") + self._validate_categories(examples) + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return losses + docs = [eg.predicted for eg in examples] + spans = self.suggester(docs, ops=self.model.ops) + if spans.lengths.sum() == 0: + return losses + set_dropout_rate(self.model, drop) + scores, backprop_scores = self.model.begin_update((docs, spans)) + loss, d_scores = self.get_loss(examples, (spans, scores)) + backprop_scores(d_scores) # type: ignore + if sgd is not None: + self.finish_update(sgd) + losses[self.name] += loss + return losses + + def get_loss( + self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Floats2d] + ) -> Tuple[float, float]: + """Find the loss and gradient of loss for the batch of documents and + their predicted scores. + + examples (Iterable[Examples]): The batch of examples. 
+ spans_scores: Scores representing the model's predictions. + RETURNS (Tuple[float, float]): The loss and the gradient. + + DOCS: https://spacy.io/api/spancategorizer#get_loss + """ + spans, scores = spans_scores + spans = Ragged( + self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths) + ) + label_map = {label: i for i, label in enumerate(self.labels)} + target = numpy.zeros(scores.shape, dtype=scores.dtype) + offset = 0 + for i, eg in enumerate(examples): + # Map (start, end) offset of spans to the row in the d_scores array, + # so that we can adjust the gradient for predictions that were + # in the gold standard. + spans_index = {} + spans_i = spans[i].dataXd + for j in range(spans.lengths[i]): + start = int(spans_i[j, 0]) # type: ignore + end = int(spans_i[j, 1]) # type: ignore + spans_index[(start, end)] = offset + j + for gold_span in self._get_aligned_spans(eg): + key = (gold_span.start, gold_span.end) + if key in spans_index: + row = spans_index[key] + k = label_map[gold_span.label_] + target[row, k] = 1.0 + # The target is a flat array for all docs. Track the position + # we're at within the flat array. + offset += spans.lengths[i] + target = self.model.ops.asarray(target, dtype="f") # type: ignore + # The target will have the values 0 (for untrue predictions) or 1 + # (for true predictions). + # The scores should be in the range [0, 1]. + # If the prediction is 0.9 and it's true, the gradient + # will be -0.1 (0.9 - 1.0). + # If the prediction is 0.9 and it's false, the gradient will be + # 0.9 (0.9 - 0.0) + d_scores = scores - target + loss = float((d_scores ** 2).sum()) + return loss, d_scores + + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language] = None, + labels: Optional[List[str]] = None, + ) -> None: + """Initialize the pipe for training, using a representative set + of data examples. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Optional[Language]): The current nlp object the component is part of. + labels (Optional[List[str]]): The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. + + DOCS: https://spacy.io/api/spancategorizer#initialize + """ + subbatch: List[Example] = [] + if labels is not None: + for label in labels: + self.add_label(label) + for eg in get_examples(): + if labels is None: + for span in eg.reference.spans.get(self.key, []): + self.add_label(span.label_) + if len(subbatch) < 10: + subbatch.append(eg) + self._require_labels() + if subbatch: + docs = [eg.x for eg in subbatch] + spans = self.suggester(docs) + Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) + self.model.initialize(X=(docs, spans), Y=Y) + else: + self.model.initialize() + + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
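The gradient computation in `get_loss` above reduces to a one-hot target and a squared-error difference. The following self-contained NumPy sketch reproduces the same arithmetic with toy numbers; the label indices and scores are made up.

```python
import numpy

# Scores for 4 candidate spans over 3 labels, as produced by the span scorer.
scores = numpy.asarray(
    [[0.9, 0.2, 0.1],
     [0.1, 0.8, 0.3],
     [0.5, 0.5, 0.5],
     [0.05, 0.1, 0.9]], dtype="f"
)
target = numpy.zeros_like(scores)
target[0, 0] = 1.0  # candidate 0 matches a gold span with label index 0
target[3, 2] = 1.0  # candidate 3 matches a gold span with label index 2

d_scores = scores - target             # e.g. 0.9 -> -0.1 if true, +0.9 if false
loss = float((d_scores ** 2).sum())    # the value added to losses[self.name]
print(loss)
```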
+ + DOCS: https://spacy.io/api/spancategorizer#score + """ + validate_examples(examples, "SpanCategorizer.score") + self._validate_categories(examples) + kwargs = dict(kwargs) + attr_prefix = "spans_" + kwargs.setdefault("attr", f"{attr_prefix}{self.key}") + kwargs.setdefault("allow_overlap", True) + kwargs.setdefault( + "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) + ) + kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans) + return Scorer.score_spans(examples, **kwargs) + + def _validate_categories(self, examples: Iterable[Example]): + # TODO + pass + + def _get_aligned_spans(self, eg: Example): + return eg.get_aligned_spans_y2x( + eg.reference.spans.get(self.key, []), allow_overlap=True + ) + + def _make_span_group( + self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str] + ) -> SpanGroup: + spans = SpanGroup(doc, name=self.key) + max_positive = self.cfg["max_positive"] + threshold = self.cfg["threshold"] + + keeps = scores >= threshold + ranked = (scores * -1).argsort() # type: ignore + if max_positive is not None: + assert isinstance(max_positive, int) + span_filter = ranked[:, max_positive:] + for i, row in enumerate(span_filter): + keeps[i, row] = False + spans.attrs["scores"] = scores[keeps].flatten() + + indices = self.model.ops.to_numpy(indices) + keeps = self.model.ops.to_numpy(keeps) + + for i in range(indices.shape[0]): + start = indices[i, 0] + end = indices[i, 1] + + for j, keep in enumerate(keeps[i]): + if keep: + spans.append(Span(doc, start, end, label=labels[j])) + + return spans diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 85592aba5..fa260bdd6 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,5 +1,4 @@ # cython: infer_types=True, profile=True, binding=True -from typing import List import numpy import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config @@ -27,7 +26,7 @@ default_model_config = """ @architectures = "spacy.Tagger.v1" [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -59,7 +58,7 @@ def make_tagger(nlp: Language, name: str, model: Model): class Tagger(TrainablePipe): """Pipeline component for part-of-speech tagging. - DOCS: https://nightly.spacy.io/api/tagger + DOCS: https://spacy.io/api/tagger """ def __init__(self, vocab, model, name="tagger"): """Initialize a part-of-speech tagger. @@ -69,7 +68,7 @@ class Tagger(TrainablePipe): name (str): The component instance name, used to add entries to the losses during training. - DOCS: https://nightly.spacy.io/api/tagger#init + DOCS: https://spacy.io/api/tagger#init """ self.vocab = vocab self.model = model @@ -86,7 +85,7 @@ class Tagger(TrainablePipe): RETURNS (Tuple[str]): The labels. - DOCS: https://nightly.spacy.io/api/tagger#labels + DOCS: https://spacy.io/api/tagger#labels """ return tuple(self.cfg["labels"]) @@ -95,41 +94,13 @@ class Tagger(TrainablePipe): """Data about the labels currently added to the component.""" return tuple(self.cfg["labels"]) - def __call__(self, doc): - """Apply the pipe to a Doc. - - doc (Doc): The document to process. - RETURNS (Doc): The processed Doc. - - DOCS: https://nightly.spacy.io/api/tagger#call - """ - tags = self.predict([doc]) - self.set_annotations([doc], tags) - return doc - - def pipe(self, stream, *, batch_size=128): - """Apply the pipe to a stream of documents. 
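In `_make_span_group` above, predictions are kept if they clear the threshold and, when `max_positive` is set, fall within the top-k labels per span. The same masking, isolated in plain NumPy with toy scores:

```python
import numpy

scores = numpy.asarray([[0.9, 0.6, 0.2],
                        [0.4, 0.7, 0.8]], dtype="f")
threshold, max_positive = 0.5, 1

keeps = scores >= threshold             # positive predictions per (span, label)
ranked = (scores * -1).argsort()        # label indices sorted best-first per span
if max_positive is not None:
    for i, row in enumerate(ranked[:, max_positive:]):
        keeps[i, row] = False           # drop everything beyond the top-k labels
print(keeps)    # row 0 keeps only label 0, row 1 keeps only label 2
```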
This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://nightly.spacy.io/api/tagger#pipe - """ - for docs in util.minibatch(stream, size=batch_size): - tag_ids = self.predict(docs) - self.set_annotations(docs, tag_ids) - yield from docs - def predict(self, docs): """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. RETURNS: The models prediction for each document. - DOCS: https://nightly.spacy.io/api/tagger#predict + DOCS: https://spacy.io/api/tagger#predict """ if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. @@ -158,7 +129,7 @@ class Tagger(TrainablePipe): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by Tagger.predict. - DOCS: https://nightly.spacy.io/api/tagger#set_annotations + DOCS: https://spacy.io/api/tagger#set_annotations """ if isinstance(docs, Doc): docs = [docs] @@ -175,8 +146,7 @@ class Tagger(TrainablePipe): def update(self, examples, *, drop=0., sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict, get_loss and - set_annotations. + updating the pipe's model. Delegates to predict and get_loss. examples (Iterable[Example]): A batch of Example objects. drop (float): The dropout rate. @@ -185,7 +155,7 @@ class Tagger(TrainablePipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. - DOCS: https://nightly.spacy.io/api/tagger#update + DOCS: https://spacy.io/api/tagger#update """ if losses is None: losses = {} @@ -205,8 +175,6 @@ class Tagger(TrainablePipe): self.finish_update(sgd) losses[self.name] += loss - docs = [eg.predicted for eg in examples] - self.set_annotations(docs, self._scores2guesses(tag_scores)) return losses def rehearse(self, examples, *, drop=0., sgd=None, losses=None): @@ -222,7 +190,7 @@ class Tagger(TrainablePipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. - DOCS: https://nightly.spacy.io/api/tagger#rehearse + DOCS: https://spacy.io/api/tagger#rehearse """ if losses is None: losses = {} @@ -251,10 +219,10 @@ class Tagger(TrainablePipe): scores: Scores representing the model's predictions. RETURNS (Tuple[float, float]): The loss and the gradient. - DOCS: https://nightly.spacy.io/api/tagger#get_loss + DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix="!") # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. @@ -278,9 +246,10 @@ class Tagger(TrainablePipe): `init labels` command. If no labels are provided, the get_examples callback is used to extract the labels from the data. 
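As a usage sketch for the tagger's initialize/update cycle shown above: a tiny end-to-end loop on a blank English pipeline. The sentence and the tag values are invented for the demo.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tagger")

doc = nlp.make_doc("dogs bark loudly")
example = Example.from_dict(doc, {"tags": ["NOUN", "VERB", "ADV"]})

# initialize() collects the tag labels from the examples before training starts.
optimizer = nlp.initialize(get_examples=lambda: [example])
for _ in range(5):
    losses = nlp.update([example], sgd=optimizer)
print(losses["tagger"])
```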
- DOCS: https://nightly.spacy.io/api/tagger#initialize + DOCS: https://spacy.io/api/tagger#initialize """ validate_get_examples(get_examples, "Tagger.initialize") + util.check_lexeme_norms(self.vocab, "tagger") if labels is not None: for tag in labels: self.add_label(tag) @@ -310,7 +279,7 @@ class Tagger(TrainablePipe): label (str): The label to add. RETURNS (int): 0 if label is already present, otherwise 1. - DOCS: https://nightly.spacy.io/api/tagger#add_label + DOCS: https://spacy.io/api/tagger#add_label """ if not isinstance(label, str): raise ValueError(Errors.E187) @@ -328,7 +297,7 @@ class Tagger(TrainablePipe): RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_token_attr for the attributes "tag". - DOCS: https://nightly.spacy.io/api/tagger#score + DOCS: https://spacy.io/api/tagger#score """ validate_examples(examples, "Tagger.score") return Scorer.score_token_attr(examples, "tag", **kwargs) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 6d8c7b101..085b949cc 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,5 +1,5 @@ from itertools import islice -from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any +from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d import numpy @@ -9,7 +9,6 @@ from ..language import Language from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors from ..scorer import Scorer -from .. import util from ..tokens import Doc from ..vocab import Vocab @@ -22,7 +21,7 @@ single_label_default_config = """ @architectures = "spacy.Tok2Vec.v2" [model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 64 rows = [2000, 2000, 1000, 1000, 1000, 1000] attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] @@ -36,7 +35,7 @@ maxout_pieces = 3 depth = 2 [model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -45,7 +44,7 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m single_label_bow_config = """ [model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -53,11 +52,11 @@ no_output_layer = false single_label_cnn_config = """ [model] -@architectures = "spacy.TextCatCNN.v1" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = true [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -89,11 +88,9 @@ subword_features = true def make_textcat( nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float ) -> "TextCategorizer": - """Create a TextCategorizer compoment. The text categorizer predicts categories - over a whole document. It can learn one or more labels, and the labels can - be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive - (i.e. zero or more labels may be true per doc). The multi-label setting is - controlled by the model instance that's provided. + """Create a TextCategorizer component. The text categorizer predicts categories + over a whole document. It can learn one or more labels, and the labels are considered + to be mutually exclusive (i.e. 
one true label per doc). model (Model[List[Doc], List[Floats2d]]): A model instance that predicts scores for each category. @@ -105,7 +102,7 @@ def make_textcat( class TextCategorizer(TrainablePipe): """Pipeline component for single-label text classification. - DOCS: https://nightly.spacy.io/api/textcategorizer + DOCS: https://spacy.io/api/textcategorizer """ def __init__( @@ -119,7 +116,7 @@ class TextCategorizer(TrainablePipe): losses during training. threshold (float): Cutoff to consider a prediction "positive". - DOCS: https://nightly.spacy.io/api/textcategorizer#init + DOCS: https://spacy.io/api/textcategorizer#init """ self.vocab = vocab self.model = model @@ -132,33 +129,17 @@ class TextCategorizer(TrainablePipe): def labels(self) -> Tuple[str]: """RETURNS (Tuple[str]): The labels currently added to the component. - DOCS: https://nightly.spacy.io/api/textcategorizer#labels + DOCS: https://spacy.io/api/textcategorizer#labels """ - return tuple(self.cfg["labels"]) + return tuple(self.cfg["labels"]) # type: ignore[arg-type, return-value] @property def label_data(self) -> List[str]: """RETURNS (List[str]): Information about the component's labels. - DOCS: https://nightly.spacy.io/api/textcategorizer#label_data + DOCS: https://spacy.io/api/textcategorizer#label_data """ - return self.labels - - def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: - """Apply the pipe to a stream of documents. This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://nightly.spacy.io/api/textcategorizer#pipe - """ - for docs in util.minibatch(stream, size=batch_size): - scores = self.predict(docs) - self.set_annotations(docs, scores) - yield from docs + return self.labels # type: ignore[return-value] def predict(self, docs: Iterable[Doc]): """Apply the pipeline's model to a batch of docs, without modifying them. @@ -166,13 +147,13 @@ class TextCategorizer(TrainablePipe): docs (Iterable[Doc]): The documents to predict. RETURNS: The models prediction for each document. - DOCS: https://nightly.spacy.io/api/textcategorizer#predict + DOCS: https://spacy.io/api/textcategorizer#predict """ if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. tensors = [doc.tensor for doc in docs] xp = get_array_module(tensors) - scores = xp.zeros((len(docs), len(self.labels))) + scores = xp.zeros((len(list(docs)), len(self.labels))) return scores scores = self.model.predict(docs) scores = self.model.ops.asarray(scores) @@ -184,7 +165,7 @@ class TextCategorizer(TrainablePipe): docs (Iterable[Doc]): The documents to modify. scores: The scores to set, produced by TextCategorizer.predict. - DOCS: https://nightly.spacy.io/api/textcategorizer#set_annotations + DOCS: https://spacy.io/api/textcategorizer#set_annotations """ for i, doc in enumerate(docs): for j, label in enumerate(self.labels): @@ -199,8 +180,7 @@ class TextCategorizer(TrainablePipe): losses: Optional[Dict[str, float]] = None, ) -> Dict[str, float]: """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict, get_loss and - set_annotations. + updating the pipe's model. Delegates to predict and get_loss. examples (Iterable[Example]): A batch of Example objects. drop (float): The dropout rate. 
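A compact usage sketch for the single-label component described above: two mutually exclusive, made-up labels, a couple of toy examples, and reading `doc.cats` afterwards.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat")   # single-label: exactly one true category per doc

train_data = [
    ("I loved this film", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("utterly boring", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in train_data]

optimizer = nlp.initialize(get_examples=lambda: examples)
for _ in range(20):
    losses = nlp.update(examples, sgd=optimizer)

doc = nlp("I loved it")
print(doc.cats)   # one score per label, set by set_annotations()
```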
@@ -209,7 +189,7 @@ class TextCategorizer(TrainablePipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. - DOCS: https://nightly.spacy.io/api/textcategorizer#update + DOCS: https://spacy.io/api/textcategorizer#update """ if losses is None: losses = {} @@ -226,8 +206,6 @@ class TextCategorizer(TrainablePipe): if sgd is not None: self.finish_update(sgd) losses[self.name] += loss - docs = [eg.predicted for eg in examples] - self.set_annotations(docs, scores=scores) return losses def rehearse( @@ -250,10 +228,11 @@ class TextCategorizer(TrainablePipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. - DOCS: https://nightly.spacy.io/api/textcategorizer#rehearse + DOCS: https://spacy.io/api/textcategorizer#rehearse """ - if losses is not None: - losses.setdefault(self.name, 0.0) + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) if self._rehearsal_model is None: return losses validate_examples(examples, "TextCategorizer.rehearse") @@ -269,23 +248,23 @@ class TextCategorizer(TrainablePipe): bp_scores(gradient) if sgd is not None: self.finish_update(sgd) - if losses is not None: - losses[self.name] += (gradient ** 2).sum() + losses[self.name] += (gradient ** 2).sum() return losses def _examples_to_truth( - self, examples: List[Example] + self, examples: Iterable[Example] ) -> Tuple[numpy.ndarray, numpy.ndarray]: - truths = numpy.zeros((len(examples), len(self.labels)), dtype="f") - not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") + nr_examples = len(list(examples)) + truths = numpy.zeros((nr_examples, len(self.labels)), dtype="f") + not_missing = numpy.ones((nr_examples, len(self.labels)), dtype="f") for i, eg in enumerate(examples): for j, label in enumerate(self.labels): if label in eg.reference.cats: truths[i, j] = eg.reference.cats[label] else: not_missing[i, j] = 0.0 - truths = self.model.ops.asarray(truths) - return truths, not_missing + truths = self.model.ops.asarray(truths) # type: ignore + return truths, not_missing # type: ignore def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]: """Find the loss and gradient of loss for the batch of documents and @@ -295,12 +274,12 @@ class TextCategorizer(TrainablePipe): scores: Scores representing the model's predictions. RETURNS (Tuple[float, float]): The loss and the gradient. - DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss + DOCS: https://spacy.io/api/textcategorizer#get_loss """ validate_examples(examples, "TextCategorizer.get_loss") self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) - not_missing = self.model.ops.asarray(not_missing) + not_missing = self.model.ops.asarray(not_missing) # type: ignore d_scores = (scores - truths) / scores.shape[0] d_scores *= not_missing mean_square_error = (d_scores ** 2).sum(axis=1).mean() @@ -312,14 +291,16 @@ class TextCategorizer(TrainablePipe): label (str): The label to add. RETURNS (int): 0 if label is already present, otherwise 1. 
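The `_examples_to_truth` / `get_loss` pair above boils down to a batch-averaged difference that is zeroed out wherever the reference had no annotation for a label. The same arithmetic in NumPy, with toy values:

```python
import numpy

scores = numpy.asarray([[0.8, 0.2], [0.6, 0.4]], dtype="f")       # 2 docs x 2 labels
truths = numpy.asarray([[1.0, 0.0], [0.0, 0.0]], dtype="f")       # gold cats
not_missing = numpy.asarray([[1.0, 1.0], [1.0, 0.0]], dtype="f")  # 0.0 = label absent in eg.reference.cats

d_scores = (scores - truths) / scores.shape[0]  # averaged over the batch
d_scores *= not_missing                         # missing labels contribute no gradient
mean_square_error = (d_scores ** 2).sum(axis=1).mean()
print(mean_square_error)
print(d_scores)
```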
- DOCS: https://nightly.spacy.io/api/textcategorizer#add_label + DOCS: https://spacy.io/api/textcategorizer#add_label """ if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 self._allow_extra_label() - self.cfg["labels"].append(label) + self.cfg["labels"].append(label) # type: ignore[attr-defined] + if self.model and "resize_output" in self.model.attrs: + self.model = self.model.attrs["resize_output"](self.model, len(self.labels)) self.vocab.strings.add(label) return 1 @@ -328,7 +309,7 @@ class TextCategorizer(TrainablePipe): get_examples: Callable[[], Iterable[Example]], *, nlp: Optional[Language] = None, - labels: Optional[Dict] = None, + labels: Optional[Iterable[str]] = None, positive_label: Optional[str] = None, ) -> None: """Initialize the pipe for training, using a representative set @@ -337,11 +318,13 @@ class TextCategorizer(TrainablePipe): get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. - labels: The labels to add to the component, typically generated by the + labels (Optional[Iterable[str]]): The labels to add to the component, typically generated by the `init labels` command. If no labels are provided, the get_examples callback is used to extract the labels from the data. + positive_label (Optional[str]): The positive label for a binary task with exclusive classes, + `None` otherwise and by default. - DOCS: https://nightly.spacy.io/api/textcategorizer#initialize + DOCS: https://spacy.io/api/textcategorizer#initialize """ validate_get_examples(get_examples, "TextCategorizer.initialize") self._validate_categories(get_examples()) @@ -352,6 +335,8 @@ class TextCategorizer(TrainablePipe): else: for label in labels: self.add_label(label) + if len(self.labels) < 2: + raise ValueError(Errors.E867) if positive_label is not None: if positive_label not in self.labels: err = Errors.E920.format(pos_label=positive_label, labels=self.labels) @@ -374,21 +359,21 @@ class TextCategorizer(TrainablePipe): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
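For a binary task, the positive label can be handed to `initialize` directly (in a training run it normally comes from the config). A sketch with made-up labels, assuming `cfg["positive_label"]` is stored by `initialize` as the later scoring default:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
examples = [
    Example.from_dict(nlp.make_doc("great stuff"), {"cats": {"POS": 1.0, "NEG": 0.0}}),
    Example.from_dict(nlp.make_doc("waste of time"), {"cats": {"POS": 0.0, "NEG": 1.0}}),
]
# The positive label must be one of the labels found in the data, otherwise
# initialize() raises E920.
textcat.initialize(lambda: examples, nlp=nlp, positive_label="POS")
print(textcat.labels, textcat.cfg.get("positive_label"))
```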
- DOCS: https://nightly.spacy.io/api/textcategorizer#score + DOCS: https://spacy.io/api/textcategorizer#score """ validate_examples(examples, "TextCategorizer.score") self._validate_categories(examples) + kwargs.setdefault("threshold", self.cfg["threshold"]) + kwargs.setdefault("positive_label", self.cfg["positive_label"]) return Scorer.score_cats( examples, "cats", labels=self.labels, multi_label=False, - positive_label=self.cfg["positive_label"], - threshold=self.cfg["threshold"], **kwargs, ) - def _validate_categories(self, examples: List[Example]): + def _validate_categories(self, examples: Iterable[Example]): """Check whether the provided examples all have single-label cats annotations.""" for ex in examples: if list(ex.reference.cats.values()).count(1.0) > 1: diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 41c5a1335..65961a38c 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -21,7 +21,7 @@ multi_label_default_config = """ @architectures = "spacy.Tok2Vec.v1" [model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 64 rows = [2000, 2000, 1000, 1000, 1000, 1000] attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] @@ -35,7 +35,7 @@ maxout_pieces = 3 depth = 2 [model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -44,7 +44,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod multi_label_bow_config = """ [model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -52,11 +52,11 @@ no_output_layer = false multi_label_cnn_config = """ [model] -@architectures = "spacy.TextCatCNN.v1" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = false [model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -88,11 +88,10 @@ subword_features = true def make_multilabel_textcat( nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float ) -> "TextCategorizer": - """Create a TextCategorizer compoment. The text categorizer predicts categories - over a whole document. It can learn one or more labels, and the labels can - be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive - (i.e. zero or more labels may be true per doc). The multi-label setting is - controlled by the model instance that's provided. + """Create a TextCategorizer component. The text categorizer predicts categories + over a whole document. It can learn one or more labels, and the labels are considered + to be non-mutually exclusive, which means that there can be zero or more labels + per doc). model (Model[List[Doc], List[Floats2d]]): A model instance that predicts scores for each category. @@ -104,7 +103,7 @@ def make_multilabel_textcat( class MultiLabel_TextCategorizer(TextCategorizer): """Pipeline component for multi-label text classification. - DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer + DOCS: https://spacy.io/api/textcategorizer """ def __init__( @@ -123,7 +122,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): losses during training. threshold (float): Cutoff to consider a prediction "positive". 
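The multi-label variant below scores every label independently, so several categories can be active on the same doc. A small sketch with invented aspect labels:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat_multilabel")

train_data = [
    ("cheap and fast delivery", {"cats": {"PRICE": 1.0, "SPEED": 1.0}}),
    ("the delivery was slow", {"cats": {"PRICE": 0.0, "SPEED": 0.0}}),
]
examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in train_data]

nlp.initialize(get_examples=lambda: examples)
doc = nlp("fast but pricey")
print(doc.cats)   # independent scores; any number of labels may clear the threshold
```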
- DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#init + DOCS: https://spacy.io/api/textcategorizer#init """ self.vocab = vocab self.model = model @@ -132,12 +131,12 @@ class MultiLabel_TextCategorizer(TextCategorizer): cfg = {"labels": [], "threshold": threshold} self.cfg = dict(cfg) - def initialize( + def initialize( # type: ignore[override] self, get_examples: Callable[[], Iterable[Example]], *, nlp: Optional[Language] = None, - labels: Optional[Dict] = None, + labels: Optional[Iterable[str]] = None, ): """Initialize the pipe for training, using a representative set of data examples. @@ -149,7 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): `init labels` command. If no labels are provided, the get_examples callback is used to extract the labels from the data. - DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#initialize + DOCS: https://spacy.io/api/textcategorizer#initialize """ validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize") if labels is None: @@ -173,19 +172,19 @@ class MultiLabel_TextCategorizer(TextCategorizer): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. - DOCS: https://nightly.spacy.io/api/multilabel_textcategorizer#score + DOCS: https://spacy.io/api/textcategorizer#score """ validate_examples(examples, "MultiLabel_TextCategorizer.score") + kwargs.setdefault("threshold", self.cfg["threshold"]) return Scorer.score_cats( examples, "cats", labels=self.labels, multi_label=True, - threshold=self.cfg["threshold"], **kwargs, ) - def _validate_categories(self, examples: List[Example]): + def _validate_categories(self, examples: Iterable[Example]): """This component allows any type of single- or multi-label annotations. This method overwrites the more strict one from 'textcat'.""" pass diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index a95fc8927..cb601e5dc 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,4 +1,4 @@ -from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List +from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any from thinc.api import Model, set_dropout_rate, Optimizer, Config from itertools import islice @@ -8,12 +8,10 @@ from ..tokens import Doc from ..vocab import Vocab from ..language import Language from ..errors import Errors -from ..util import minibatch - default_model_config = """ [model] -@architectures = "spacy.HashEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 @@ -57,13 +55,13 @@ class Tok2Vec(TrainablePipe): a list of Doc objects as input, and output a list of 2d float arrays. name (str): The component instance name. - DOCS: https://nightly.spacy.io/api/tok2vec#init + DOCS: https://spacy.io/api/tok2vec#init """ self.vocab = vocab self.model = model self.name = name - self.listener_map = {} - self.cfg = {} + self.listener_map: Dict[str, List["Tok2VecListener"]] = {} + self.cfg: Dict[str, Any] = {} @property def listeners(self) -> List["Tok2VecListener"]: @@ -82,7 +80,19 @@ class Tok2Vec(TrainablePipe): def add_listener(self, listener: "Tok2VecListener", component_name: str) -> None: """Add a listener for a downstream component. 
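The listener map maintained below is what lets several components share one `tok2vec`. A sketch of that wiring from Python, assuming the `spacy.Tok2VecListener.v1` architecture with its `width`/`upstream` settings and assuming the config dict passed to `add_pipe` replaces the tagger's default embedding block; in practice this wiring usually lives in the training config file.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")   # shared embedding component (HashEmbedCNN by default)

# The tagger's model embeds a listener instead of its own tok2vec; at pipeline
# construction time the listener is registered with the upstream component.
nlp.add_pipe("tagger", config={
    "model": {
        "@architectures": "spacy.Tagger.v1",
        "tok2vec": {
            "@architectures": "spacy.Tok2VecListener.v1",
            "width": 96,        # should match the shared tok2vec's output width
            "upstream": "*",    # listen to any upstream tok2vec component
        },
    },
})
print(nlp.pipe_names)
```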
Usually internals.""" self.listener_map.setdefault(component_name, []) - self.listener_map[component_name].append(listener) + if listener not in self.listener_map[component_name]: + self.listener_map[component_name].append(listener) + + def remove_listener(self, listener: "Tok2VecListener", component_name: str) -> bool: + """Remove a listener for a downstream component. Usually internals.""" + if component_name in self.listener_map: + if listener in self.listener_map[component_name]: + self.listener_map[component_name].remove(listener) + # If no listeners are left, remove entry + if not self.listener_map[component_name]: + del self.listener_map[component_name] + return True + return False def find_listeners(self, component) -> None: """Walk over a model of a processing component, looking for layers that @@ -99,36 +109,6 @@ class Tok2Vec(TrainablePipe): if isinstance(node, Tok2VecListener) and node.upstream_name in names: self.add_listener(node, component.name) - def __call__(self, doc: Doc) -> Doc: - """Add context-sensitive embeddings to the Doc.tensor attribute, allowing - them to be used as features by downstream components. - - docs (Doc): The Doc to process. - RETURNS (Doc): The processed Doc. - - DOCS: https://nightly.spacy.io/api/tok2vec#call - """ - tokvecses = self.predict([doc]) - self.set_annotations([doc], tokvecses) - return doc - - def pipe(self, stream: Iterator[Doc], *, batch_size: int = 128) -> Iterator[Doc]: - """Apply the pipe to a stream of documents. This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://nightly.spacy.io/api/tok2vec#pipe - """ - for docs in minibatch(stream, batch_size): - docs = list(docs) - tokvecses = self.predict(docs) - self.set_annotations(docs, tokvecses) - yield from docs - def predict(self, docs: Iterable[Doc]): """Apply the pipeline's model to a batch of docs, without modifying them. Returns a single tensor for a batch of documents. @@ -136,12 +116,12 @@ class Tok2Vec(TrainablePipe): docs (Iterable[Doc]): The documents to predict. RETURNS: Vector representations for each token in the documents. - DOCS: https://nightly.spacy.io/api/tok2vec#predict + DOCS: https://spacy.io/api/tok2vec#predict """ tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: - listener.receive(batch_id, tokvecs, lambda dX: []) + listener.receive(batch_id, tokvecs, _empty_backprop) return tokvecs def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None: @@ -150,7 +130,7 @@ class Tok2Vec(TrainablePipe): docs (Iterable[Doc]): The documents to modify. tokvecses: The tensors to set, produced by Tok2Vec.predict. - DOCS: https://nightly.spacy.io/api/tok2vec#set_annotations + DOCS: https://spacy.io/api/tok2vec#set_annotations """ for doc, tokvecs in zip(docs, tokvecses): assert tokvecs.shape[0] == len(doc) @@ -174,7 +154,7 @@ class Tok2Vec(TrainablePipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. 
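After `set_annotations`, the shared embeddings live on `doc.tensor`, one row per token. A quick check on an untrained but initialized pipeline; the placeholder example text is arbitrary.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
examples = [Example.from_dict(nlp.make_doc("a few words"), {})]  # dummy sample
nlp.initialize(get_examples=lambda: examples)

doc = nlp("shared token vectors")
print(doc.tensor.shape)   # (number of tokens, tok2vec width)
```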
- DOCS: https://nightly.spacy.io/api/tok2vec#update + DOCS: https://spacy.io/api/tok2vec#update """ if losses is None: losses = {} @@ -193,6 +173,7 @@ class Tok2Vec(TrainablePipe): for i in range(len(one_d_tokvecs)): d_tokvecs[i] += one_d_tokvecs[i] losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] def backprop(one_d_tokvecs): """Callback to actually do the backprop. Passed to last listener.""" @@ -207,7 +188,6 @@ class Tok2Vec(TrainablePipe): listener.receive(batch_id, tokvecs, accumulate_gradient) if self.listeners: self.listeners[-1].receive(batch_id, tokvecs, backprop) - self.set_annotations(docs, tokvecs) return losses def get_loss(self, examples, scores) -> None: @@ -226,7 +206,7 @@ class Tok2Vec(TrainablePipe): returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/tok2vec#initialize + DOCS: https://spacy.io/api/tok2vec#initialize """ validate_get_examples(get_examples, "Tok2Vec.initialize") doc_sample = [] @@ -265,12 +245,12 @@ class Tok2VecListener(Model): """ Model.__init__(self, name=self.name, forward=forward, dims={"nO": width}) self.upstream_name = upstream_name - self._batch_id = None + self._batch_id: Optional[int] = None self._outputs = None self._backprop = None @classmethod - def get_batch_id(cls, inputs: List[Doc]) -> int: + def get_batch_id(cls, inputs: Iterable[Doc]) -> int: """Calculate a content-sensitive hash of the batch of documents, to check whether the next batch of documents is unexpected. """ @@ -312,12 +292,18 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): # of data. # When the components batch differently, we don't receive a matching # prediction from the upstream, so we can't predict. - if not all(doc.tensor.size for doc in inputs): - # But we do need to do *something* if the tensor hasn't been set. - # The compromise is to at least return data of the right shape, - # so the output is valid. - width = model.get_dim("nO") - outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs] - else: - outputs = [doc.tensor for doc in inputs] + outputs = [] + width = model.get_dim("nO") + for doc in inputs: + if doc.tensor.size == 0: + # But we do need to do *something* if the tensor hasn't been set. + # The compromise is to at least return data of the right shape, + # so the output is valid. + outputs.append(model.ops.alloc2f(len(doc), width)) + else: + outputs.append(doc.tensor) return outputs, lambda dX: [] + + +def _empty_backprop(dX): # for pickling + return [] diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 1abd6b43e..76b0733cf 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -20,7 +20,7 @@ cdef class TrainablePipe(Pipe): from it and it defines the interface that components should follow to function as trainable components in a spaCy pipeline. - DOCS: https://nightly.spacy.io/api/pipe + DOCS: https://spacy.io/api/pipe """ def __init__(self, vocab: Vocab, model: Model, name: str, **cfg): """Initialize a pipeline component. @@ -28,9 +28,9 @@ cdef class TrainablePipe(Pipe): vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name. - **cfg: Additonal settings and config parameters. + **cfg: Additional settings and config parameters. 
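The reworked listener `forward` above now falls back per document: any doc whose tensor was never set gets a correctly shaped block of zeros instead of derailing the whole batch. The same logic, isolated in NumPy:

```python
import numpy

width = 96                     # model.get_dim("nO") in the listener
doc_lengths = [4, 3]           # token counts of two docs in the batch
doc_tensors = [numpy.ones((4, width), dtype="f"),   # upstream tok2vec ran
               numpy.zeros((0, 0), dtype="f")]      # tensor never set

outputs = []
for n_tokens, tensor in zip(doc_lengths, doc_tensors):
    if tensor.size == 0:
        # Fall back to zeros of the right shape so downstream layers stay valid.
        outputs.append(numpy.zeros((n_tokens, width), dtype="f"))
    else:
        outputs.append(tensor)
print([out.shape for out in outputs])   # [(4, 96), (3, 96)]
```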
- DOCS: https://nightly.spacy.io/api/pipe#init + DOCS: https://spacy.io/api/pipe#init """ self.vocab = vocab self.model = model @@ -45,11 +45,15 @@ cdef class TrainablePipe(Pipe): docs (Doc): The Doc to process. RETURNS (Doc): The processed Doc. - DOCS: https://nightly.spacy.io/api/pipe#call + DOCS: https://spacy.io/api/pipe#call """ - scores = self.predict([doc]) - self.set_annotations([doc], scores) - return doc + error_handler = self.get_error_handler() + try: + scores = self.predict([doc]) + self.set_annotations([doc], scores) + return doc + except Exception as e: + error_handler(self.name, self, [doc], e) def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under @@ -58,14 +62,21 @@ cdef class TrainablePipe(Pipe): stream (Iterable[Doc]): A stream of documents. batch_size (int): The number of documents to buffer. + error_handler (Callable[[str, List[Doc], Exception], Any]): Function that + deals with a failing batch of documents. The default function just reraises + the exception. YIELDS (Doc): Processed documents in order. - DOCS: https://nightly.spacy.io/api/pipe#pipe + DOCS: https://spacy.io/api/pipe#pipe """ + error_handler = self.get_error_handler() for docs in util.minibatch(stream, size=batch_size): - scores = self.predict(docs) - self.set_annotations(docs, scores) - yield from docs + try: + scores = self.predict(docs) + self.set_annotations(docs, scores) + yield from docs + except Exception as e: + error_handler(self.name, self, docs, e) def predict(self, docs: Iterable[Doc]): """Apply the pipeline's model to a batch of docs, without modifying them. @@ -74,7 +85,7 @@ cdef class TrainablePipe(Pipe): docs (Iterable[Doc]): The documents to predict. RETURNS: Vector representations of the predictions. - DOCS: https://nightly.spacy.io/api/pipe#predict + DOCS: https://spacy.io/api/pipe#predict """ raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="predict", name=self.name)) @@ -84,18 +95,18 @@ cdef class TrainablePipe(Pipe): docs (Iterable[Doc]): The documents to modify. scores: The scores to assign. - DOCS: https://nightly.spacy.io/api/pipe#set_annotations + DOCS: https://spacy.io/api/pipe#set_annotations """ raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="set_annotations", name=self.name)) def update(self, examples: Iterable["Example"], - *, drop: float=0.0, + *, + drop: float=0.0, sgd: Optimizer=None, losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict, get_loss and - set_annotations. + updating the pipe's model. Delegates to predict and get_loss. examples (Iterable[Example]): A batch of Example objects. drop (float): The dropout rate. @@ -104,7 +115,7 @@ cdef class TrainablePipe(Pipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. - DOCS: https://nightly.spacy.io/api/pipe#update + DOCS: https://spacy.io/api/pipe#update """ if losses is None: losses = {} @@ -122,8 +133,6 @@ cdef class TrainablePipe(Pipe): if sgd not in (None, False): self.finish_update(sgd) losses[self.name] += loss - docs = [eg.predicted for eg in examples] - self.set_annotations(docs, scores=scores) return losses def rehearse(self, @@ -143,7 +152,7 @@ cdef class TrainablePipe(Pipe): Updated using the component name as the key. RETURNS (Dict[str, float]): The updated losses dictionary. 
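With `__call__` and `pipe` now routed through the component's error handler, a custom handler can log and skip a failing batch instead of re-raising. A sketch of the handler signature used above, assuming the component exposes `set_error_handler`/`get_error_handler`; the default handler simply raises.

```python
import spacy

def log_and_skip(proc_name, proc, docs, e):
    # Matches the call error_handler(self.name, self, docs, e) in the code above.
    print(f"component {proc_name!r} failed on {len(docs)} doc(s): {e}")

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
tagger.set_error_handler(log_and_skip)   # per-component handler
# nlp.set_error_handler(log_and_skip)    # or apply it to every component at once
print(tagger.get_error_handler())
```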
- DOCS: https://nightly.spacy.io/api/pipe#rehearse + DOCS: https://spacy.io/api/pipe#rehearse """ pass @@ -155,7 +164,7 @@ cdef class TrainablePipe(Pipe): scores: Scores representing the model's predictions. RETURNS (Tuple[float, float]): The loss and the gradient. - DOCS: https://nightly.spacy.io/api/pipe#get_loss + DOCS: https://spacy.io/api/pipe#get_loss """ raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name)) @@ -164,7 +173,7 @@ cdef class TrainablePipe(Pipe): RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/pipe#create_optimizer + DOCS: https://spacy.io/api/pipe#create_optimizer """ return util.create_default_optimizer() @@ -178,7 +187,7 @@ cdef class TrainablePipe(Pipe): returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/pipe#initialize + DOCS: https://spacy.io/api/pipe#initialize """ raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="initialize", name=self.name)) @@ -191,7 +200,7 @@ cdef class TrainablePipe(Pipe): label (str): The label to add. RETURNS (int): 0 if label is already present, otherwise 1. - DOCS: https://nightly.spacy.io/api/pipe#add_label + DOCS: https://spacy.io/api/pipe#add_label """ raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name)) @@ -205,7 +214,12 @@ cdef class TrainablePipe(Pipe): def _allow_extra_label(self) -> None: """Raise an error if the component can not add any more labels.""" - if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels): + nO = None + if self.model.has_dim("nO"): + nO = self.model.get_dim("nO") + elif self.model.has_ref("output_layer") and self.model.get_ref("output_layer").has_dim("nO"): + nO = self.model.get_ref("output_layer").get_dim("nO") + if nO is not None and nO == len(self.labels): if not self.is_resizable: raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))) @@ -221,7 +235,7 @@ cdef class TrainablePipe(Pipe): params (dict): The parameter values to use in the model. - DOCS: https://nightly.spacy.io/api/pipe#use_params + DOCS: https://spacy.io/api/pipe#use_params """ with self.model.use_params(params): yield @@ -233,7 +247,7 @@ cdef class TrainablePipe(Pipe): sgd (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/pipe#finish_update + DOCS: https://spacy.io/api/pipe#finish_update """ self.model.finish_update(sgd) @@ -253,13 +267,13 @@ cdef class TrainablePipe(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (bytes): The serialized object. - DOCS: https://nightly.spacy.io/api/pipe#to_bytes + DOCS: https://spacy.io/api/pipe#to_bytes """ self._validate_serialization_attrs() serialize = {} if hasattr(self, "cfg") and self.cfg is not None: serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["vocab"] = self.vocab.to_bytes + serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) serialize["model"] = self.model.to_bytes return util.to_bytes(serialize, exclude) @@ -269,7 +283,7 @@ cdef class TrainablePipe(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (TrainablePipe): The loaded object. 
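A round-trip sketch for the byte serialization shown above, including the new behaviour of forwarding `exclude` to the vocab. Tok2vec is used here because it initializes without labels; the text and exclusion list are illustrative.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
examples = [Example.from_dict(nlp.make_doc("a few words"), {})]
nlp.initialize(get_examples=lambda: examples)       # allocate the model weights
tok2vec = nlp.get_pipe("tok2vec")

full = tok2vec.to_bytes()                           # cfg + model + vocab
slim = tok2vec.to_bytes(exclude=["vocab"])          # exclude now also reaches Vocab.to_bytes
print(len(full) > len(slim))

nlp2 = spacy.blank("en")
nlp2.add_pipe("tok2vec").from_bytes(full)           # load weights into a freshly built model
```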
- DOCS: https://nightly.spacy.io/api/pipe#from_bytes + DOCS: https://spacy.io/api/pipe#from_bytes """ self._validate_serialization_attrs() @@ -282,7 +296,7 @@ cdef class TrainablePipe(Pipe): deserialize = {} if hasattr(self, "cfg") and self.cfg is not None: deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) deserialize["model"] = load_model util.from_bytes(bytes_data, deserialize, exclude) return self @@ -293,13 +307,13 @@ cdef class TrainablePipe(Pipe): path (str / Path): Path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/pipe#to_disk + DOCS: https://spacy.io/api/pipe#to_disk """ self._validate_serialization_attrs() serialize = {} if hasattr(self, "cfg") and self.cfg is not None: serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) @@ -310,20 +324,21 @@ cdef class TrainablePipe(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (TrainablePipe): The loaded object. - DOCS: https://nightly.spacy.io/api/pipe#from_disk + DOCS: https://spacy.io/api/pipe#from_disk """ self._validate_serialization_attrs() def load_model(p): try: - self.model.from_bytes(p.open("rb").read()) + with open(p, "rb") as mfile: + self.model.from_bytes(mfile.read()) except AttributeError: raise ValueError(Errors.E149) from None deserialize = {} if hasattr(self, "cfg") and self.cfg is not None: deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 047805239..945652cad 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -7,7 +7,6 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free import random -from typing import Optional import srsly from thinc.api import set_dropout_rate, CupyOps, get_array_module @@ -48,15 +47,43 @@ class Parser(TrainablePipe): beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), + incorrect_spans_key=None ): """Create a Parser. vocab (Vocab): The vocabulary object. Must be shared with documents to be processed. The value is set to the `.vocab` attribute. - **cfg: Configuration parameters. Set to the `.cfg` attribute. - If it doesn't include a value for 'moves', a new instance is - created with `self.TransitionSystem()`. This defines how the - parse-state is created, updated and evaluated. + model (Model): The model for the transition-based parser. The model needs + to have a specific substructure of named components --- see the + spacy.ml.tb_framework.TransitionModel for details. + name (str): The name of the pipeline component + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. 
+ update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. + min_action_freq (int): The minimum frequency of labelled actions to retain. + Rarer labelled actions have their label backed-off to "dep". While this + primarily affects the label accuracy, it can also affect the attachment + structure, as the labels are used to represent the pseudo-projectivity + transformation. + learn_tokens (bool): Whether to learn to merge subtokens that are split + relative to the gold standard. Experimental. + beam_width (int): The number of candidate analyses to maintain. + beam_density (float): The minimum ratio between the scores of the first and + last candidates in the beam. This allows the parser to avoid exploring + candidates that are too far behind. This is mostly intended to improve + efficiency, but it can also improve accuracy as deeper search is not + always better. + beam_update_prob (float): The chance of making a beam update, instead of a + greedy update. Greedy updates are an approximation for the beam updates, + and are faster to compute. + multitasks: additional multi-tasking components. Experimental. + incorrect_spans_key (Optional[str]): Identifies spans that are known + to be incorrect entity annotations. The incorrect entity annotations + can be stored in the span group, under this key. """ self.vocab = vocab self.name = name @@ -68,11 +95,16 @@ class Parser(TrainablePipe): "learn_tokens": learn_tokens, "beam_width": beam_width, "beam_density": beam_density, - "beam_update_prob": beam_update_prob + "beam_update_prob": beam_update_prob, + "incorrect_spans_key": incorrect_spans_key } if moves is None: - # defined by EntityRecognizer as a BiluoPushDown - moves = self.TransitionSystem(self.vocab.strings) + # EntityRecognizer -> BiluoPushDown + # DependencyParser -> ArcEager + moves = self.TransitionSystem( + self.vocab.strings, + incorrect_spans_key=incorrect_spans_key + ) self.moves = moves self.model = model if self.moves.n_moves != 0: @@ -119,6 +151,10 @@ class Parser(TrainablePipe): # Available for subclasses, e.g. to deprojectivize return [] + @property + def incorrect_spans_key(self): + return self.cfg["incorrect_spans_key"] + def add_label(self, label): resized = False for action in self.moves.action_types: @@ -131,6 +167,23 @@ class Parser(TrainablePipe): return 1 return 0 + def _ensure_labels_are_added(self, docs): + """Ensure that all labels for a batch of docs are added.""" + resized = False + labels = set() + for doc in docs: + labels.update(self.moves.get_doc_labels(doc)) + for label in labels: + for action in self.moves.action_types: + added = self.moves.add_action(action, label) + if added: + self.vocab.strings.add(label) + resized = True + if resized: + self._resize() + return 1 + return 0 + def _resize(self): self.model.attrs["resize_output"](self.model, self.moves.n_moves) if self._rehearsal_model not in (True, False, None): @@ -157,39 +210,37 @@ class Parser(TrainablePipe): with self.model.use_params(params): yield - def __call__(self, Doc doc): - """Apply the parser or entity recognizer, setting the annotations onto - the `Doc` object. - - doc (Doc): The document to be processed. - """ - states = self.predict([doc]) - self.set_annotations([doc], states) - return doc - def pipe(self, docs, *, int batch_size=256): """Process a stream of documents. 
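The `incorrect_spans_key` setting documented above can be passed through the factory config; spans stored under that key on the reference docs mark entity annotations that are known to be wrong. A minimal sketch, with an arbitrary key name:

```python
import spacy

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})
print(ner.incorrect_spans_key)   # exposed by the new property above
```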
stream: The sequence of documents to process. batch_size (int): Number of documents to accumulate into a working set. + error_handler (Callable[[str, List[Doc], Exception], Any]): Function that + deals with a failing batch of documents. The default function just reraises + the exception. + YIELDS (Doc): Documents, in order. """ cdef Doc doc + error_handler = self.get_error_handler() for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) - by_length = sorted(batch, key=lambda doc: len(doc)) - for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): - subbatch = list(subbatch) - parse_states = self.predict(subbatch) - self.set_annotations(subbatch, parse_states) - yield from batch_in_order + try: + by_length = sorted(batch, key=lambda doc: len(doc)) + for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): + subbatch = list(subbatch) + parse_states = self.predict(subbatch) + self.set_annotations(subbatch, parse_states) + yield from batch_in_order + except Exception as e: + error_handler(self.name, self, batch_in_order, e) + def predict(self, docs): if isinstance(docs, Doc): docs = [docs] if not any(len(doc) for doc in docs): result = self.moves.init_batch(docs) - self._resize() return result if self.cfg["beam_width"] == 1: return self.greedy_parse(docs, drop=0.0) @@ -229,6 +280,9 @@ class Parser(TrainablePipe): losses = {} losses.setdefault(self.name, 0.) validate_examples(examples, "Parser.update") + self._ensure_labels_are_added( + [eg.x for eg in examples] + [eg.y for eg in examples] + ) for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) # We need to take care to act on the whole batch, because we might be @@ -313,10 +367,7 @@ class Parser(TrainablePipe): def initialize(self, get_examples, nlp=None, labels=None): validate_get_examples(get_examples, "Parser.initialize") - lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) - if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: - langs = ", ".join(util.LEXEME_NORM_LANGS) - util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs)) + util.check_lexeme_norms(self.vocab, "parser or NER") if labels is not None: actions = dict(labels) else: @@ -355,7 +406,7 @@ class Parser(TrainablePipe): def to_disk(self, path, exclude=tuple()): serializers = { "model": lambda p: (self.model.to_disk(p) if self.model is not True else True), - "vocab": lambda p: self.vocab.to_disk(p), + "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude), "moves": lambda p: self.moves.to_disk(p, exclude=["strings"]), "cfg": lambda p: srsly.write_json(p, self.cfg) } @@ -363,7 +414,7 @@ class Parser(TrainablePipe): def from_disk(self, path, exclude=tuple()): deserializers = { - "vocab": lambda p: self.vocab.from_disk(p), + "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude), "moves": lambda p: self.moves.from_disk(p, exclude=["strings"]), "cfg": lambda p: self.cfg.update(srsly.read_json(p)), "model": lambda p: None, @@ -377,13 +428,13 @@ class Parser(TrainablePipe): self._resize() self.model.from_bytes(bytes_data) except AttributeError: - raise ValueError(Errors.E149) from None + raise ValueError(Errors.E149) return self def to_bytes(self, exclude=tuple()): serializers = { "model": lambda: (self.model.to_bytes()), - "vocab": lambda: self.vocab.to_bytes(), + "vocab": lambda: self.vocab.to_bytes(exclude=exclude), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) 
} @@ -391,7 +442,7 @@ class Parser(TrainablePipe): def from_bytes(self, bytes_data, exclude=tuple()): deserializers = { - "vocab": lambda b: self.vocab.from_bytes(b), + "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: None, diff --git a/spacy/py.typed b/spacy/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/schemas.py b/spacy/schemas.py index d041845f3..73ddc45b1 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -4,7 +4,7 @@ from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool from pydantic.main import ModelMetaclass -from thinc.api import Optimizer, ConfigValidationError +from thinc.api import Optimizer, ConfigValidationError, Model from thinc.config import Promise from collections import defaultdict import inspect @@ -17,6 +17,7 @@ if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .language import Language # noqa: F401 from .training import Example # noqa: F401 + from .vocab import Vocab # noqa: F401 # fmt: off @@ -43,7 +44,7 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: for error in errors: err_loc = " -> ".join([str(p) for p in error.get("loc", [])]) data[err_loc].append(error.get("msg")) - return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] + return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type] # Initialization @@ -81,7 +82,7 @@ def get_arg_model( except ValueError: # Typically happens if the method is part of a Cython module without # binding=True. Here we just use an empty model that allows everything. 
- return create_model(name, __config__=ArgSchemaConfigExtra) + return create_model(name, __config__=ArgSchemaConfigExtra) # type: ignore[arg-type, return-value] has_variable = False for param in sig.parameters.values(): if param.name in exclude: @@ -101,8 +102,8 @@ def get_arg_model( default = param.default if param.default != param.empty else default_empty sig_args[param.name] = (annotation, default) is_strict = strict and not has_variable - sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra - return create_model(name, **sig_args) + sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment] + return create_model(name, **sig_args) # type: ignore[arg-type, return-value] def validate_init_settings( @@ -158,6 +159,7 @@ class TokenPatternString(BaseModel): NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") + INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") class Config: extra = "forbid" @@ -174,8 +176,9 @@ class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") - ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset") - ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset") + IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset") + IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset") + INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects") EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=") GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") @@ -195,10 +198,10 @@ class TokenPatternNumber(BaseModel): class TokenPatternOperator(str, Enum): - plus: StrictStr = "+" - start: StrictStr = "*" - question: StrictStr = "?" - exclamation: StrictStr = "!" + plus: StrictStr = StrictStr("+") + start: StrictStr = StrictStr("*") + question: StrictStr = StrictStr("?") + exclamation: StrictStr = StrictStr("!") StringValue = Union[TokenPatternString, StrictStr] @@ -313,6 +316,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training") before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on @@ -353,7 +357,7 @@ class ConfigSchemaPretrain(BaseModel): batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. 
Whole model if empty.") - objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.") + objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.") # fmt: on class Config: @@ -381,7 +385,7 @@ class ConfigSchemaInit(BaseModel): class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp - pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} + pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} # type: ignore[assignment] components: Dict[str, Dict[str, Any]] corpora: Dict[str, Reader] initialize: ConfigSchemaInit @@ -446,6 +450,7 @@ class ProjectConfigCommand(BaseModel): class ProjectConfigSchema(BaseModel): # fmt: off vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") + env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names") assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") diff --git a/spacy/scorer.py b/spacy/scorer.py index dd0cde77b..ebab2382d 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,4 +1,5 @@ -from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING +from typing import Optional, Iterable, Dict, Set, List, Any, Callable, Tuple +from typing import TYPE_CHECKING import numpy as np from collections import defaultdict @@ -20,10 +21,16 @@ MISSING_VALUES = frozenset([None, 0, ""]) class PRFScore: """A precision / recall / F score.""" - def __init__(self) -> None: - self.tp = 0 - self.fp = 0 - self.fn = 0 + def __init__( + self, + *, + tp: int = 0, + fp: int = 0, + fn: int = 0, + ) -> None: + self.tp = tp + self.fp = fp + self.fn = fn def __len__(self) -> int: return self.tp + self.fp + self.fn @@ -68,8 +75,8 @@ class ROCAUCScore: may throw an error.""" def __init__(self) -> None: - self.golds = [] - self.cands = [] + self.golds: List[Any] = [] + self.cands: List[Any] = [] self.saved_score = 0.0 self.saved_score_at_len = 0 @@ -103,11 +110,12 @@ class Scorer: ) -> None: """Initialize the Scorer. - DOCS: https://nightly.spacy.io/api/scorer#init + DOCS: https://spacy.io/api/scorer#init """ - self.nlp = nlp self.cfg = cfg - if not nlp: + if nlp: + self.nlp = nlp + else: nlp = get_lang_class(default_lang)() for pipe in default_pipeline: nlp.add_pipe(pipe) @@ -119,11 +127,11 @@ class Scorer: examples (Iterable[Example]): The predicted annotations + correct annotations. RETURNS (Dict): A dictionary of scores. - DOCS: https://nightly.spacy.io/api/scorer#score + DOCS: https://spacy.io/api/scorer#score """ scores = {} if hasattr(self.nlp.tokenizer, "score"): - scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) + scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore for name, component in self.nlp.pipeline: if hasattr(component, "score"): scores.update(component.score(examples, **self.cfg)) @@ -139,7 +147,7 @@ class Scorer: RETURNS (Dict[str, Any]): A dictionary containing the scores token_acc/p/r/f. 
- DOCS: https://nightly.spacy.io/api/scorer#score_tokenization + DOCS: https://spacy.io/api/scorer#score_tokenization """ acc_score = PRFScore() prf_score = PRFScore() @@ -185,7 +193,7 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, - missing_values: Set[Any] = MISSING_VALUES, + missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] **cfg, ) -> Dict[str, Any]: """Returns an accuracy score for a token-level attribute. @@ -195,10 +203,12 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. + missing_values (Set[Any]): Attribute values to treat as missing annotation + in the reference annotation. RETURNS (Dict[str, Any]): A dictionary containing the accuracy score under the key attr_acc. - DOCS: https://nightly.spacy.io/api/scorer#score_token_attr + DOCS: https://spacy.io/api/scorer#score_token_attr """ tag_score = PRFScore() for example in examples: @@ -234,7 +244,7 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, - missing_values: Set[Any] = MISSING_VALUES, + missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] **cfg, ) -> Dict[str, Any]: """Return PRF scores per feat for a token attribute in UFEATS format. @@ -244,6 +254,8 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. + missing_values (Set[Any]): Attribute values to treat as missing annotation + in the reference annotation. RETURNS (dict): A dictionary containing the per-feat PRF scores under the key attr_per_feat. """ @@ -252,7 +264,7 @@ class Scorer: pred_doc = example.predicted gold_doc = example.reference align = example.alignment - gold_per_feat = {} + gold_per_feat: Dict[str, Set] = {} missing_indices = set() for gold_i, token in enumerate(gold_doc): value = getter(token, attr) @@ -267,7 +279,7 @@ class Scorer: gold_per_feat[field].add((gold_i, feat)) else: missing_indices.add(gold_i) - pred_per_feat = {} + pred_per_feat: Dict[str, Set] = {} for token in pred_doc: if token.orth_.isspace(): continue @@ -305,6 +317,8 @@ class Scorer: *, getter: Callable[[Doc, str], Iterable[Span]] = getattr, has_annotation: Optional[Callable[[Doc], bool]] = None, + labeled: bool = True, + allow_overlap: bool = False, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. @@ -314,17 +328,25 @@ class Scorer: getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If provided, getter(doc, attr) should return the spans for the individual doc. + has_annotation (Optional[Callable[[Doc], bool]]) should return whether a `Doc` + has annotation for this `attr`. Docs without annotation are skipped for + scoring purposes. + labeled (bool): Whether or not to include label information in + the evaluation. If set to 'False', two spans will be considered + equal if their start and end match, irrespective of their label. + allow_overlap (bool): Whether or not to allow overlapping spans. + If set to 'False', the alignment will automatically resolve conflicts. RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under the keys attr_p/r/f and the per-type PRF scores under attr_per_type. 
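Another aside for context: the new `labeled` and `allow_overlap` arguments to `Scorer.score_spans` shown in this hunk allow boundary-only scoring of span groups. A rough usage sketch; the `"cities"` span key and the toy docs are made up for illustration:

```python
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
pred = nlp("London and Paris")
gold = nlp("London and Paris")
pred.spans["cities"] = [Span(pred, 0, 1, label="GPE")]
gold.spans["cities"] = [Span(gold, 0, 1, label="LOC"), Span(gold, 2, 3, label="LOC")]
example = Example(pred, gold)

scores = Scorer.score_spans(
    [example],
    attr="cities",
    getter=lambda doc, attr: doc.spans[attr],  # read spans from doc.spans
    labeled=False,       # compare boundaries only, ignore labels
    allow_overlap=True,  # keep overlapping spans during alignment
)
print(scores["cities_p"], scores["cities_r"], scores["cities_f"])
```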
- DOCS: https://nightly.spacy.io/api/scorer#score_spans + DOCS: https://spacy.io/api/scorer#score_spans """ score = PRFScore() score_per_type = dict() for example in examples: pred_doc = example.predicted gold_doc = example.reference - # Option to handle docs without sents + # Option to handle docs without annotation for this attribute if has_annotation is not None: if not has_annotation(gold_doc): continue @@ -334,7 +356,7 @@ class Scorer: + [k.label_ for k in getter(pred_doc, attr)] ) # Set up all labels for per type scoring and prepare gold per type - gold_per_type = {label: set() for label in labels} + gold_per_type: Dict[str, Set] = {label: set() for label in labels} for label in labels: if label not in score_per_type: score_per_type[label] = PRFScore() @@ -342,33 +364,48 @@ class Scorer: gold_spans = set() pred_spans = set() for span in getter(gold_doc, attr): - gold_span = (span.label_, span.start, span.end - 1) + gold_span: Tuple + if labeled: + gold_span = (span.label_, span.start, span.end - 1) + else: + gold_span = (span.start, span.end - 1) gold_spans.add(gold_span) - gold_per_type[span.label_].add((span.label_, span.start, span.end - 1)) - pred_per_type = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)): - pred_spans.add((span.label_, span.start, span.end - 1)) - pred_per_type[span.label_].add((span.label_, span.start, span.end - 1)) + gold_per_type[span.label_].add(gold_span) + pred_per_type: Dict[str, Set] = {label: set() for label in labels} + for span in example.get_aligned_spans_x2y( + getter(pred_doc, attr), allow_overlap + ): + pred_span: Tuple + if labeled: + pred_span = (span.label_, span.start, span.end - 1) + else: + pred_span = (span.start, span.end - 1) + pred_spans.add(pred_span) + pred_per_type[span.label_].add(pred_span) # Scores per label - for k, v in score_per_type.items(): - if k in pred_per_type: - v.score_set(pred_per_type[k], gold_per_type[k]) + if labeled: + for k, v in score_per_type.items(): + if k in pred_per_type: + v.score_set(pred_per_type[k], gold_per_type[k]) # Score for all labels score.score_set(pred_spans, gold_spans) + # Assemble final result + final_scores: Dict[str, Any] = { + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + } + if labeled: + final_scores[f"{attr}_per_type"] = None if len(score) > 0: - return { - f"{attr}_p": score.precision, - f"{attr}_r": score.recall, - f"{attr}_f": score.fscore, - f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } - else: - return { - f"{attr}_p": None, - f"{attr}_r": None, - f"{attr}_f": None, - f"{attr}_per_type": None, - } + final_scores[f"{attr}_p"] = score.precision + final_scores[f"{attr}_r"] = score.recall + final_scores[f"{attr}_f"] = score.fscore + if labeled: + final_scores[f"{attr}_per_type"] = { + k: v.to_dict() for k, v in score_per_type.items() + } + return final_scores @staticmethod def score_cats( @@ -413,7 +450,7 @@ class Scorer: attr_f_per_type, attr_auc_per_type - DOCS: https://nightly.spacy.io/api/scorer#score_cats + DOCS: https://spacy.io/api/scorer#score_cats """ if threshold is None: threshold = 0.5 if multi_label else 0.0 @@ -461,7 +498,7 @@ class Scorer: if gold_score is not None and gold_score > 0: f_per_type[gold_label].fn += 1 elif pred_cats: - pred_label, pred_score = max(pred_cats, key=lambda it: it[1]) + pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) if pred_score >= threshold: f_per_type[pred_label].fp += 1 micro_prf = PRFScore() @@ -479,7 +516,7 @@ class 
Scorer: sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats ) - results = { + results: Dict[str, Any] = { f"{attr}_score": None, f"{attr}_score_desc": None, f"{attr}_micro_p": micro_prf.precision, @@ -519,7 +556,7 @@ class Scorer: negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL") RETURNS (Dict[str, Any]): A dictionary containing the scores. - DOCS: https://nightly.spacy.io/api/scorer#score_links + DOCS: https://spacy.io/api/scorer#score_links """ f_per_type = {} for example in examples: @@ -531,27 +568,28 @@ class Scorer: gold_span = gold_ent_by_offset.get( (pred_ent.start_char, pred_ent.end_char), None ) - label = gold_span.label_ - if label not in f_per_type: - f_per_type[label] = PRFScore() - gold = gold_span.kb_id_ - # only evaluating entities that overlap between gold and pred, - # to disentangle the performance of the NEL from the NER - if gold is not None: - pred = pred_ent.kb_id_ - if gold in negative_labels and pred in negative_labels: - # ignore true negatives - pass - elif gold == pred: - f_per_type[label].tp += 1 - elif gold in negative_labels: - f_per_type[label].fp += 1 - elif pred in negative_labels: - f_per_type[label].fn += 1 - else: - # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN - f_per_type[label].fp += 1 - f_per_type[label].fn += 1 + if gold_span is not None: + label = gold_span.label_ + if label not in f_per_type: + f_per_type[label] = PRFScore() + gold = gold_span.kb_id_ + # only evaluating entities that overlap between gold and pred, + # to disentangle the performance of the NEL from the NER + if gold is not None: + pred = pred_ent.kb_id_ + if gold in negative_labels and pred in negative_labels: + # ignore true negatives + pass + elif gold == pred: + f_per_type[label].tp += 1 + elif gold in negative_labels: + f_per_type[label].fp += 1 + elif pred in negative_labels: + f_per_type[label].fn += 1 + else: + # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN + f_per_type[label].fp += 1 + f_per_type[label].fn += 1 micro_prf = PRFScore() for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp @@ -583,7 +621,7 @@ class Scorer: head_attr: str = "head", head_getter: Callable[[Token, str], Token] = getattr, ignore_labels: Iterable[str] = SimpleFrozenList(), - missing_values: Set[Any] = MISSING_VALUES, + missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] **cfg, ) -> Dict[str, Any]: """Returns the UAS, LAS, and LAS per type scores for dependency @@ -600,10 +638,12 @@ class Scorer: head_getter(token, attr) should return the value of the head for an individual token. ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct). + missing_values (Set[Any]): Attribute values to treat as missing annotation + in the reference annotation. RETURNS (Dict[str, Any]): A dictionary containing the scores: attr_uas, attr_las, and attr_las_per_type. 
- DOCS: https://nightly.spacy.io/api/scorer#score_deps + DOCS: https://spacy.io/api/scorer#score_deps """ unlabelled = PRFScore() labelled = PRFScore() @@ -614,7 +654,7 @@ class Scorer: pred_doc = example.predicted align = example.alignment gold_deps = set() - gold_deps_per_dep = {} + gold_deps_per_dep: Dict[str, Set] = {} for gold_i, token in enumerate(gold_doc): dep = getter(token, attr) head = head_getter(token, head_attr) @@ -629,12 +669,12 @@ class Scorer: else: missing_indices.add(gold_i) pred_deps = set() - pred_deps_per_dep = {} + pred_deps_per_dep: Dict[str, Set] = {} for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] != 1: - gold_i = None + gold_i = None # type: ignore else: gold_i = align.x2y[token.i].dataXd[0, 0] if gold_i not in missing_indices: diff --git a/spacy/strings.pyi b/spacy/strings.pyi new file mode 100644 index 000000000..5b4147e12 --- /dev/null +++ b/spacy/strings.pyi @@ -0,0 +1,22 @@ +from typing import Optional, Iterable, Iterator, Union, Any +from pathlib import Path + +def get_string_id(key: Union[str, int]) -> int: ... + +class StringStore: + def __init__( + self, strings: Optional[Iterable[str]] = ..., freeze: bool = ... + ) -> None: ... + def __getitem__(self, string_or_id: Union[bytes, str, int]) -> Union[str, int]: ... + def as_int(self, key: Union[bytes, str, int]) -> int: ... + def as_string(self, key: Union[bytes, str, int]) -> str: ... + def add(self, string: str) -> int: ... + def __len__(self) -> int: ... + def __contains__(self, string: str) -> bool: ... + def __iter__(self) -> Iterator[str]: ... + def __reduce__(self) -> Any: ... + def to_disk(self, path: Union[str, Path]) -> None: ... + def from_disk(self, path: Union[str, Path]) -> StringStore: ... + def to_bytes(self, **kwargs: Any) -> bytes: ... + def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ... + def _reset_and_load(self, strings: Iterable[str]) -> None: ... diff --git a/spacy/strings.pyx b/spacy/strings.pyx index cd442729c..4a20cb8af 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -91,7 +91,7 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e cdef class StringStore: """Look up strings by 64-bit hashes. - DOCS: https://nightly.spacy.io/api/stringstore + DOCS: https://spacy.io/api/stringstore """ def __init__(self, strings=None, freeze=False): """Create the StringStore. @@ -223,7 +223,7 @@ cdef class StringStore: it doesn't exist. Paths may be either strings or Path-like objects. """ path = util.ensure_path(path) - strings = list(self) + strings = sorted(self) srsly.write_json(path, strings) def from_disk(self, path): @@ -247,7 +247,7 @@ cdef class StringStore: RETURNS (bytes): The serialized form of the `StringStore` object. """ - return srsly.json_dumps(list(self)) + return srsly.json_dumps(sorted(self)) def from_bytes(self, bytes_data, **kwargs): """Load state from a binary string. diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index bd323e2d5..b88d11f0e 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -3,7 +3,13 @@ from spacy.util import get_lang_class def pytest_addoption(parser): - parser.addoption("--slow", action="store_true", help="include slow tests") + try: + parser.addoption("--slow", action="store_true", help="include slow tests") + parser.addoption("--issue", action="store", help="test specific issues") + # Options are already added, e.g. 
if conftest is copied in a build pipeline + # and runs twice + except ValueError: + pass def pytest_runtest_setup(item): @@ -16,10 +22,24 @@ def pytest_runtest_setup(item): # options weren't given. return item.config.getoption(f"--{opt}", False) + # Integration of boolean flags for opt in ["slow"]: if opt in item.keywords and not getopt(opt): pytest.skip(f"need --{opt} option to run") + # Special integration to mark tests with issue numbers + issues = getopt("issue") + if isinstance(issues, str): + if "issue" in item.keywords: + # Convert issues provided on the CLI to list of ints + issue_nos = [int(issue.strip()) for issue in issues.split(",")] + # Get all issues specified by decorators and check if they're provided + issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")] + if not any([ref in issue_nos for ref in issue_refs]): + pytest.skip(f"not referencing specified issues: {issue_nos}") + else: + pytest.skip("not referencing any issues") + # Fixtures for language tokenizers (languages sorted alphabetically) @@ -39,6 +59,11 @@ def ar_tokenizer(): return get_lang_class("ar")().tokenizer +@pytest.fixture(scope="session") +def bg_tokenizer(): + return get_lang_class("bg")().tokenizer + + @pytest.fixture(scope="session") def bn_tokenizer(): return get_lang_class("bn")().tokenizer @@ -120,6 +145,11 @@ def ga_tokenizer(): return get_lang_class("ga")().tokenizer +@pytest.fixture(scope="session") +def grc_tokenizer(): + return get_lang_class("grc")().tokenizer + + @pytest.fixture(scope="session") def gu_tokenizer(): return get_lang_class("gu")().tokenizer @@ -197,6 +227,11 @@ def ne_tokenizer(): return get_lang_class("ne")().tokenizer +@pytest.fixture(scope="session") +def nl_vocab(): + return get_lang_class("nl")().vocab + + @pytest.fixture(scope="session") def nl_tokenizer(): return get_lang_class("nl")().tokenizer @@ -265,17 +300,35 @@ def tt_tokenizer(): return get_lang_class("tt")().tokenizer +@pytest.fixture(scope="session") +def ky_tokenizer(): + return get_lang_class("ky")().tokenizer + + @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") return get_lang_class("uk")().tokenizer +@pytest.fixture +def uk_lemmatizer(): + pytest.importorskip("pymorphy2") + pytest.importorskip("pymorphy2_dicts_uk") + return get_lang_class("uk")().add_pipe("lemmatizer") + + @pytest.fixture(scope="session") def ur_tokenizer(): return get_lang_class("ur")().tokenizer +@pytest.fixture(scope="session") +def vi_tokenizer(): + pytest.importorskip("pyvi") + return get_lang_class("vi")().tokenizer + + @pytest.fixture(scope="session") def yo_tokenizer(): return get_lang_class("yo")().tokenizer diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index fa0206fdd..231b7c2a8 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -18,14 +18,9 @@ def _ner_example(ner): def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = Doc(en_vocab, words=text) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(en_vocab, model, **config) + ner = EntityRecognizer(en_vocab, model) ner.initialize(lambda: [_ner_example(ner)]) ner(doc) @@ -40,14 +35,9 @@ def test_ents_reset(en_vocab): """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] doc = Doc(en_vocab, words=text) - 
config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(en_vocab, model, **config) + ner = EntityRecognizer(en_vocab, model) ner.initialize(lambda: [_ner_example(ner)]) ner(doc) orig_iobs = [t.ent_iob_ for t in doc] diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 0dc6c4866..302a9b6ea 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -63,3 +63,17 @@ def test_create_from_words_and_text(vocab): words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] text = " 'dogs'\n\nrun " (words, spaces) = util.get_words_and_spaces(words + ["away"], text) + + +def test_create_with_heads_and_no_deps(vocab): + words = "I like ginger".split() + heads = list(range(len(words))) + with pytest.raises(ValueError): + Doc(vocab, words=words, heads=heads) + + +def test_create_invalid_pos(vocab): + words = "I like ginger".split() + pos = "QQ ZZ XX".split() + with pytest.raises(ValueError): + Doc(vocab, words=words, pos=pos) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ea95ca772..57df87642 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,15 +1,17 @@ +import weakref + import pytest import numpy -import logging -import mock from spacy.lang.xx import MultiLanguage -from spacy.tokens import Doc, Span +from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab from spacy.lexeme import Lexeme from spacy.lang.en import English from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH +from .test_underscore import clean_underscore # noqa: F401 + def test_doc_api_init(en_vocab): words = ["a", "b", "c", "d"] @@ -154,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text): def inner_func(d1, d2): return "hello!" - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: - _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_not_called() + _ = tokens.to_bytes() # noqa: F841 + with pytest.warns(UserWarning): tokens.user_hooks["similarity"] = inner_func _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_called_once() def test_doc_api_set_ents(en_tokenizer): @@ -345,15 +344,27 @@ def test_doc_from_array_morph(en_vocab): assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc] +@pytest.mark.usefixtures("clean_underscore") def test_doc_api_from_docs(en_tokenizer, de_tokenizer): - en_texts = ["Merging the docs is fun.", "", "They don't think alike."] + en_texts = [ + "Merging the docs is fun.", + "", + "They don't think alike. ", + "Another doc.", + ] en_texts_without_empty = [t for t in en_texts if len(t)] de_text = "Wie war die Frage?" 
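As an aside on the `conftest.py` changes above: the new `--issue` option is meant to work together with an `issue` marker on regression tests. A hypothetical example of how a test might be tagged, inferred from the marker-handling code in that hunk:

```python
import pytest

# hypothetical regression test, tagged with the issue number it covers
@pytest.mark.issue(1234)
def test_issue1234_regression():
    assert True
```

Running `pytest spacy/tests --issue 1234,5678` would then skip any test that does not reference one of the listed issue numbers.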
en_docs = [en_tokenizer(text) for text in en_texts] - docs_idx = en_texts[0].index("docs") + en_docs[0].spans["group"] = [en_docs[0][1:4]] + en_docs[2].spans["group"] = [en_docs[2][1:4]] + en_docs[3].spans["group"] = [en_docs[3][0:1]] + span_group_texts = sorted( + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + ) de_doc = de_tokenizer(de_text) - expected = (True, None, None, None) - en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected + Token.set_extension("is_ambiguous", default=False) + en_docs[0][2]._.is_ambiguous = True # docs + en_docs[2][3]._.is_ambiguous = True # think assert Doc.from_docs([]) is None assert de_doc is not Doc.from_docs([de_doc]) assert str(de_doc) == str(Doc.from_docs([de_doc])) @@ -363,40 +374,60 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): m_doc = Doc.from_docs(en_docs) assert len(en_texts_without_empty) == len(list(m_doc.sents)) - assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) - assert str(m_doc) == " ".join(en_texts_without_empty) + assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1]) + assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty]) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") + assert m_doc[2]._.is_ambiguous is True assert m_doc[9].idx == think_idx - with pytest.raises(AttributeError): - # not callable, because it was not set via set_extension - m_doc[2]._.is_ambiguous - assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there + assert m_doc[9]._.is_ambiguous is True + assert not any([t._.is_ambiguous for t in m_doc[3:8]]) + assert "group" in m_doc.spans + assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + assert bool(m_doc[11].whitespace_) m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) assert len(en_texts_without_empty) == len(list(m_doc.sents)) - assert len(str(m_doc)) == sum(len(t) for t in en_texts) - assert str(m_doc) == "".join(en_texts) + assert len(m_doc.text) == sum(len(t) for t in en_texts) + assert m_doc.text == "".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and not bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think") assert m_doc[9].idx == think_idx + assert "group" in m_doc.spans + assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + assert bool(m_doc[11].whitespace_) m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) - assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) + assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing - assert str(m_doc) == " ".join(en_texts_without_empty) + assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty]) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." 
and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") assert m_doc[9].idx == think_idx + assert "group" in m_doc.spans + assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + + # can merge empty docs + doc = Doc.from_docs([en_tokenizer("")] * 10) + + # empty but set spans keys are preserved + en_docs = [en_tokenizer(text) for text in en_texts] + m_doc = Doc.from_docs(en_docs) + assert "group" not in m_doc.spans + for doc in en_docs: + doc.spans["group"] = [] + m_doc = Doc.from_docs(en_docs) + assert "group" in m_doc.spans + assert len(m_doc.spans["group"]) == 0 def test_doc_api_from_docs_ents(en_tokenizer): @@ -641,7 +672,8 @@ def test_doc_noun_chunks_not_implemented(): nlp = MultiLanguage() doc = nlp(text) with pytest.raises(NotImplementedError): - chunks = list(doc.noun_chunks) + _ = list(doc.noun_chunks) # noqa: F841 + def test_span_groups(en_tokenizer): doc = en_tokenizer("Some text about Colombia and the Czech Republic") @@ -662,3 +694,10 @@ def test_span_groups(en_tokenizer): assert doc.spans["hi"].has_overlap del doc.spans["hi"] assert "hi" not in doc.spans + + +def test_doc_spans_copy(en_tokenizer): + doc1 = en_tokenizer("Some text about Colombia and the Czech Republic") + assert weakref.ref(doc1) == doc1.spans.doc_ref + doc2 = doc1.copy() + assert weakref.ref(doc2) == doc2.spans.doc_ref diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 48cd33890..20c302da1 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab): words = ["The", "players", "start", "."] lemmas = [t.lower() for t in words] heads = [1, 2, 2, 2] + deps = ["dep"] * len(heads) tags = ["DT", "NN", "VBZ", "."] pos = ["DET", "NOUN", "VERB", "PUNCT"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas) + doc = Doc( + en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas + ) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab): assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" assert doc[0].lemma_ == "the players" - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas) + doc = Doc( + en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas + ) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." 
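For context on the `Doc.from_docs` test changes above: span groups set on the source docs are now carried over into the merged doc. A small sketch, assuming spaCy v3.1+ and an arbitrary `"group"` key:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Merging the docs is fun.")
doc2 = nlp("Another doc.")
doc1.spans["group"] = [doc1[1:3]]   # "the docs"
doc2.spans["group"] = [doc2[0:1]]   # "Another"

merged = Doc.from_docs([doc1, doc2])
# span groups from the source docs are remapped into the merged doc
assert sorted(s.text for s in merged.spans["group"]) == ["Another", "the docs"]
```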
heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) with doc.retokenize() as retokenizer: for ent in doc.ents: attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_} @@ -199,8 +205,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): text = "One test with entities like New York City so the ents list is not void" heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) with doc.retokenize() as retokenizer: for ent in doc.ents: retokenizer.merge(ent) @@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] + deps = ["dep"] * len(heads) tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] ents = ["O"] * len(heads) @@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: on tokens = en_tokenizer(text) doc = Doc( - tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=deps, + tags=tags, + ents=ents, ) assert len(doc) == 17 with doc.retokenize() as retokenizer: @@ -452,3 +465,30 @@ def test_retokenize_disallow_zero_length(en_vocab): with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.merge(doc[1:1]) + + +def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer): + text = "displaCy is a parse tool built with Javascript" + sent_starts = [1, 0, 0, 0, 1, 0, 0, 0] + tokens = en_tokenizer(text) + + # merging within a sentence keeps all sentence boundaries + doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts) + assert len(list(doc.sents)) == 2 + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[1:3]) + assert len(list(doc.sents)) == 2 + + # merging over a sentence boundary unsets it by default + doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts) + assert len(list(doc.sents)) == 2 + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[3:6]) + assert doc[3].is_sent_start is None + + # merging over a sentence boundary and setting sent_start + doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts) + assert len(list(doc.sents)) == 2 + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[3:6], attrs={"sent_start": True}) + assert len(list(doc.sents)) == 2 diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 6bfd508bc..16df1713d 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab): # If lemmas are not set, leave unset words = ["LosAngeles", "start", "."] heads = [1, 2, 2] - doc = Doc(en_vocab, words=words, heads=heads) + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) 
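The repeated `deps = ["dep"] * len(heads)` additions in these tests reflect that the `Doc` constructor now rejects `heads` supplied without `deps` (see `test_create_with_heads_and_no_deps` above). A minimal sketch of the constraint, using a placeholder dependency label:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["She", "sleeps"]
heads = [1, 1]

# Doc(vocab, words=words, heads=heads) now raises ValueError: heads need deps

# a placeholder label per token, as the updated tests use, is sufficient
doc = Doc(vocab, words=words, heads=heads, deps=["dep"] * len(words))
```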
with doc.retokenize() as retokenizer: retokenizer.split( doc[0], @@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab): # If lemmas are set, use split orth as default lemma words = ["LosAngeles", "start", "."] heads = [1, 2, 2] - doc = Doc(en_vocab, words=words, heads=heads) + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) for t in doc: t.lemma_ = "a" with doc.retokenize() as retokenizer: diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 4c7f0c86b..2503ad94c 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,8 +1,14 @@ import pytest +import numpy +from numpy.testing import assert_array_equal from spacy.attrs import ORTH, LENGTH -from spacy.tokens import Doc, Span +from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab from spacy.util import filter_spans +from thinc.api import get_current_ops + +from ..util import add_vecs_to_vocab +from .test_underscore import clean_underscore # noqa: F401 @pytest.fixture @@ -12,9 +18,21 @@ def doc(en_tokenizer): heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct", "ROOT", "det", "npadvmod", "punct"] + ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O", + "O", "O", "O", "O", "O"] # fmt: on tokens = en_tokenizer(text) - return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + lemmas = [t.text for t in tokens] # this is not correct, just a placeholder + spaces = [bool(t.whitespace_) for t in tokens] + return Doc( + tokens.vocab, + words=[t.text for t in tokens], + spaces=spaces, + heads=heads, + deps=deps, + ents=ents, + lemmas=lemmas, + ) @pytest.fixture @@ -78,7 +96,7 @@ def test_spans_span_sent(doc, doc_not_parsed): """Test span.sent property""" assert len(list(doc.sents)) assert doc[:2].sent.root.text == "is" - assert doc[:2].sent.text == "This is a sentence ." + assert doc[:2].sent.text == "This is a sentence." 
assert doc[6:7].sent.root.left_edge.text == "This" # test on manual sbd doc_not_parsed[0].is_sent_start = True @@ -116,6 +134,17 @@ def test_spans_lca_matrix(en_tokenizer): assert lca[1, 0] == 1 # slept & dog -> slept assert lca[1, 1] == 1 # slept & slept -> slept + # example from Span API docs + tokens = en_tokenizer("I like New York in Autumn") + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[1, 1, 3, 1, 3, 4], + deps=["dep"] * len(tokens), + ) + lca = doc[1:4].get_lca_matrix() + assert_array_equal(lca, numpy.asarray([[0, 0, 0], [0, 1, 2], [0, 2, 2]])) + def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) @@ -195,6 +224,12 @@ def test_spans_by_character(doc): assert span1.end_char == span2.end_char assert span2.label_ == "GPE" + # unsupported alignment mode + with pytest.raises(ValueError): + span2 = doc.char_span( + span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" + ) + def test_span_to_array(doc): span = doc[1:-2] @@ -212,12 +247,26 @@ def test_span_as_doc(doc): assert span_doc is not doc assert span_doc[0].idx == 0 + # partial initial entity is removed + assert len(span_doc.ents) == 0 + # full entity is preserved + span_doc = doc[2:10].as_doc() + assert len(span_doc.ents) == 1 + + # partial final entity is removed + span_doc = doc[0:5].as_doc() + assert len(span_doc.ents) == 0 + + +@pytest.mark.usefixtures("clean_underscore") def test_span_as_doc_user_data(doc): - """Test that the user_data can be preserved (but not by default). """ + """Test that the user_data can be preserved (but not by default).""" my_key = "my_info" my_value = 342 doc.user_data[my_key] = my_value + Token.set_extension("is_x", default=False) + doc[7]._.is_x = True span = doc[4:10] span_doc_with = span.as_doc(copy_user_data=True) @@ -226,6 +275,12 @@ def test_span_as_doc_user_data(doc): assert doc.user_data.get(my_key, None) is my_value assert span_doc_with.user_data.get(my_key, None) is my_value assert span_doc_without.user_data.get(my_key, None) is None + for i in range(len(span_doc_with)): + if i != 3: + assert span_doc_with[i]._.is_x is False + else: + assert span_doc_with[i]._.is_x is True + assert not any([t._.is_x for t in span_doc_without]) def test_span_string_label_kb_id(doc): @@ -236,20 +291,13 @@ def test_span_string_label_kb_id(doc): assert span.kb_id == doc.vocab.strings["Q342"] -def test_span_label_readonly(doc): +def test_span_attrs_writable(doc): span = Span(doc, 0, 1) - with pytest.raises(NotImplementedError): - span.label_ = "hello" - - -def test_span_kb_id_readonly(doc): - span = Span(doc, 0, 1) - with pytest.raises(NotImplementedError): - span.kb_id_ = "Q342" + span.label_ = "label" + span.kb_id_ = "kb_id" def test_span_ents_property(doc): - """Test span.ents for the """ doc.ents = [ (doc.vocab.strings["PRODUCT"], 0, 1), (doc.vocab.strings["PRODUCT"], 7, 8), @@ -271,7 +319,7 @@ def test_span_ents_property(doc): assert sentences[1].ents[0].start == 7 assert sentences[1].ents[0].end == 8 # Third sentence ents, Also tests end of sentence - assert sentences[2].ents[0].text == "a third ." + assert sentences[2].ents[0].text == "a third." 
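One more aside: `test_span_attrs_writable` above replaces the old read-only tests, i.e. `Span.label_` and `Span.kb_id_` can now be assigned directly. Roughly:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I like New York")
span = Span(doc, 2, 4)
span.label_ = "GPE"   # previously raised NotImplementedError
span.kb_id_ = "Q60"   # the knowledge-base id here is just an example value
assert span.label_ == "GPE" and span.kb_id_ == "Q60"
```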
assert sentences[2].ents[0].label_ == "PRODUCT" assert sentences[2].ents[0].start == 11 assert sentences[2].ents[0].end == 14 @@ -311,6 +359,9 @@ def test_span_eq_hash(doc, doc_not_parsed): assert hash(doc[0:2]) != hash(doc[1:3]) assert hash(doc[0:2]) != hash(doc_not_parsed[0:2]) + # check that an out-of-bounds is not equivalent to the span of the full doc + assert doc[0 : len(doc)] != doc[len(doc) : len(doc) + 1] + def test_span_boundaries(doc): start = 1 @@ -323,6 +374,39 @@ def test_span_boundaries(doc): with pytest.raises(IndexError): span[5] + empty_span_0 = doc[0:0] + assert empty_span_0.text == "" + assert empty_span_0.start == 0 + assert empty_span_0.end == 0 + assert empty_span_0.start_char == 0 + assert empty_span_0.end_char == 0 + + empty_span_1 = doc[1:1] + assert empty_span_1.text == "" + assert empty_span_1.start == 1 + assert empty_span_1.end == 1 + assert empty_span_1.start_char == empty_span_1.end_char + + oob_span_start = doc[-len(doc) - 1 : -len(doc) - 10] + assert oob_span_start.text == "" + assert oob_span_start.start == 0 + assert oob_span_start.end == 0 + assert oob_span_start.start_char == 0 + assert oob_span_start.end_char == 0 + + oob_span_end = doc[len(doc) + 1 : len(doc) + 10] + assert oob_span_end.text == "" + assert oob_span_end.start == len(doc) + assert oob_span_end.end == len(doc) + assert oob_span_end.start_char == len(doc.text) + assert oob_span_end.end_char == len(doc.text) + + +def test_span_lemma(doc): + # span lemmas should have the same number of spaces as the span + sp = doc[1:5] + assert len(sp.text.split(" ")) == len(sp.lemma_.split(" ")) + def test_sent(en_tokenizer): doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.") @@ -330,3 +414,23 @@ def test_sent(en_tokenizer): assert not span.doc.has_annotation("SENT_START") with pytest.raises(ValueError): span.sent + + +def test_span_with_vectors(doc): + ops = get_current_ops() + prev_vectors = doc.vocab.vectors + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("And", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(doc.vocab, vectors) + # 0-length span + assert_array_equal(ops.to_numpy(doc[0:0].vector), numpy.zeros((3,))) + # longer span with no vector + assert_array_equal(ops.to_numpy(doc[0:4].vector), numpy.zeros((3,))) + # single-token span with vector + assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1]) + doc.vocab.vectors = prev_vectors diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 1e13882c5..e715c5e85 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab): # the structure of this sentence depends on the English annotation scheme words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."] heads = [2, 2, 2, 4, 2, 6, 4, 6, 2] - doc = Doc(en_vocab, words=words, heads=heads) + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [t.text for t in doc[6].ancestors] == ["dog", "saw"] assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[2].ancestors] == [] @@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab): assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 # head token must be from the same document - doc2 = Doc(en_vocab, words=words, heads=heads) + doc2 = Doc(en_vocab, words=words, heads=heads, 
deps=["dep"] * len(heads)) with pytest.raises(ValueError): doc[0].head = doc2[0] # test sentence starts when two sentences are joined @@ -202,6 +203,12 @@ def test_set_pos(): assert doc[1].pos_ == "VERB" +def test_set_invalid_pos(): + doc = Doc(Vocab(), words=["hello", "world"]) + with pytest.raises(ValueError): + doc[0].pos_ = "blah" + + def test_tokens_sent(doc): """Test token.sent property""" assert len(list(doc.sents)) == 3 @@ -254,7 +261,7 @@ def test_token_api_non_conjuncts(en_vocab): def test_missing_head_dep(en_vocab): - """ Check that the Doc constructor and Example.from_dict parse missing information the same""" + """Check that the Doc constructor and Example.from_dict parse missing information the same""" heads = [1, 1, 1, 1, 2, None] # element 5 is missing deps = ["", "ROOT", "dobj", "cc", "conj", None] # element 0 and 5 are missing words = ["I", "like", "London", "and", "Berlin", "."] diff --git a/spacy/tests/enable_gpu.py b/spacy/tests/enable_gpu.py new file mode 100644 index 000000000..3d4fded10 --- /dev/null +++ b/spacy/tests/enable_gpu.py @@ -0,0 +1,3 @@ +from spacy import require_gpu + +require_gpu() diff --git a/spacy/tests/lang/bg/test_text.py b/spacy/tests/lang/bg/test_text.py new file mode 100644 index 000000000..e3a29fe5d --- /dev/null +++ b/spacy/tests/lang/bg/test_text.py @@ -0,0 +1,30 @@ +import pytest + + +@pytest.mark.parametrize( + "word,match", + [ + ("10", True), + ("1", True), + ("10000", True), + ("1.000", True), + ("бројка", False), + ("999,23", True), + ("едно", True), + ("две", True), + ("цифра", False), + ("единайсет", True), + ("десет", True), + ("сто", True), + ("брой", False), + ("хиляда", True), + ("милион", True), + (",", False), + ("милиарда", True), + ("билион", True), + ], +) +def test_bg_lex_attrs_like_number(bg_tokenizer, word, match): + tokens = bg_tokenizer(word) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index 83a75f056..a3c76ab5b 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -5,7 +5,7 @@ import pytest "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])] ) def test_contractions(ca_tokenizer, text, expected_tokens): - """ Test that the contractions are split into two tokens""" + """Test that the contractions are split into two tokens""" tokens = ca_tokenizer(text) assert len(tokens) == 2 assert [t.text for t in tokens] == expected_tokens diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 38f5fc708..55bad0e94 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer): una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" tokens = ca_tokenizer(text) - assert len(tokens) == 138 + assert len(tokens) == 140 @pytest.mark.parametrize( "text,length", [ - ("Perquè va anar-hi?", 6), + ("Perquè va anar-hi?", 4), ("“Ah no?”", 5), ("""Sí! "Anem", va contestar el Joan Carles""", 11), ("Van córrer aprox. 
10km", 5), diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index 733e814f7..358f4c0f9 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -56,7 +56,9 @@ def test_lex_attrs_like_number(en_tokenizer, text, match): assert tokens[0].like_num == match -@pytest.mark.parametrize("word", ["third", "Millionth", "100th", "Hundredth"]) +@pytest.mark.parametrize( + "word", ["third", "Millionth", "100th", "Hundredth", "23rd", "52nd"] +) def test_en_lex_attrs_like_number_for_ordinal(word): assert like_num(word) diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index ae16c7eea..dc40e18a3 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -36,6 +36,24 @@ ABBREVIATION_INFLECTION_TESTS = [ ("EU:n toimesta tehtiin jotain.", ["EU:n", "toimesta", "tehtiin", "jotain", "."]), ] +CONTRACTION_TESTS = [ + ( + "Päätimme ettemme tule.", + ["Päätimme", "ett", "emme", "tule", "."], + ["päätimme", "että", "emme", "tule", "."], + ), + ( + "Miksei puhuttaisi?", + ["Miks", "ei", "puhuttaisi", "?"], + ["miksi", "ei", "puhuttaisi", "?"], + ), + ( + "He tottelivat vaikkeivat halunneet", + ["He", "tottelivat", "vaikk", "eivat", "halunneet"], + ["he", "tottelivat", "vaikka", "eivät", "halunneet"], + ), +] + @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens): @@ -56,3 +74,12 @@ def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_toke tokens = fi_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list + + +@pytest.mark.parametrize("text,expected_tokens,expected_norms", CONTRACTION_TESTS) +def test_fi_tokenizer_contractions(fi_tokenizer, text, expected_tokens, expected_norms): + tokens = fi_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + norm_list = [token.norm_ for token in tokens if not token.is_space] + assert expected_tokens == token_list + assert expected_norms == norm_list diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index 2ead34069..7770f807b 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -1,5 +1,5 @@ import pytest -from spacy.language import Language +from spacy.language import Language, BaseDefaults from spacy.lang.punctuation import TOKENIZER_INFIXES from spacy.lang.char_classes import ALPHA @@ -12,7 +12,7 @@ def test_issue768(text, expected_tokens): SPLIT_INFIX = r"(?<=[{a}]\')(?=[{a}])".format(a=ALPHA) class FrenchTest(Language): - class Defaults(Language.Defaults): + class Defaults(BaseDefaults): infixes = TOKENIZER_INFIXES + [SPLIT_INFIX] fr_tokenizer_w_infix = FrenchTest().tokenizer diff --git a/spacy/tests/lang/grc/__init__.py b/spacy/tests/lang/grc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/grc/test_text.py b/spacy/tests/lang/grc/test_text.py new file mode 100644 index 000000000..5d8317c36 --- /dev/null +++ b/spacy/tests/lang/grc/test_text.py @@ -0,0 +1,23 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("ι", True), + ("α", True), + ("ϟα", True), + ("ἑκατόν", True), + ("ἐνακόσια", True), + ("δισχίλια", True), + ("μύρια", True), + ("εἷς", True), + ("λόγος", False), + (",", False), + ("λβ", True), + ], +) +def 
test_lex_attrs_like_number(grc_tokenizer, text, match): + tokens = grc_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index fd3acd0a0..0488474ae 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -294,7 +294,7 @@ WIKI_TESTS = [ ] EXTRA_TESTS = ( - DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS # type: ignore[operator] ) # normal: default tests + 10% of extra tests diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py index 46f66b5e6..5834f9695 100644 --- a/spacy/tests/lang/it/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py @@ -5,7 +5,7 @@ import pytest "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])] ) def test_contractions(it_tokenizer, text, expected_tokens): - """ Test that the contractions are split into two tokens""" + """Test that the contractions are split into two tokens""" tokens = it_tokenizer(text) assert len(tokens) == 2 assert [t.text for t in tokens] == expected_tokens diff --git a/spacy/tests/lang/ky/__init__.py b/spacy/tests/lang/ky/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py new file mode 100644 index 000000000..91a048764 --- /dev/null +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -0,0 +1,89 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +INFIX_HYPHEN_TESTS = [ + ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()), + ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()), +] + +PUNC_INSIDE_WORDS_TESTS = [ + ( + "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.", + "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ," + " 783,9 млн. киши / жылына .".split(), + ), + ('То"кой', 'То " кой'.split()), +] + +MIXED_ORDINAL_NUMS_TESTS = [("Эртең 22-январь...", "Эртең 22 - январь ...".split())] + +ABBREV_TESTS = [ + ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()), + ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()), + ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()), + ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()), + ("3-4 кк. курулган.", "3 - 4 кк. курулган .".split()), +] + +NAME_ABBREV_TESTS = [ + ("М.Жумаш", "М.Жумаш".split()), + ("М.жумаш", "М.жумаш".split()), + ("м.Жумаш", "м . Жумаш".split()), + ("Жумаш М.Н.", "Жумаш М.Н.".split()), + ("Жумаш.", "Жумаш .".split()), +] + +TYPOS_IN_PUNC_TESTS = [ + ("«3-жылда , туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()), + ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()), + ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()), # "?)" => "?)" or "? )" +] + +LONG_TEXTS_TESTS = [ + ( + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак: ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал. 
Мисалы, январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка (100 чакырымдан кем эмес) барып " + "келгенге аракет кылдык.", + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак : ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал . Мисалы , январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып " + "келгенге аракет кылдык .".split(), + ) +] + +TESTCASES = ( + INFIX_HYPHEN_TESTS + + PUNC_INSIDE_WORDS_TESTS + + MIXED_ORDINAL_NUMS_TESTS + + ABBREV_TESTS + + NAME_ABBREV_TESTS + + LONG_TEXTS_TESTS + + TYPOS_IN_PUNC_TESTS +) + +NORM_TESTCASES = [ + ( + "ит, мышык ж.б.у.с. үй жаныбарлары.", + ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."], + ) +] + + +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) +def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens): + tokens = [token.text for token in ky_tokenizer(text) if not token.is_space] + assert expected_tokens == tokens + + +@pytest.mark.parametrize("text,norms", NORM_TESTCASES) +def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms): + tokens = ky_tokenizer(text) + assert [token.norm_ for token in tokens] == norms diff --git a/spacy/tests/lang/nl/test_noun_chunks.py b/spacy/tests/lang/nl/test_noun_chunks.py new file mode 100644 index 000000000..73b501e4a --- /dev/null +++ b/spacy/tests/lang/nl/test_noun_chunks.py @@ -0,0 +1,209 @@ +from spacy.tokens import Doc +import pytest + + +@pytest.fixture +def nl_sample(nl_vocab): + # TEXT : + # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen. + # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook + # geen avondeten gekocht. 
+ words = [ + "Haar", + "vriend", + "lacht", + "luid", + ".", + "We", + "kregen", + "alweer", + "ruzie", + "toen", + "we", + "de", + "supermarkt", + "ingingen", + ".", + "Aan", + "het", + "begin", + "van", + "de", + "supermarkt", + "is", + "al", + "het", + "fruit", + "en", + "de", + "groentes", + ".", + "Uiteindelijk", + "hebben", + "we", + "dan", + "ook", + "geen", + "avondeten", + "gekocht", + ".", + ] + heads = [ + 1, + 2, + 2, + 2, + 2, + 6, + 6, + 6, + 6, + 13, + 13, + 12, + 13, + 6, + 6, + 17, + 17, + 24, + 20, + 20, + 17, + 24, + 24, + 24, + 24, + 27, + 27, + 24, + 24, + 36, + 36, + 36, + 36, + 36, + 35, + 36, + 36, + 36, + ] + deps = [ + "nmod:poss", + "nsubj", + "ROOT", + "advmod", + "punct", + "nsubj", + "ROOT", + "advmod", + "obj", + "mark", + "nsubj", + "det", + "obj", + "advcl", + "punct", + "case", + "det", + "obl", + "case", + "det", + "nmod", + "cop", + "advmod", + "det", + "ROOT", + "cc", + "det", + "conj", + "punct", + "advmod", + "aux", + "nsubj", + "advmod", + "advmod", + "det", + "obj", + "ROOT", + "punct", + ] + pos = [ + "PRON", + "NOUN", + "VERB", + "ADJ", + "PUNCT", + "PRON", + "VERB", + "ADV", + "NOUN", + "SCONJ", + "PRON", + "DET", + "NOUN", + "NOUN", + "PUNCT", + "ADP", + "DET", + "NOUN", + "ADP", + "DET", + "NOUN", + "AUX", + "ADV", + "DET", + "NOUN", + "CCONJ", + "DET", + "NOUN", + "PUNCT", + "ADJ", + "AUX", + "PRON", + "ADV", + "ADV", + "DET", + "NOUN", + "VERB", + "PUNCT", + ] + return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos) + + +@pytest.fixture +def nl_reference_chunking(): + # Using frog https://github.com/LanguageMachines/frog/ we obtain the following NOUN-PHRASES: + return [ + "haar vriend", + "we", + "ruzie", + "we", + "de supermarkt", + "het begin", + "de supermarkt", + "het fruit", + "de groentes", + "we", + "geen avondeten", + ] + + +def test_need_dep(nl_tokenizer): + """ + Test that noun_chunks raises Value Error for 'nl' language if Doc is not parsed. + """ + txt = "Haar vriend lacht luid." + doc = nl_tokenizer(txt) + + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +def test_chunking(nl_sample, nl_reference_chunking): + """ + Test the noun chunks of a sample text. Uses a sample. + The sample text simulates a Doc object as would be produced by nl_core_news_md. 
+ """ + chunks = [s.text.lower() for s in nl_sample.noun_chunks] + assert chunks == nl_reference_chunking diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index de1871e64..36f4a75e0 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -4,11 +4,13 @@ from spacy.util import get_lang_class # fmt: off # Only include languages with no external dependencies -# excluded: ja, ru, th, uk, vi, zh -LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", - "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", - "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo'] +# excluded: ja, ko, th, vi, zh +LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", + "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", + "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", + "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", + "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", + "tr", "tt", "uk", "ur", "xx", "yo"] # fmt: on diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index a49d70d6b..e419f0a14 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -7,8 +7,8 @@ from spacy.util import get_lang_class # fmt: off # Only include languages with no external dependencies # excluded: ru, uk -# excluded for custom tables: pl -LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] +# excluded for custom tables: es, pl +LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py new file mode 100644 index 000000000..4a787b2a6 --- /dev/null +++ b/spacy/tests/lang/uk/test_lemmatizer.py @@ -0,0 +1,7 @@ +from spacy.tokens import Doc + + +def test_uk_lemmatizer(uk_lemmatizer): + """Check that the default uk lemmatizer runs.""" + doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"]) + uk_lemmatizer(doc) diff --git a/spacy/tests/lang/vi/__init__.py b/spacy/tests/lang/vi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py new file mode 100644 index 000000000..ed4652df7 --- /dev/null +++ b/spacy/tests/lang/vi/test_serialize.py @@ -0,0 +1,33 @@ +from spacy.lang.vi import Vietnamese +from ...util import make_tempdir + + +def test_vi_tokenizer_serialize(vi_tokenizer): + tokenizer_bytes = vi_tokenizer.to_bytes() + nlp = Vietnamese() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.use_pyvi is True + + with make_tempdir() as d: + file_path = d / "tokenizer" + vi_tokenizer.to_disk(file_path) + nlp = Vietnamese() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.use_pyvi is True + + # mode is (de)serialized correctly + nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}}) + nlp_bytes = nlp.to_bytes() + nlp_r = Vietnamese() + nlp_r.from_bytes(nlp_bytes) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.use_pyvi is False + + with make_tempdir() as d: + nlp.to_disk(d) + nlp_r = Vietnamese() + nlp_r.from_disk(d) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.use_pyvi is False diff --git 
a/spacy/tests/lang/vi/test_tokenizer.py b/spacy/tests/lang/vi/test_tokenizer.py new file mode 100644 index 000000000..3d0642d1e --- /dev/null +++ b/spacy/tests/lang/vi/test_tokenizer.py @@ -0,0 +1,47 @@ +import pytest + +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS +from spacy.lang.vi import Vietnamese + + +# fmt: off +TOKENIZER_TESTS = [ + ("Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']), +] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) +def test_vi_tokenizer(vi_tokenizer, text, expected_tokens): + tokens = [token.text for token in vi_tokenizer(text)] + assert tokens == expected_tokens + + +def test_vi_tokenizer_extra_spaces(vi_tokenizer): + # note: three spaces after "I" + tokens = vi_tokenizer("I like cheese.") + assert tokens[1].orth_ == " " + + +@pytest.mark.parametrize("text", NAUGHTY_STRINGS) +def test_vi_tokenizer_naughty_strings(vi_tokenizer, text): + tokens = vi_tokenizer(text) + assert tokens.text_with_ws == text + + +def test_vi_tokenizer_emptyish_texts(vi_tokenizer): + doc = vi_tokenizer("") + assert len(doc) == 0 + doc = vi_tokenizer(" ") + assert len(doc) == 1 + doc = vi_tokenizer("\n\n\n \t\t \n\n\n") + assert len(doc) == 1 + + +def test_vi_tokenizer_no_pyvi(): + """Test for whitespace tokenization without pyvi""" + nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}}) + text = "Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này" + doc = nlp(text) + assert [t.text for t in doc if not t.is_space] == text.split() + assert doc[4].text == " " diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 481187348..61ae43c52 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -4,7 +4,9 @@ import re import copy from mock import Mock from spacy.matcher import DependencyMatcher -from spacy.tokens import Doc +from spacy.tokens import Doc, Token + +from ..doc.test_underscore import clean_underscore # noqa: F401 @pytest.fixture @@ -334,3 +336,119 @@ def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches): matcher.add("pattern", [pattern]) matches = matcher(doc) assert len(matches) == num_matches + + +def test_dependency_matcher_long_matches(en_vocab, doc): + pattern = [ + {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"DEP": "amod", "OP": "+"}}, + ] + + matcher = DependencyMatcher(en_vocab) + with pytest.raises(ValueError): + matcher.add("pattern", [pattern]) + + +@pytest.mark.usefixtures("clean_underscore") +def test_dependency_matcher_span_user_data(en_tokenizer): + doc = en_tokenizer("a b c d e") + for token in doc: + token.head = doc[0] + token.dep_ = "a" + Token.set_extension("is_c", default=False) + doc[2]._.is_c = True + pattern = [ + {"RIGHT_ID": "c", "RIGHT_ATTRS": {"_": {"is_c": True}}}, + ] + matcher = DependencyMatcher(en_tokenizer.vocab) + matcher.add("C", [pattern]) + doc_matches = matcher(doc) + offset = 1 + span_matches = matcher(doc[offset:]) + for doc_match, span_match in zip(sorted(doc_matches), sorted(span_matches)): + assert doc_match[0] == span_match[0] + for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]): + assert doc_t_i == span_t_i + offset + + +def test_dependency_matcher_order_issue(en_tokenizer): + # issue from #9263 + doc = en_tokenizer("I 
like text") + doc[2].head = doc[1] + + # this matches on attrs but not rel op + pattern1 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "text"}, + "REL_OP": "<", + }, + ] + + # this matches on rel op but not attrs + pattern2 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "fish"}, + "REL_OP": ">", + }, + ] + + matcher = DependencyMatcher(en_tokenizer.vocab) + + # This should behave the same as the next pattern + matcher.add("check", [pattern1, pattern2]) + matches = matcher(doc) + + assert matches == [] + + # use a new matcher + matcher = DependencyMatcher(en_tokenizer.vocab) + # adding one at a time under same label gets a match + matcher.add("check", [pattern1]) + matcher.add("check", [pattern2]) + matches = matcher(doc) + + assert matches == [] + + +def test_dependency_matcher_remove(en_tokenizer): + # issue from #9263 + doc = en_tokenizer("The red book") + doc[1].head = doc[2] + + # this matches + pattern1 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "red"}, + "REL_OP": ">", + }, + ] + + # add and then remove it + matcher = DependencyMatcher(en_tokenizer.vocab) + matcher.add("check", [pattern1]) + matcher.remove("check") + + # this matches on rel op but not attrs + pattern2 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "blue"}, + "REL_OP": ">", + }, + ] + + # Adding this new pattern with the same label, which should not match + matcher.add("check", [pattern2]) + matches = matcher(doc) + + assert matches == [] diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 91f843a93..c02d65cdf 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab): assert len(patterns[0]) +def test_matcher_empty_patterns_warns(en_vocab): + matcher = Matcher(en_vocab) + assert len(matcher) == 0 + doc = Doc(en_vocab, words=["This", "is", "quite", "something"]) + with pytest.warns(UserWarning): + matcher(doc) + assert len(doc.ents) == 0 + + def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 
😂 😂" doc = Doc(en_vocab, words=text.split(" ")) @@ -261,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab): doc[0].tag_ = "A" assert len(matcher(doc)) == 0 + # IS_SUBSET with a list value + Token.set_extension("ext", default=[]) + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A"] + doc[1]._.ext = ["C", "D"] + assert len(matcher(doc)) == 2 + def test_matcher_superset_value_operator(en_vocab): matcher = Matcher(en_vocab) @@ -299,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab): doc[0].tag_ = "A" assert len(matcher(doc)) == 3 + # IS_SUPERSET with a list value + Token.set_extension("ext", default=[]) + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A", "B"] + assert len(matcher(doc)) == 1 + + +def test_matcher_intersect_value_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc)) == 0 + doc[0].set_morph("Feat=Val") + assert len(matcher(doc)) == 1 + doc[0].set_morph("Feat=Val|Feat2=Val2") + assert len(matcher(doc)) == 1 + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") + assert len(matcher(doc)) == 1 + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") + assert len(matcher(doc)) == 1 + + # INTERSECTS with a single value is the same as IN + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 1 + + # INTERSECTS with an empty pattern list matches nothing + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"INTERSECTS": []}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 0 + + # INTERSECTS with a list value + Token.set_extension("ext", default=[]) + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A", "B"] + assert len(matcher(doc)) == 1 + + # INTERSECTS with an empty pattern list matches nothing + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"INTERSECTS": []}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A", "B"] + assert len(matcher(doc)) == 0 + + # INTERSECTS with an empty value matches nothing + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = [] + assert len(matcher(doc)) == 0 + def test_matcher_morph_handling(en_vocab): # order of features in pattern doesn't matter @@ -472,6 +557,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text): assert len(matches) == 1 +@pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_valid_callback(en_vocab): """Test that on_match can only be None or callable.""" matcher = Matcher(en_vocab) @@ -490,6 +576,16 @@ def test_matcher_callback(en_vocab): mock.assert_called_once_with(matcher, doc, 0, matches) +def test_matcher_callback_with_alignments(en_vocab): + mock = Mock() + matcher = Matcher(en_vocab) + pattern = [{"ORTH": "test"}] 
+ matcher.add("Rule", [pattern], on_match=mock) + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + matches = matcher(doc, with_alignments=True) + mock.assert_called_once_with(matcher, doc, 0, matches) + + def test_matcher_span(matcher): text = "JavaScript is good but Java is better" doc = Doc(matcher.vocab, words=text.split()) @@ -513,6 +609,12 @@ def test_matcher_as_spans(matcher): assert matches[1].text == "Java" assert matches[1].label_ == "Java" + matches = matcher(doc[1:], as_spans=True) + assert len(matches) == 1 + assert isinstance(matches[0], Span) + assert matches[0].text == "Java" + assert matches[0].label_ == "Java" + def test_matcher_deprecated(matcher): doc = Doc(matcher.vocab, words=["hello", "world"]) @@ -521,3 +623,22 @@ def test_matcher_deprecated(matcher): pass assert record.list assert "spaCy v3.0" in str(record.list[0].message) + + +def test_matcher_remove_zero_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"OP": "!"}] + matcher.add("Rule", [pattern]) + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + matches = matcher(doc) + assert len(matches) == 0 + assert "Rule" in matcher + matcher.remove("Rule") + assert "Rule" not in matcher + + +def test_matcher_no_zero_length(en_vocab): + doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"]) + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) + assert len(matcher(doc)) == 0 diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 5f4c2991a..dcbe1ff33 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab): assert texts == ["zero", "one", "two"] +@pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_remove(): nlp = English() matcher = Matcher(nlp.vocab) @@ -204,3 +205,100 @@ def test_matcher_remove(): # removing again should throw an error with pytest.raises(ValueError): matcher.remove("Rule") + + +def test_matcher_with_alignments_greedy_longest(en_vocab): + cases = [ + ("aaab", "a* b", [0, 0, 0, 1]), + ("baab", "b a* b", [0, 1, 1, 2]), + ("aaab", "a a a b", [0, 1, 2, 3]), + ("aaab", "a+ b", [0, 0, 0, 1]), + ("aaba", "a+ b a+", [0, 0, 1, 2]), + ("aabaa", "a+ b a+", [0, 0, 1, 2, 2]), + ("aaba", "a+ b a*", [0, 0, 1, 2]), + ("aaaa", "a*", [0, 0, 0, 0]), + ("baab", "b a* b b*", [0, 1, 1, 2]), + ("aabb", "a* b* a*", [0, 0, 1, 1]), + ("aaab", "a+ a+ a b", [0, 1, 2, 3]), + ("aaab", "a+ a+ a+ b", [0, 1, 2, 3]), + ("aaab", "a+ a a b", [0, 1, 2, 3]), + ("aaab", "a+ a a", [0, 1, 2]), + ("aaab", "a+ a a?", [0, 1, 2]), + ("aaaa", "a a a a a?", [0, 1, 2, 3]), + ("aaab", "a+ a b", [0, 0, 1, 2]), + ("aaab", "a+ a+ b", [0, 0, 1, 2]), + ] + for string, pattern_str, result in cases: + matcher = Matcher(en_vocab) + doc = Doc(matcher.vocab, words=list(string)) + pattern = [] + for part in pattern_str.split(): + if part.endswith("+"): + pattern.append({"ORTH": part[0], "OP": "+"}) + elif part.endswith("*"): + pattern.append({"ORTH": part[0], "OP": "*"}) + elif part.endswith("?"): + pattern.append({"ORTH": part[0], "OP": "?"}) + else: + pattern.append({"ORTH": part}) + matcher.add("PATTERN", [pattern], greedy="LONGEST") + matches = matcher(doc, with_alignments=True) + n_matches = len(matches) + + _, s, e, expected = matches[0] + + assert expected == result, (string, pattern_str, s, e, n_matches) + + +def test_matcher_with_alignments_nongreedy(en_vocab): + cases = [ + (0, "aaab", "a* b", [[0, 1], [0, 0, 
1], [0, 0, 0, 1], [1]]), + (1, "baab", "b a* b", [[0, 1, 1, 2]]), + (2, "aaab", "a a a b", [[0, 1, 2, 3]]), + (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]), + (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]), + ( + 5, + "aabaa", + "a+ b a+", + [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]], + ), + (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]), + (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]), + (8, "baab", "b a* b b*", [[0, 1, 1, 2]]), + ( + 9, + "aabb", + "a* b* a*", + [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]], + ), + (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]), + (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]), + (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]), + (13, "aaab", "a+ a a", [[0, 1, 2]]), + (14, "aaab", "a+ a a?", [[0, 1], [0, 1, 2]]), + (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]), + (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]), + (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]), + ] + for case_id, string, pattern_str, results in cases: + matcher = Matcher(en_vocab) + doc = Doc(matcher.vocab, words=list(string)) + pattern = [] + for part in pattern_str.split(): + if part.endswith("+"): + pattern.append({"ORTH": part[0], "OP": "+"}) + elif part.endswith("*"): + pattern.append({"ORTH": part[0], "OP": "*"}) + elif part.endswith("?"): + pattern.append({"ORTH": part[0], "OP": "?"}) + else: + pattern.append({"ORTH": part}) + + matcher.add("PATTERN", [pattern]) + matches = matcher(doc, with_alignments=True) + n_matches = len(matches) + + for _, s, e, expected in matches: + assert expected in results, (case_id, string, pattern_str, s, e, n_matches) + assert len(expected) == e - s diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 1b81fd780..478949601 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -318,3 +318,44 @@ def test_phrase_matcher_deprecated(en_vocab): pass assert record.list assert "spaCy v3.0" in str(record.list[0].message) + + +@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) +def test_phrase_matcher_sent_start(en_vocab, attr): + _ = PhraseMatcher(en_vocab, attr=attr) # noqa: F841 + + +def test_span_in_phrasematcher(en_vocab): + """Ensure that PhraseMatcher accepts Span and Doc as input""" + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches_doc = matcher(doc) + matches_span = matcher(span) + assert len(matches_doc) == 1 + assert len(matches_span) == 1 + + +def test_span_v_doc_in_phrasematcher(en_vocab): + """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc""" + # fmt: off + words = [ + "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans", + "and", "Docs", "in", "my", "matchers", "," "and", "Spans", "and", "Docs", + "everywhere", "." 
+ ] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[9:15] # second clause + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches_doc = matcher(doc) + matches_span = matcher(span) + assert len(matches_doc) == 3 + assert len(matches_span) == 1 diff --git a/spacy/tests/package/__init__.py b/spacy/tests/package/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index a0e43ccfa..75908df59 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -6,21 +6,26 @@ def test_build_dependencies(): # Check that library requirements are pinned exactly the same across different setup files. # TODO: correct checks for numpy rather than ignoring libs_ignore_requirements = [ - "numpy", "pytest", "pytest-timeout", "mock", "flake8", + "hypothesis", + "pre-commit", + "mypy", + "types-dataclasses", + "types-mock", + "types-requests", ] # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ - "numpy", "fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core", "spacy-pkuseg", + "thinc-apple-ops", ] # check requirements.txt diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 2f750b60c..f89e993e9 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,6 +1,7 @@ import pytest from thinc.api import Adam, fix_random_seed from spacy import registry +from spacy.language import Language from spacy.attrs import NORM from spacy.vocab import Vocab from spacy.training import Example @@ -17,14 +18,9 @@ def vocab(): @pytest.fixture def parser(vocab): - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model, **config) + parser = DependencyParser(vocab, model) return parser @@ -76,19 +72,14 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner1 = EntityRecognizer(Vocab(), model, **config) + ner1 = EntityRecognizer(Vocab(), model) ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") ner1.initialize(lambda: [_ner_example(ner1)]) - ner2 = EntityRecognizer(Vocab(), model, **config) + ner2 = EntityRecognizer(Vocab(), model) # the second model needs to be resized before we can call from_bytes ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves) @@ -112,14 +103,56 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config): """ labels = ["A", "B", "C"] model = registry.resolve({"model": model_config}, validate=True)["model"] - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } - pipe = pipe_cls(Vocab(), model, **config) + pipe = pipe_cls(Vocab(), model) for label in labels: pipe.add_label(label) assert len(pipe.move_names) == len(labels) * n_moves pipe_labels = sorted(list(pipe.labels)) assert pipe_labels == labels + + +def test_ner_labels_added_implicitly_on_predict(): + nlp = Language() + ner = nlp.add_pipe("ner") + for label in ["A", "B", "C"]: + ner.add_label(label) + nlp.initialize() + 
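+    # "D" was never added via add_label; annotating a doc that uses it should add the label implicitly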
doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"]) + ner(doc) + assert [t.ent_type_ for t in doc] == ["D", ""] + assert "D" in ner.labels + + +def test_ner_labels_added_implicitly_on_beam_parse(): + nlp = Language() + ner = nlp.add_pipe("beam_ner") + for label in ["A", "B", "C"]: + ner.add_label(label) + nlp.initialize() + doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"]) + ner.beam_parse([doc], beam_width=32) + assert "D" in ner.labels + + +def test_ner_labels_added_implicitly_on_greedy_parse(): + nlp = Language() + ner = nlp.add_pipe("beam_ner") + for label in ["A", "B", "C"]: + ner.add_label(label) + nlp.initialize() + doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"]) + ner.greedy_parse([doc]) + assert "D" in ner.labels + + +def test_ner_labels_added_implicitly_on_update(): + nlp = Language() + ner = nlp.add_pipe("ner") + for label in ["A", "B", "C"]: + ner.add_label(label) + nlp.initialize() + doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"]) + example = Example(nlp.make_doc(doc.text), doc) + assert "D" not in ner.labels + nlp.update([example]) + assert "D" in ner.labels diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 66c22c60b..cba6fa81e 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -130,14 +130,9 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(doc.vocab, model, **config) + parser = DependencyParser(doc.vocab, model) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 0ff5c5a66..587d1fff1 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -2,18 +2,19 @@ import pytest from numpy.testing import assert_equal from spacy.attrs import ENT_IOB -from spacy import util +from spacy import util, registry from spacy.lang.en import English from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from spacy.vocab import Vocab import logging from ..util import make_tempdir - +from ...pipeline import EntityRecognizer +from ...pipeline.ner import DEFAULT_NER_MODEL TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), @@ -21,6 +22,11 @@ TRAIN_DATA = [ ] +@pytest.fixture +def neg_key(): + return "non_entities" + + @pytest.fixture def vocab(): return Vocab() @@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots): assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"] -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): - entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots] +def test_negative_samples_two_word_input(tsys, vocab, neg_key): + """Test that we don't get stuck in a two word input when we have a negative + span. This could happen if we don't have the right check on the B action. 
+ """ + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A", "B"]) + entity_annots = [None, None] example = Example.from_dict(doc, {"entities": entity_annots}) - ex_dict = example.to_dict() - - for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]): - if tag == "L-!GPE": - ex_dict["doc_annotation"]["entities"][i] = "-" - example = Example.from_dict(doc, ex_dict) - + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 2, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[0] != "B-PERSON" + assert names[1] != "L-PERSON" -def test_get_oracle_moves_negative_entities2(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] +def test_negative_samples_three_word_input(tsys, vocab, neg_key): + """Test that we exclude a 2-word entity correctly using a negative example.""" + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A", "B", "C"]) + entity_annots = [None, None, None] example = Example.from_dict(doc, {"entities": entity_annots}) + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 2, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[1] != "B-PERSON" -@pytest.mark.skip(reason="Maybe outdated? Unsure") -def test_get_oracle_moves_negative_O(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["O", "!O", "O", "!O"] +def test_negative_samples_U_entity(tsys, vocab, neg_key): + """Test that we exclude a 2-word entity correctly using a negative example.""" + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A"]) + entity_annots = [None] example = Example.from_dict(doc, {"entities": entity_annots}) + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 1, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[0] != "U-PERSON" + + +def test_negative_sample_key_is_in_config(vocab, entity_types): + actions = BiluoPushDown.get_actions(entity_types=entity_types) + tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities") + assert tsys.cfg["neg_key"] == "non_entities" # We can't easily represent this on a Doc object. 
Not sure what the best solution @@ -213,6 +250,27 @@ def test_train_empty(): nlp.update(batch, losses=losses) +def test_train_negative_deprecated(): + """Test that the deprecated negative entity format raises a custom error.""" + train_data = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}), + ] + + nlp = English() + train_examples = [] + for t in train_data: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + ner = nlp.add_pipe("ner", last=True) + ner.add_label("PERSON") + nlp.initialize() + for itn in range(2): + losses = {} + batches = util.minibatch(train_examples, size=8) + for batch in batches: + with pytest.raises(ValueError): + nlp.update(batch, losses=losses) + + def test_overwrite_token(): nlp = English() nlp.add_pipe("ner") @@ -252,12 +310,12 @@ def test_ruler_before_ner(): # 1 : Entity Ruler - should set "this" to B and everything else to empty patterns = [{"label": "THING", "pattern": "This"}] ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") nlp.initialize() + ruler.add_patterns(patterns) doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] @@ -265,6 +323,16 @@ def test_ruler_before_ner(): assert [token.ent_type_ for token in doc] == expected_types +def test_ner_constructor(en_vocab): + config = { + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_NER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + EntityRecognizer(en_vocab, model, **config) + EntityRecognizer(en_vocab, model) + + def test_ner_before_ruler(): """Test that an entity_ruler works after an NER: the second can overwrite O annotations""" nlp = English() @@ -356,6 +424,26 @@ def test_overfitting_IO(): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + # test that kb_id is preserved + test_text = "I like London and London." + doc = nlp.make_doc(test_text) + doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)] + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + assert ents[0].kb_id == 1234 + doc = nlp.get_pipe("ner")(doc) + ents = doc.ents + assert len(ents) == 2 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + assert ents[0].kb_id == 1234 + # ent added by ner has kb_id == 0 + assert ents[1].text == "London" + assert ents[1].label_ == "LOC" + assert ents[1].kb_id == 0 + @pytest.mark.xfail(reason="no beam parser yet") def test_beam_ner_scores(): @@ -394,7 +482,7 @@ def test_beam_ner_scores(): @pytest.mark.xfail(reason="no beam parser yet") -def test_beam_overfitting_IO(): +def test_beam_overfitting_IO(neg_key): # Simple test to try and quickly overfit the Beam NER component nlp = English() beam_width = 16 @@ -402,6 +490,7 @@ def test_beam_overfitting_IO(): config = { "beam_width": beam_width, "beam_density": beam_density, + "incorrect_spans_key": neg_key, } ner = nlp.add_pipe("beam_ner", config=config) train_examples = [] @@ -418,12 +507,13 @@ def test_beam_overfitting_IO(): assert losses["beam_ner"] < 0.0001 # test the scores from the beam - test_text = "I like London." 
+ test_text = "I like London" docs = [nlp.make_doc(test_text)] beams = ner.predict(docs) entity_scores = ner.scored_ents(beams)[0] assert entity_scores[(2, 3, "LOC")] == 1.0 assert entity_scores[(2, 3, "PERSON")] == 0.0 + assert len(nlp(test_text).ents) == 1 # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -436,6 +526,108 @@ def test_beam_overfitting_IO(): assert entity_scores2[(2, 3, "LOC")] == 1.0 assert entity_scores2[(2, 3, "PERSON")] == 0.0 + # Try to unlearn the entity by using negative annotations + neg_doc = nlp.make_doc(test_text) + neg_ex = Example(neg_doc, neg_doc) + neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")] + neg_train_examples = [neg_ex] + + for i in range(20): + losses = {} + nlp.update(neg_train_examples, sgd=optimizer, losses=losses) + + # test the "untrained" model + assert len(nlp(test_text).ents) == 0 + + +def test_neg_annotation(neg_key): + """Check that the NER update works with a negative annotation that is a different label of the correct one, + or partly overlapping, etc""" + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + ner = nlp.add_pipe("beam_ner", config=config) + train_text = "Who is Shaka Khan?" + neg_doc = nlp.make_doc(train_text) + ner.add_label("PERSON") + ner.add_label("ORG") + example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) + example.reference.spans[neg_key] = [ + Span(neg_doc, 2, 4, "ORG"), + Span(neg_doc, 2, 3, "PERSON"), + Span(neg_doc, 1, 4, "PERSON"), + ] + + optimizer = nlp.initialize() + for i in range(2): + losses = {} + nlp.update([example], sgd=optimizer, losses=losses) + + +def test_neg_annotation_conflict(neg_key): + # Check that NER raises for a negative annotation that is THE SAME as a correct one + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + ner = nlp.add_pipe("beam_ner", config=config) + train_text = "Who is Shaka Khan?" 
+ neg_doc = nlp.make_doc(train_text) + ner.add_label("PERSON") + ner.add_label("LOC") + example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) + example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")] + assert len(example.reference.ents) == 1 + assert example.reference.ents[0].text == "Shaka Khan" + assert example.reference.ents[0].label_ == "PERSON" + assert len(example.reference.spans[neg_key]) == 1 + assert example.reference.spans[neg_key][0].text == "Shaka Khan" + assert example.reference.spans[neg_key][0].label_ == "PERSON" + + optimizer = nlp.initialize() + for i in range(2): + losses = {} + with pytest.raises(ValueError): + nlp.update([example], sgd=optimizer, losses=losses) + + +def test_beam_valid_parse(neg_key): + """Regression test for previously flakey behaviour""" + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + nlp.add_pipe("beam_ner", config=config) + # fmt: off + tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.'] + iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + # fmt: on + + doc = Doc(nlp.vocab, words=tokens) + example = Example.from_dict(doc, {"ner": iob}) + neg_span = Span(doc, 50, 53, "ORG") + example.reference.spans[neg_key] = [neg_span] + + optimizer = nlp.initialize() + + for i in range(5): + losses = {} + nlp.update([example], sgd=optimizer, losses=losses) + assert "beam_ner" in losses + def test_ner_warns_no_lookups(caplog): nlp = English() diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 64c71f821..574963f1f 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -5,10 +5,11 @@ from spacy.attrs import DEP from spacy.lang.en import English from spacy.training import Example from spacy.tokens import Doc -from spacy import util +from spacy import util, registry from ..util import apply_transition_sequence, make_tempdir - +from ...pipeline import DependencyParser +from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL TRAIN_DATA = [ ( @@ -217,6 +218,18 @@ def test_parser_set_sent_starts(en_vocab): assert token.head in sent +def test_parser_constructor(en_vocab): + config = { + "learn_tokens": False, + "min_action_freq": 30, + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + DependencyParser(en_vocab, model, **config) + DependencyParser(en_vocab, model) + + @pytest.mark.parametrize("pipe_name", PARSERS) def test_incomplete_data(pipe_name): # Test that the parser works with incomplete information diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index 8ca4039a2..50da60594 100644 --- 
a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -69,7 +69,7 @@ def heads(): def test_parser_parse_navigate_consistency(en_vocab, words, heads): - doc = Doc(en_vocab, words=words, heads=heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) for head in doc: for child in head.lefts: assert child.head == head @@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads): def test_parser_parse_navigate_edges(en_vocab, words, heads): - doc = Doc(en_vocab, words=words, heads=heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) for token in doc: subtree = list(token.subtree) debug = "\t".join((token.text, token.left_edge.text, subtree[0].text)) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 595bfa537..d71388900 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -23,14 +23,9 @@ def _parser_example(parser): @pytest.fixture def parser(vocab): vocab.strings.add("ROOT") - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model, **config) + parser = DependencyParser(vocab, model) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 # parser.add_label('right') diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py new file mode 100644 index 000000000..869b8b874 --- /dev/null +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -0,0 +1,112 @@ +from typing import Callable, Iterable, Iterator +import pytest + +from thinc.api import Config +from spacy.language import Language +from spacy.training import Example +from spacy.training.loop import train +from spacy.lang.en import English +from spacy.util import registry, load_model_from_config + + +@pytest.fixture +def config_str(): + return """ + [nlp] + lang = "en" + pipeline = ["sentencizer","assert_sents"] + disabled = [] + before_creation = null + after_creation = null + after_pipeline_creation = null + batch_size = 1000 + tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + + [components] + + [components.assert_sents] + factory = "assert_sents" + + [components.sentencizer] + factory = "sentencizer" + punct_chars = null + + [training] + dev_corpus = "corpora.dev" + train_corpus = "corpora.train" + annotating_components = ["sentencizer"] + max_steps = 2 + + [corpora] + + [corpora.dev] + @readers = "unannotated_corpus" + + [corpora.train] + @readers = "unannotated_corpus" + """ + + +def test_annotates_on_update(): + # The custom component checks for sentence annotation + @Language.factory("assert_sents", default_config={}) + def assert_sents(nlp, name): + return AssertSents(name) + + class AssertSents: + def __init__(self, name, **cfg): + self.name = name + pass + + def __call__(self, doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("No sents") + return doc + + def update(self, examples, *, drop=0.0, sgd=None, losses=None): + for example in examples: + if not example.predicted.has_annotation("SENT_START"): + raise ValueError("No sents") + return {} + + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("assert_sents") + + # When the pipeline runs, annotations are set + nlp("This is a sentence.") + + examples = [] + for text in ["a a", "b b", "c c"]: + 
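+        # each Example pairs an unannotated predicted doc with a reference doc produced by the full pipeline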
examples.append(Example(nlp.make_doc(text), nlp(text))) + + for example in examples: + assert not example.predicted.has_annotation("SENT_START") + + # If updating without setting annotations, assert_sents will raise an error + with pytest.raises(ValueError): + nlp.update(examples) + + # Updating while setting annotations for the sentencizer succeeds + nlp.update(examples, annotates=["sentencizer"]) + + +def test_annotating_components_from_config(config_str): + @registry.readers("unannotated_corpus") + def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]: + return UnannotatedCorpus() + + class UnannotatedCorpus: + def __call__(self, nlp: Language) -> Iterator[Example]: + for text in ["a a", "b b", "c c"]: + doc = nlp.make_doc(text) + yield Example(doc, doc) + + orig_config = Config().from_str(config_str) + nlp = load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.config["training"]["annotating_components"] == ["sentencizer"] + train(nlp) + + nlp.config["training"]["annotating_components"] = [] + with pytest.raises(ValueError): + train(nlp) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 743800536..a98d01964 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -2,7 +2,7 @@ from typing import Callable, Iterable import pytest from numpy.testing import assert_equal from spacy.attrs import ENT_KB_ID - +from spacy.compat import pickle from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.vocab import Vocab @@ -154,6 +154,40 @@ def test_kb_serialize(nlp): mykb.from_disk(d / "unknown" / "kb") +@pytest.mark.issue(9137) +def test_kb_serialize_2(nlp): + v = [5, 6, 7, 8] + kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4) + kb1.set_entities(["E1"], [1], [v]) + assert kb1.get_vector("E1") == v + with make_tempdir() as d: + kb1.to_disk(d / "kb") + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4) + kb2.from_disk(d / "kb") + assert kb2.get_vector("E1") == v + + +def test_kb_set_entities(nlp): + """Test that set_entities entirely overwrites the previous set of entities""" + v = [5, 6, 7, 8] + v1 = [1, 1, 1, 0] + v2 = [2, 2, 2, 3] + kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4) + kb1.set_entities(["E0"], [1], [v]) + assert kb1.get_entity_strings() == ["E0"] + kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2]) + assert set(kb1.get_entity_strings()) == {"E1", "E2"} + assert kb1.get_vector("E1") == v1 + assert kb1.get_vector("E2") == v2 + with make_tempdir() as d: + kb1.to_disk(d / "kb") + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4) + kb2.from_disk(d / "kb") + assert set(kb2.get_entity_strings()) == {"E1", "E2"} + assert kb2.get_vector("E1") == v1 + assert kb2.get_vector("E2") == v2 + + def test_kb_serialize_vocab(nlp): """Test serialization of the KB and custom strings""" entity = "MyFunnyID" @@ -230,7 +264,7 @@ def test_el_pipe_configuration(nlp): def get_lowercased_candidates(kb, span): return kb.get_alias_candidates(span.text.lower()) - @registry.misc.register("spacy.LowercaseCandidateGenerator.v1") + @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: return get_lowercased_candidates @@ -250,6 +284,16 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" +def test_nel_nsents(nlp): + """Test that n_sents can be set through the configuration""" + entity_linker = 
nlp.add_pipe("entity_linker", config={}) + assert entity_linker.n_sents == 0 + entity_linker = nlp.replace_pipe( + "entity_linker", "entity_linker", config={"n_sents": 2} + ) + assert entity_linker.n_sents == 2 + + def test_vocab_serialization(nlp): """Test that string information is retained across storage""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -282,6 +326,9 @@ def test_vocab_serialization(nlp): assert candidates[0].alias == adam_hash assert candidates[0].alias_ == "adam" + assert kb_new_vocab.get_vector("Q2") == [2] + assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) + def test_append_alias(nlp): """Test that we can append additional alias-entity pairs""" @@ -313,6 +360,7 @@ def test_append_alias(nlp): assert len(mykb.get_alias_candidates("douglas")) == 3 +@pytest.mark.filterwarnings("ignore:\\[W036") def test_append_invalid_alias(nlp): """Test that append an alias will throw an error if prior probs are exceeding 1""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -331,6 +379,7 @@ def test_append_invalid_alias(nlp): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) +@pytest.mark.filterwarnings("ignore:\\[W036") def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" vector_length = 1 @@ -425,7 +474,6 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() - nlp.add_pipe("sentencizer", first=True) vector_length = 3 assert "Q2146908" not in nlp.vocab.strings @@ -465,6 +513,9 @@ def test_overfitting_IO(): nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["entity_linker"] < 0.001 + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + # Add a custom component to recognize "Russ Cochran" as an entity for the example training data patterns = [ {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} @@ -536,6 +587,106 @@ def test_kb_serialization(): assert "RandomWord" in nlp2.vocab.strings +@pytest.mark.xfail(reason="Needs fixing") +def test_kb_pickle(): + # Test that the KB can be pickled + nlp = English() + kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + assert not kb_1.contains_alias("Russ Cochran") + kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + assert kb_1.contains_alias("Russ Cochran") + data = pickle.dumps(kb_1) + kb_2 = pickle.loads(data) + assert kb_2.contains_alias("Russ Cochran") + + +@pytest.mark.xfail(reason="Needs fixing") +def test_nel_pickle(): + # Test that a pipeline with an EL component can be pickled + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=3) + kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + return kb + + nlp_1 = English() + nlp_1.add_pipe("ner") + entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True) + entity_linker_1.set_kb(create_kb) + assert nlp_1.pipe_names == ["ner", "entity_linker"] + assert entity_linker_1.kb.contains_alias("Russ Cochran") + + data = pickle.dumps(nlp_1) + nlp_2 = pickle.loads(data) + assert nlp_2.pipe_names == ["ner", "entity_linker"] + entity_linker_2 = nlp_2.get_pipe("entity_linker") + assert 
entity_linker_2.kb.contains_alias("Russ Cochran") + + +def test_kb_to_bytes(): + # Test that the KB's to_bytes method works correctly + nlp = English() + kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3]) + kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5]) + kb_1.add_alias( + alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2] + ) + assert kb_1.contains_alias("Russ Cochran") + kb_bytes = kb_1.to_bytes() + kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + assert not kb_2.contains_alias("Russ Cochran") + kb_2 = kb_2.from_bytes(kb_bytes) + # check that both KBs are exactly the same + assert kb_1.get_size_entities() == kb_2.get_size_entities() + assert kb_1.entity_vector_length == kb_2.entity_vector_length + assert kb_1.get_entity_strings() == kb_2.get_entity_strings() + assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908") + assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66") + assert kb_2.contains_alias("Russ Cochran") + assert kb_1.get_size_aliases() == kb_2.get_size_aliases() + assert kb_1.get_alias_strings() == kb_2.get_alias_strings() + assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( + kb_2.get_alias_candidates("Russ Cochran") + ) + assert len(kb_1.get_alias_candidates("Randomness")) == len( + kb_2.get_alias_candidates("Randomness") + ) + + +def test_nel_to_bytes(): + # Test that a pipeline with an EL component can be converted to bytes + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=3) + kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + return kb + + nlp_1 = English() + nlp_1.add_pipe("ner") + entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True) + entity_linker_1.set_kb(create_kb) + assert entity_linker_1.kb.contains_alias("Russ Cochran") + assert nlp_1.pipe_names == ["ner", "entity_linker"] + + nlp_bytes = nlp_1.to_bytes() + nlp_2 = English() + nlp_2.add_pipe("ner") + nlp_2.add_pipe("entity_linker", last=True) + assert nlp_2.pipe_names == ["ner", "entity_linker"] + assert not nlp_2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran") + nlp_2 = nlp_2.from_bytes(nlp_bytes) + kb_2 = nlp_2.get_pipe("entity_linker").kb + assert kb_2.contains_alias("Russ Cochran") + assert kb_2.get_vector("Q2146908") == [6, -4, 3] + assert_almost_equal( + kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8 + ) + + def test_scorer_links(): train_examples = [] nlp = English() diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 3f998d78d..dc0ca0301 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -5,6 +5,7 @@ from spacy.tokens import Span from spacy.language import Language from spacy.pipeline import EntityRuler from spacy.errors import MatchPatternError +from thinc.api import NumpyOps, get_current_ops @pytest.fixture @@ -45,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns): assert doc.ents[1].label_ == "BYE" +def test_entity_ruler_no_patterns_warns(nlp): + ruler = EntityRuler(nlp) + assert len(ruler) == 0 + assert len(ruler.labels) == 0 + nlp.add_pipe("entity_ruler") + assert nlp.pipe_names == ["entity_ruler"] + with pytest.warns(UserWarning): + 
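+        # processing text with an empty entity_ruler should warn and set no entities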
doc = nlp("hello world bye bye") + assert len(doc.ents) == 0 + + def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns ruler = nlp.add_pipe("entity_ruler") @@ -77,6 +89,20 @@ def test_entity_ruler_init_clear(nlp, patterns): assert len(ruler.labels) == 0 +def test_entity_ruler_clear(nlp, patterns): + """Test that initialization clears patterns.""" + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + assert len(ruler.labels) == 4 + doc = nlp("hello world") + assert len(doc.ents) == 1 + ruler.clear() + assert len(ruler.labels) == 0 + with pytest.warns(UserWarning): + doc = nlp("hello world") + assert len(doc.ents) == 0 + + def test_entity_ruler_existing(nlp, patterns): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) @@ -201,13 +227,14 @@ def test_entity_ruler_overlapping_spans(nlp): @pytest.mark.parametrize("n_process", [1, 2]) def test_entity_ruler_multiprocessing(nlp, n_process): - texts = ["I enjoy eating Pizza Hut pizza."] + if isinstance(get_current_ops, NumpyOps) or n_process < 2: + texts = ["I enjoy eating Pizza Hut pizza."] - patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}] + patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) - for doc in nlp.pipe(texts, n_process=2): - for ent in doc.ents: - assert ent.ent_id_ == "1234" + for doc in nlp.pipe(texts, n_process=2): + for ent in doc.ents: + assert ent.ent_id_ == "1234" diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 1943d3dd7..0d2d3d6e5 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,6 +1,5 @@ import pytest -import logging -import mock +import pickle from spacy import util, registry from spacy.lang.en import English from spacy.lookups import Lookups @@ -58,10 +57,10 @@ def test_lemmatizer_config(nlp): # warning if no POS assigned doc = nlp.make_doc("coping") - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): doc = lemmatizer(doc) - mock_warning.assert_called_once() + # warns once by default + doc = lemmatizer(doc) # works with POS doc = nlp.make_doc("coping") @@ -106,6 +105,9 @@ def test_lemmatizer_serialize(nlp): doc2 = nlp2.make_doc("coping") doc2[0].pos_ = "VERB" assert doc2[0].lemma_ == "" - doc2 = lemmatizer(doc2) + doc2 = lemmatizer2(doc2) assert doc2[0].text == "coping" assert doc2[0].lemma_ == "cope" + + # Make sure that lemmatizer cache can be pickled + pickle.dumps(lemmatizer2) diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index d04ac9cd4..e3fd28d0f 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -4,7 +4,7 @@ import numpy import pytest from numpy.testing import assert_almost_equal from spacy.vocab import Vocab -from thinc.api import NumpyOps, Model, data_validation +from thinc.api import Model, data_validation, get_current_ops from thinc.types import Array2d, Ragged from spacy.lang.en import English @@ -13,7 +13,7 @@ from spacy.ml._character_embed import CharacterEmbed from spacy.tokens import Doc -OPS = NumpyOps() +OPS = get_current_ops() texts = ["These are 4 words", "Here just three"] l0 = [[1, 2], [3, 4], [5, 6], [7, 8]] @@ -82,7 +82,9 @@ def util_batch_unbatch_docs_list( Y_batched = model.predict(in_data) 
Y_not_batched = [model.predict([u])[0] for u in in_data] for i in range(len(Y_batched)): - assert_almost_equal(Y_batched[i], Y_not_batched[i], decimal=4) + assert_almost_equal( + OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4 + ) def util_batch_unbatch_docs_array( @@ -91,7 +93,7 @@ def util_batch_unbatch_docs_array( with data_validation(True): model.initialize(in_data, out_data) Y_batched = model.predict(in_data).tolist() - Y_not_batched = [model.predict([u])[0] for u in in_data] + Y_not_batched = [model.predict([u])[0].tolist() for u in in_data] assert_almost_equal(Y_batched, Y_not_batched, decimal=4) @@ -100,8 +102,8 @@ def util_batch_unbatch_docs_ragged( ): with data_validation(True): model.initialize(in_data, out_data) - Y_batched = model.predict(in_data) + Y_batched = model.predict(in_data).data.tolist() Y_not_batched = [] for u in in_data: Y_not_batched.extend(model.predict([u]).data.tolist()) - assert_almost_equal(Y_batched.data, Y_not_batched, decimal=4) + assert_almost_equal(Y_batched, Y_not_batched, decimal=4) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 2af4b1efb..0c2554727 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -135,8 +135,8 @@ def test_pipe_class_component_defaults(): self, nlp: Language, name: str, - value1: StrictInt = 10, - value2: StrictStr = "hello", + value1: StrictInt = StrictInt(10), + value2: StrictStr = StrictStr("hello"), ): self.nlp = nlp self.value1 = value1 @@ -160,7 +160,7 @@ def test_pipe_class_component_model(): "@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": { - "@architectures": "spacy.TextCatBOW.v1", + "@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False, @@ -196,7 +196,11 @@ def test_pipe_class_component_model_custom(): @Language.factory(name, default_config=default_config) class Component: def __init__( - self, nlp: Language, model: Model, name: str, value1: StrictInt = 10 + self, + nlp: Language, + model: Model, + name: str, + value1: StrictInt = StrictInt(10), ): self.nlp = nlp self.model = model @@ -332,24 +336,44 @@ def test_language_factories_invalid(): @pytest.mark.parametrize( - "weights,expected", + "weights,override,expected", [ - ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}), - ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}), ( [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}], + {}, {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17}, ), ( - [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], - {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, + [{"a": 100, "b": 300}, {"c": 50, "d": 50}], + {}, + {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1}, + ), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}), + ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}), + ( + [{"a": 0.0, "b": 0.0}, {"c": 0.0}], + {"c": 0.2}, + {"a": 0.0, "b": 0.0, "c": 1.0}, + ), + ( + [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], + {"a": 0.0, "b": 0.0}, + {"a": 0.0, "b": 0.0, "c": 
0.5, "d": 0.5}, + ), + ( + [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], + {"a": 0.0, "b": 0.0, "f": 0.0}, + {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0}, ), - ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}), ], ) -def test_language_factories_combine_score_weights(weights, expected): - result = combine_score_weights(weights) +def test_language_factories_combine_score_weights(weights, override, expected): + result = combine_score_weights(weights, override) assert sum(result.values()) in (0.99, 1.0, 0.0) assert result == expected @@ -375,17 +399,17 @@ def test_language_factories_scores(): # Test with custom defaults config = nlp.config.copy() config["training"]["score_weights"]["a1"] = 0.0 - config["training"]["score_weights"]["b3"] = 1.0 + config["training"]["score_weights"]["b3"] = 1.3 nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} + expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65} assert score_weights == expected # Test with null values config = nlp.config.copy() config["training"]["score_weights"]["a1"] = None nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} + expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66} assert score_weights == expected @@ -402,6 +426,36 @@ def test_pipe_factories_from_source(): nlp.add_pipe("custom", source=source_nlp) +def test_pipe_factories_from_source_language_subclass(): + class CustomEnglishDefaults(English.Defaults): + stop_words = set(["custom", "stop"]) + + @registry.languages("custom_en") + class CustomEnglish(English): + lang = "custom_en" + Defaults = CustomEnglishDefaults + + source_nlp = English() + source_nlp.add_pipe("tagger") + + # custom subclass + nlp = CustomEnglish() + nlp.add_pipe("tagger", source=source_nlp) + assert "tagger" in nlp.pipe_names + + # non-subclass + nlp = German() + nlp.add_pipe("tagger", source=source_nlp) + assert "tagger" in nlp.pipe_names + + # mismatched vectors + nlp = English() + nlp.vocab.vectors.resize((1, 4)) + nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4]) + with pytest.warns(UserWarning): + nlp.add_pipe("tagger", source=source_nlp) + + def test_pipe_factories_from_source_custom(): """Test adding components from a source model with custom components.""" name = "test_pipe_factories_from_source_custom" @@ -451,13 +505,27 @@ def test_pipe_factories_from_source_config(): assert config["arg"] == "world" -def test_pipe_factories_decorator_idempotent(): +class PipeFactoriesIdempotent: + def __init__(self, nlp, name): + ... + + def __call__(self, doc): + ... + + +@pytest.mark.parametrize( + "i,func,func2", + [ + (0, lambda nlp, name: lambda doc: doc, lambda doc: doc), + (1, PipeFactoriesIdempotent, PipeFactoriesIdempotent(None, None)), + ], +) +def test_pipe_factories_decorator_idempotent(i, func, func2): """Check that decorator can be run multiple times if the function is the same. This is especially relevant for live reloading because we don't want spaCy to raise an error if a module registering components is reloaded. 
""" - name = "test_pipe_factories_decorator_idempotent" - func = lambda nlp, name: lambda doc: doc + name = f"test_pipe_factories_decorator_idempotent_{i}" for i in range(5): Language.factory(name, func=func) nlp = Language() @@ -466,7 +534,6 @@ def test_pipe_factories_decorator_idempotent(): # Make sure it also works for component decorator, which creates the # factory function name2 = f"{name}2" - func2 = lambda doc: doc for i in range(5): Language.component(name2, func=func2) nlp = Language() diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 6a21ddfaa..87fd64307 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,7 +1,9 @@ import pytest from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names +from spacy.lang.en import English @pytest.fixture @@ -50,7 +52,7 @@ def test_cant_add_pipe_first_and_last(nlp): nlp.add_pipe("new_pipe", first=True, last=True) -@pytest.mark.parametrize("name", ["my_component"]) +@pytest.mark.parametrize("name", ["test_get_pipe"]) def test_get_pipe(nlp, name): with pytest.raises(KeyError): nlp.get_pipe(name) @@ -60,7 +62,7 @@ def test_get_pipe(nlp, name): @pytest.mark.parametrize( "name,replacement,invalid_replacement", - [("my_component", "other_pipe", lambda doc: doc)], + [("test_replace_pipe", "other_pipe", lambda doc: doc)], ) def test_replace_pipe(nlp, name, replacement, invalid_replacement): with pytest.raises(ValueError): @@ -83,9 +85,9 @@ def test_replace_last_pipe(nlp): def test_replace_pipe_config(nlp): nlp.add_pipe("entity_linker") nlp.add_pipe("sentencizer") - assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is True + assert nlp.get_pipe("entity_linker").incl_prior is True nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False}) - assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is False + assert nlp.get_pipe("entity_linker").incl_prior is False @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")]) @@ -417,3 +419,46 @@ def test_pipe_methods_initialize(): assert "test" in nlp.config["initialize"]["components"] nlp.remove_pipe("test") assert "test" not in nlp.config["initialize"]["components"] + + +def test_update_with_annotates(): + name = "test_with_annotates" + results = {} + + def make_component(name): + results[name] = "" + + def component(doc): + nonlocal results + results[name] += doc.text + return doc + + return component + + Language.component(f"{name}1", func=make_component(f"{name}1")) + Language.component(f"{name}2", func=make_component(f"{name}2")) + + components = set([f"{name}1", f"{name}2"]) + + nlp = English() + texts = ["a", "bb", "ccc"] + examples = [] + for text in texts: + examples.append(Example(nlp.make_doc(text), nlp.make_doc(text))) + + for components_to_annotate in [ + [], + [f"{name}1"], + [f"{name}1", f"{name}2"], + [f"{name}2", f"{name}1"], + ]: + for key in results: + results[key] = "" + nlp = English(vocab=nlp.vocab) + nlp.add_pipe(f"{name}1") + nlp.add_pipe(f"{name}2") + nlp.update(examples, annotates=components_to_annotate) + for component in components_to_annotate: + assert results[component] == "".join(eg.predicted.text for eg in examples) + for component in components - set(components_to_annotate): + assert results[component] == "" diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py new file mode 100644 index 
000000000..5c3a9d27d --- /dev/null +++ b/spacy/tests/pipeline/test_spancat.py @@ -0,0 +1,367 @@ +import pytest +import numpy +from numpy.testing import assert_array_equal, assert_almost_equal +from thinc.api import get_current_ops + +from spacy import util +from spacy.lang.en import English +from spacy.language import Language +from spacy.tokens import SpanGroup +from spacy.tokens._dict_proxies import SpanGroups +from spacy.training import Example +from spacy.util import fix_random_seed, registry, make_tempdir + +OPS = get_current_ops() + +SPAN_KEY = "labeled_spans" + +TRAIN_DATA = [ + ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}), + ( + "I like London and Berlin.", + {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}}, + ), +] + +TRAIN_DATA_OVERLAPPING = [ + ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}), + ( + "I like London and Berlin", + {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}}, + ), +] + + +def make_examples(nlp, data=TRAIN_DATA): + train_examples = [] + for t in data: + eg = Example.from_dict(nlp.make_doc(t[0]), t[1]) + train_examples.append(eg) + return train_examples + + +def test_no_label(): + nlp = Language() + nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + with pytest.raises(ValueError): + nlp.initialize() + + +def test_no_resize(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat.add_label("Thing") + spancat.add_label("Phrase") + assert spancat.labels == ("Thing", "Phrase") + nlp.initialize() + assert spancat.model.get_dim("nO") == 2 + # this throws an error because the spancat can't be resized after initialization + with pytest.raises(ValueError): + spancat.add_label("Stuff") + + +def test_implicit_labels(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + assert len(spancat.labels) == 0 + train_examples = make_examples(nlp) + nlp.initialize(get_examples=lambda: train_examples) + assert spancat.labels == ("PERSON", "LOC") + + +def test_explicit_labels(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + assert len(spancat.labels) == 0 + spancat.add_label("PERSON") + spancat.add_label("LOC") + nlp.initialize() + assert spancat.labels == ("PERSON", "LOC") + + +def test_doc_gc(): + # If the Doc object is garbage collected, the spans won't be functional afterwards + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat.add_label("PERSON") + nlp.initialize() + texts = [ + "Just a sentence.", + "I like London and Berlin", + "I like Berlin", + "I eat ham.", + ] + all_spans = [doc.spans for doc in nlp.pipe(texts)] + for text, spangroups in zip(texts, all_spans): + assert isinstance(spangroups, SpanGroups) + for key, spangroup in spangroups.items(): + assert isinstance(spangroup, SpanGroup) + assert len(spangroup) > 0 + with pytest.raises(RuntimeError): + span = spangroup[0] + + +@pytest.mark.parametrize( + "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)] +) +def test_make_spangroup(max_positive, nr_results): + fix_random_seed(0) + nlp = Language() + spancat = nlp.add_pipe( + "spancat", + config={"spans_key": SPAN_KEY, "threshold": 0.5, "max_positive": max_positive}, + ) + doc = nlp.make_doc("Greater London") + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) + indices = ngram_suggester([doc])[0].dataXd + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], 
[0, 2]])) + labels = ["Thing", "City", "Person", "GreatCity"] + scores = numpy.asarray( + [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" + ) + spangroup = spancat._make_span_group(doc, indices, scores, labels) + assert len(spangroup) == nr_results + + # first span is always the second token "London" + assert spangroup[0].text == "London" + assert spangroup[0].label_ == "City" + assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5) + + # second span depends on the number of positives that were allowed + assert spangroup[1].text == "Greater London" + if max_positive == 1: + assert spangroup[1].label_ == "GreatCity" + assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5) + else: + assert spangroup[1].label_ == "Thing" + assert_almost_equal(0.8, spangroup.attrs["scores"][1], 5) + + if nr_results > 2: + assert spangroup[2].text == "Greater London" + if max_positive == 2: + assert spangroup[2].label_ == "GreatCity" + assert_almost_equal(0.9, spangroup.attrs["scores"][2], 5) + else: + assert spangroup[2].label_ == "City" + assert_almost_equal(0.7, spangroup.attrs["scores"][2], 5) + + assert spangroup[-1].text == "Greater London" + assert spangroup[-1].label_ == "GreatCity" + assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5) + + +def test_ngram_suggester(en_tokenizer): + # test different n-gram lengths + for size in [1, 2, 3]: + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[size]) + docs = [ + en_tokenizer(text) + for text in [ + "a", + "a b", + "a b c", + "a b c d", + "a b c d e", + "a " * 100, + ] + ] + ngrams = ngram_suggester(docs) + # span sizes are correct + for s in ngrams.data: + assert s[1] - s[0] == size + # spans are within docs + offset = 0 + for i, doc in enumerate(docs): + spans = ngrams.dataXd[offset : offset + ngrams.lengths[i]] + spans_set = set() + for span in spans: + assert 0 <= span[0] < len(doc) + assert 0 < span[1] <= len(doc) + spans_set.add((int(span[0]), int(span[1]))) + # spans are unique + assert spans.shape[0] == len(spans_set) + offset += ngrams.lengths[i] + # the number of spans is correct + assert_array_equal( + OPS.to_numpy(ngrams.lengths), + [max(0, len(doc) - (size - 1)) for doc in docs], + ) + + # test 1-3-gram suggestions + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3]) + docs = [ + en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"] + ] + ngrams = ngram_suggester(docs) + assert_array_equal(OPS.to_numpy(ngrams.lengths), [1, 3, 6, 9, 12]) + assert_array_equal( + OPS.to_numpy(ngrams.data), + [ + # doc 0 + [0, 1], + # doc 1 + [0, 1], + [1, 2], + [0, 2], + # doc 2 + [0, 1], + [1, 2], + [2, 3], + [0, 2], + [1, 3], + [0, 3], + # doc 3 + [0, 1], + [1, 2], + [2, 3], + [3, 4], + [0, 2], + [1, 3], + [2, 4], + [0, 3], + [1, 4], + # doc 4 + [0, 1], + [1, 2], + [2, 3], + [3, 4], + [4, 5], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [0, 3], + [1, 4], + [2, 5], + ], + ) + + # test some empty docs + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1]) + docs = [en_tokenizer(text) for text in ["", "a", ""]] + ngrams = ngram_suggester(docs) + assert_array_equal(OPS.to_numpy(ngrams.lengths), [len(doc) for doc in docs]) + + # test all empty docs + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1]) + docs = [en_tokenizer(text) for text in ["", "", ""]] + ngrams = ngram_suggester(docs) + assert_array_equal(OPS.to_numpy(ngrams.lengths), [len(doc) for doc in docs]) + + +def test_ngram_sizes(en_tokenizer): + # 
test that the range suggester works well + size_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3]) + suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1") + range_suggester = suggester_factory(min_size=1, max_size=3) + docs = [ + en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"] + ] + ngrams_1 = size_suggester(docs) + ngrams_2 = range_suggester(docs) + assert_array_equal(OPS.to_numpy(ngrams_1.lengths), [1, 3, 6, 9, 12]) + assert_array_equal(OPS.to_numpy(ngrams_1.lengths), OPS.to_numpy(ngrams_2.lengths)) + assert_array_equal(OPS.to_numpy(ngrams_1.data), OPS.to_numpy(ngrams_2.data)) + + # one more variation + suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1") + range_suggester = suggester_factory(min_size=2, max_size=4) + ngrams_3 = range_suggester(docs) + assert_array_equal(OPS.to_numpy(ngrams_3.lengths), [0, 1, 3, 6, 9]) + + +def test_overfitting_IO(): + # Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly + fix_random_seed(0) + nlp = English() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + train_examples = make_examples(nlp) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert spancat.model.get_dim("nO") == 2 + assert set(spancat.labels) == {"LOC", "PERSON"} + + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["spancat"] < 0.01 + + # test the trained model + test_text = "I like London and Berlin" + doc = nlp(test_text) + assert doc.spans[spancat.key] == doc.spans[SPAN_KEY] + spans = doc.spans[SPAN_KEY] + assert len(spans) == 2 + assert len(spans.attrs["scores"]) == 2 + assert min(spans.attrs["scores"]) > 0.9 + assert set([span.text for span in spans]) == {"London", "Berlin"} + assert set([span.label_ for span in spans]) == {"LOC"} + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + spans2 = doc2.spans[SPAN_KEY] + assert len(spans2) == 2 + assert len(spans2.attrs["scores"]) == 2 + assert min(spans2.attrs["scores"]) > 0.9 + assert set([span.text for span in spans2]) == {"London", "Berlin"} + assert set([span.label_ for span in spans2]) == {"LOC"} + + # Test scoring + scores = nlp.evaluate(train_examples) + assert f"spans_{SPAN_KEY}_f" in scores + assert scores[f"spans_{SPAN_KEY}_p"] == 1.0 + assert scores[f"spans_{SPAN_KEY}_r"] == 1.0 + assert scores[f"spans_{SPAN_KEY}_f"] == 1.0 + + # also test that the spancat works for just a single entity in a sentence + doc = nlp("London") + assert len(doc.spans[spancat.key]) == 1 + + +def test_overfitting_IO_overlapping(): + # Test for overfitting on overlapping entities + fix_random_seed(0) + nlp = English() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + + train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert spancat.model.get_dim("nO") == 3 + assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"} + + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["spancat"] < 0.01 + + # test the trained model + test_text = "I like London and Berlin" + doc = nlp(test_text) + spans = doc.spans[SPAN_KEY] + assert len(spans) == 3 + assert len(spans.attrs["scores"]) == 3 + assert min(spans.attrs["scores"]) > 0.9 + assert 
set([span.text for span in spans]) == { + "London", + "Berlin", + "London and Berlin", + } + assert set([span.label_ for span in spans]) == {"LOC", "DOUBLE_LOC"} + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + spans2 = doc2.spans[SPAN_KEY] + assert len(spans2) == 3 + assert len(spans2.attrs["scores"]) == 3 + assert min(spans2.attrs["scores"]) > 0.9 + assert set([span.text for span in spans2]) == { + "London", + "Berlin", + "London and Berlin", + } + assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"} diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 282961755..ec14b70da 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -132,8 +132,8 @@ def test_incomplete_data(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - assert doc[1].tag_ is "V" - assert doc[2].tag_ is "J" + assert doc[1].tag_ == "V" + assert doc[2].tag_ == "J" def test_overfitting_IO(): @@ -154,20 +154,20 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - assert doc[0].tag_ is "N" - assert doc[1].tag_ is "V" - assert doc[2].tag_ is "J" - assert doc[3].tag_ is "N" + assert doc[0].tag_ == "N" + assert doc[1].tag_ == "V" + assert doc[2].tag_ == "J" + assert doc[3].tag_ == "N" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert doc2[0].tag_ is "N" - assert doc2[1].tag_ is "V" - assert doc2[2].tag_ is "J" - assert doc2[3].tag_ is "N" + assert doc2[0].tag_ == "N" + assert doc2[1].tag_ == "V" + assert doc2[2].tag_ == "J" + assert doc2[3].tag_ == "N" # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions texts = [ @@ -182,6 +182,17 @@ def test_overfitting_IO(): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + # Try to unlearn the first 'N' tag with negative annotation + neg_ex = Example.from_dict(nlp.make_doc(test_text), {"tags": ["!N", "V", "J", "N"]}) + + for i in range(20): + losses = {} + nlp.update([neg_ex], sgd=optimizer, losses=losses) + + # test the "untrained" tag + doc3 = nlp(test_text) + assert doc3[0].tag_ != "N" + def test_tagger_requires_labels(): nlp = English() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 2b01a9cc8..b134b8508 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,7 +1,7 @@ import pytest import random import numpy.random -from numpy.testing import assert_equal +from numpy.testing import assert_almost_equal from thinc.api import fix_random_seed from spacy import util from spacy.lang.en import English @@ -108,6 +108,12 @@ def test_label_types(name): textcat.add_label("answer") with pytest.raises(ValueError): textcat.add_label(9) + # textcat requires at least two labels + if name == "textcat": + with pytest.raises(ValueError): + nlp.initialize() + else: + nlp.initialize() @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) @@ -131,19 +137,129 @@ def test_implicit_label(name, get_examples): nlp.initialize(get_examples=get_examples(nlp)) -@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) -def test_no_resize(name): +# fmt: off +@pytest.mark.parametrize( + 
"name,textcat_config", + [ + # BOW + ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), + ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), + # ENSEMBLE + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}), + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}}), + # CNN + ("textcat", {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ], +) +# fmt: on +def test_no_resize(name, textcat_config): + """The old textcat architectures weren't resizable""" nlp = Language() - textcat = nlp.add_pipe(name) + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") nlp.initialize() - assert textcat.model.get_dim("nO") >= 2 + assert textcat.model.maybe_get_dim("nO") in [2, None] # this throws an error because the textcat can't be resized after initialization with pytest.raises(ValueError): textcat.add_label("NEUTRAL") +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # BOW + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), + # CNN + ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ], +) +# fmt: on +def test_resize(name, textcat_config): + """The new textcat architectures are resizable""" + nlp = Language() + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) + textcat.add_label("POSITIVE") + textcat.add_label("NEGATIVE") + assert textcat.model.maybe_get_dim("nO") in [2, None] + 
nlp.initialize() + assert textcat.model.maybe_get_dim("nO") in [2, None] + textcat.add_label("NEUTRAL") + assert textcat.model.maybe_get_dim("nO") in [3, None] + + +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # BOW + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), + # CNN + ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ], +) +# fmt: on +def test_resize_same_results(name, textcat_config): + # Ensure that the resized textcat classifiers still produce the same results for old labels + fix_random_seed(0) + nlp = English() + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) + + train_examples = [] + for text, annotations in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert textcat.model.maybe_get_dim("nO") in [2, None] + + for i in range(5): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # test the trained model before resizing + test_text = "I am happy." + doc = nlp(test_text) + assert len(doc.cats) == 2 + pos_pred = doc.cats["POSITIVE"] + neg_pred = doc.cats["NEGATIVE"] + + # test the trained model again after resizing + textcat.add_label("NEUTRAL") + doc = nlp(test_text) + assert len(doc.cats) == 3 + assert doc.cats["POSITIVE"] == pos_pred + assert doc.cats["NEGATIVE"] == neg_pred + assert doc.cats["NEUTRAL"] <= 1 + + for i in range(5): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # test the trained model again after training further with new label + doc = nlp(test_text) + assert len(doc.cats) == 3 + assert doc.cats["POSITIVE"] != pos_pred + assert doc.cats["NEGATIVE"] != neg_pred + for cat in doc.cats: + assert doc.cats[cat] <= 1 + + def test_error_with_multi_labels(): nlp = Language() nlp.add_pipe("textcat") @@ -222,8 +338,12 @@ def test_overfitting_IO(): batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)] batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)] no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]] - assert_equal(batch_cats_1, batch_cats_2) - assert_equal(batch_cats_1, no_batch_cats) + for cats_1, cats_2 in zip(batch_cats_1, batch_cats_2): + for cat in cats_1: + assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5) + for cats_1, cats_2 in zip(batch_cats_1, no_batch_cats): + for cat in cats_1: + assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5) def test_overfitting_IO_multi(): @@ -270,22 +390,26 @@ def test_overfitting_IO_multi(): batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)] batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)] no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]] - assert_equal(batch_deps_1, batch_deps_2) - assert_equal(batch_deps_1, no_batch_deps) + for cats_1, cats_2 in zip(batch_deps_1, 
batch_deps_2): + for cat in cats_1: + assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5) + for cats_1, cats_2 in zip(batch_deps_1, no_batch_deps): + for cat in cats_1: + assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5) # fmt: off @pytest.mark.parametrize( "name,train_data,textcat_config", [ - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], ) # fmt: on @@ -370,3 +494,53 @@ def test_textcat_evaluation(): assert scores["cats_micro_p"] == 4 / 5 assert scores["cats_micro_r"] == 4 / 6 + + +def test_textcat_threshold(): + # Ensure the scorer can be called with a different threshold + nlp = English() + nlp.add_pipe("textcat") + + train_examples = [] + for text, annotations in 
TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + + # score the model (it's not actually trained but that doesn't matter) + scores = nlp.evaluate(train_examples) + assert 0 <= scores["cats_score"] <= 1 + + scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0}) + assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0 + + scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0}) + macro_f = scores["cats_score"] + assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 + + scores = nlp.evaluate( + train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"} + ) + pos_f = scores["cats_score"] + assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 + assert pos_f > macro_f + + +def test_textcat_multi_threshold(): + # Ensure the scorer can be called with a different threshold + nlp = English() + nlp.add_pipe("textcat_multilabel") + + train_examples = [] + for text, annotations in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + + # score the model (it's not actually trained but that doesn't matter) + scores = nlp.evaluate(train_examples) + assert 0 <= scores["cats_score"] <= 1 + + scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0}) + assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0 + + scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0}) + assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 90052a9c8..eeea906bb 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -1,5 +1,4 @@ import pytest - from spacy.ml.models.tok2vec import build_Tok2Vec_model from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder @@ -9,11 +8,10 @@ from spacy.tokens import Doc from spacy.training import Example from spacy import util from spacy.lang.en import English -from ..util import get_batch +from thinc.api import Config, get_current_ops +from numpy.testing import assert_array_equal -from thinc.api import Config - -from numpy.testing import assert_equal +from ..util import get_batch, make_tempdir def test_empty_doc(): @@ -57,17 +55,17 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): assert doc_vec.shape == (len(doc), width) -# fmt: off @pytest.mark.parametrize( "width,embed_arch,embed_config,encode_arch,encode_config", + # fmt: off [ (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], + # fmt: on ) -# fmt: on def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config): embed_config["width"] = width encode_config["width"] = width @@ -131,8 +129,14 @@ cfg_string = """ """ TRAIN_DATA = [ - ("I like 
green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ( + "I like green eggs", + {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}, + ), + ( + "Eat blue ham", + {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}, + ), ] @@ -162,7 +166,8 @@ def test_tok2vec_listener(): doc = nlp("Running the pipeline as a whole.") doc_tensor = tagger_tok2vec.predict([doc])[0] - assert_equal(doc.tensor, doc_tensor) + ops = get_current_ops() + assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) # TODO: should this warn or error? nlp.select_pipes(disable="tok2vec") @@ -187,3 +192,224 @@ def test_tok2vec_listener_callback(): Y, get_dX = tagger.model.begin_update(docs) # assure that the backprop call works (and doesn't hit a 'None' callback) assert get_dX(Y) is not None + + +def test_replace_listeners(): + orig_config = Config().from_str(cfg_string) + nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + examples = [Example.from_dict(nlp.make_doc("x y"), {"tags": ["V", "Z"]})] + nlp.initialize(lambda: examples) + tok2vec = nlp.get_pipe("tok2vec") + tagger = nlp.get_pipe("tagger") + assert isinstance(tagger.model.layers[0], Tok2VecListener) + assert tok2vec.listener_map["tagger"][0] == tagger.model.layers[0] + assert ( + nlp.config["components"]["tok2vec"]["model"]["@architectures"] + == "spacy.Tok2Vec.v2" + ) + assert ( + nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"] + == "spacy.Tok2VecListener.v1" + ) + nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"]) + assert not isinstance(tagger.model.layers[0], Tok2VecListener) + t2v_cfg = nlp.config["components"]["tok2vec"]["model"] + assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2" + assert nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg + with pytest.raises(ValueError): + nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"]) + with pytest.raises(ValueError): + nlp.replace_listeners("tok2vec", "parser", ["model.tok2vec"]) + with pytest.raises(ValueError): + nlp.replace_listeners("tok2vec", "tagger", ["model.yolo"]) + with pytest.raises(ValueError): + nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec", "model.yolo"]) + # attempt training with the new pipeline + optimizer = nlp.initialize(lambda: examples) + for i in range(2): + losses = {} + nlp.update(examples, sgd=optimizer, losses=losses) + assert losses["tok2vec"] == 0.0 + assert losses["tagger"] > 0.0 + + +cfg_string_multi = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger", "ner"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v1" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.ner] + factory = "ner" + + [components.ner.model] + @architectures = "spacy.TransitionBasedParser.v2" + + [components.ner.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + 
@architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_replace_listeners_from_config(): + orig_config = Config().from_str(cfg_string_multi) + nlp = util.load_model_from_config(orig_config, auto_fill=True) + annots = {"tags": ["V", "Z"], "entities": [(0, 1, "A"), (1, 2, "B")]} + examples = [Example.from_dict(nlp.make_doc("x y"), annots)] + nlp.initialize(lambda: examples) + tok2vec = nlp.get_pipe("tok2vec") + tagger = nlp.get_pipe("tagger") + ner = nlp.get_pipe("ner") + assert tok2vec.listening_components == ["tagger", "ner"] + assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk()) + assert any(isinstance(node, Tok2VecListener) for node in tagger.model.walk()) + with make_tempdir() as dir_path: + nlp.to_disk(dir_path) + base_model = str(dir_path) + new_config = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": base_model}, + "tagger": { + "source": base_model, + "replace_listeners": ["model.tok2vec"], + }, + "ner": {"source": base_model}, + }, + } + new_nlp = util.load_model_from_config(new_config, auto_fill=True) + new_nlp.initialize(lambda: examples) + tok2vec = new_nlp.get_pipe("tok2vec") + tagger = new_nlp.get_pipe("tagger") + ner = new_nlp.get_pipe("ner") + assert tok2vec.listening_components == ["ner"] + assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk()) + assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk()) + t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"] + assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2" + assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg + assert ( + new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"] + == "spacy.Tok2VecListener.v1" + ) + + +cfg_string_multi_textcat = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","textcat_multilabel","tagger"] + + [components] + + [components.textcat_multilabel] + factory = "textcat_multilabel" + + [components.textcat_multilabel.model] + @architectures = "spacy.TextCatEnsemble.v2" + nO = null + + [components.textcat_multilabel.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.textcat_multilabel.model.linear_model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = false + ngram_size = 1 + no_output_layer = false + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v1" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_listeners_textcat(): + orig_config = Config().from_str(cfg_string_multi_textcat) + nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.pipe_names == ["tok2vec", "textcat_multilabel", "tagger"] + tagger = nlp.get_pipe("tagger") + 
textcat = nlp.get_pipe("textcat_multilabel") + tok2vec = nlp.get_pipe("tok2vec") + tagger_tok2vec = tagger.model.get_ref("tok2vec") + textcat_tok2vec = textcat.model.get_ref("tok2vec") + assert isinstance(tok2vec, Tok2Vec) + assert isinstance(tagger_tok2vec, Tok2VecListener) + assert isinstance(textcat_tok2vec, Tok2VecListener) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + optimizer = nlp.initialize(lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + docs = list(nlp.pipe(["Eat blue ham", "I like green eggs"])) + cats0 = docs[0].cats + assert cats0["preference"] < 0.1 + assert cats0["imperative"] > 0.9 + cats1 = docs[1].cats + assert cats1["preference"] > 0.1 + assert cats1["imperative"] < 0.9 + assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] + assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 362ba67ae..e123d2df9 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -190,14 +190,9 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(doc.vocab, model, **config) + ner = EntityRecognizer(doc.vocab, model) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") ner.add_label("GPE") diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 0505571c2..71c3768dd 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -197,7 +197,7 @@ def test_issue3555(en_vocab): def test_issue3611(): - """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + """Test whether adding n-grams in the textcat works even when n > token length of some docs""" unique_classes = ["offensive", "inoffensive"] x_train = [ "This is an offensive text", @@ -259,8 +259,6 @@ def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" config = { "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, } model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] parser = DependencyParser(Vocab(), model, **config) @@ -274,8 +272,6 @@ def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" config = { "learn_tokens": True, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, } model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] parser = DependencyParser(Vocab(), model, **config) @@ -286,7 +282,7 @@ def test_issue3830_with_subtok(): def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string """ + """Test that match IDs returned by the matcher are correct, are in the string""" doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) matcher = Matcher(en_vocab) match_id = "PATTERN" @@ -370,7 +366,7 @@ def 
test_issue3951(en_vocab): def test_issue3959(): - """ Ensure that a modified pos attribute is serialized correctly.""" + """Ensure that a modified pos attribute is serialized correctly.""" nlp = English() doc = nlp( "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 5f65faee4..1cdb6e90b 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -38,7 +38,7 @@ def test_issue4002(en_vocab): def test_issue4030(): - """ Test whether textcat works fine with empty doc """ + """Test whether textcat works fine with empty doc""" unique_classes = ["offensive", "inoffensive"] x_train = [ "This is an offensive text", @@ -237,7 +237,7 @@ def test_issue4190(): def test_issue4267(): - """ Test that running an entity_ruler after ner gives consistent results""" + """Test that running an entity_ruler after ner gives consistent results""" nlp = English() ner = nlp.add_pipe("ner") ner.add_label("PEOPLE") @@ -289,7 +289,7 @@ def test_multiple_predictions(): @pytest.mark.xfail(reason="no beam parser yet") def test_issue4313(): - """ This should not crash or exit with some strange error code """ + """This should not crash or exit with some strange error code""" beam_width = 16 beam_density = 0.0001 nlp = English() @@ -304,14 +304,14 @@ def test_issue4313(): doc = nlp("What do you think about Apple ?") assert len(ner.labels) == 1 assert "SOME_LABEL" in ner.labels - ner.add_label("MY_ORG") # TODO: not sure if we want this to be necessary... apple_ent = Span(doc, 5, 6, label="MY_ORG") doc.ents = list(doc.ents) + [apple_ent] # ensure the beam_parse still works with the new label docs = [doc] - ner = nlp.get_pipe("beam_ner") ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) + assert len(ner.labels) == 2 + assert "MY_ORG" in ner.labels def test_issue4348(): diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 6dbbc233b..effd67306 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -9,6 +9,7 @@ from spacy.language import Language from spacy.util import ensure_path, load_model_from_path import numpy import pickle +from thinc.api import NumpyOps, get_current_ops from ..util import make_tempdir @@ -151,7 +152,7 @@ def test_issue4707(): def test_issue4725_1(): - """ Ensure the pickling of the NER goes well""" + """Ensure the pickling of the NER goes well""" vocab = Vocab(vectors_name="test_vocab_add_vector") nlp = English(vocab=vocab) config = { @@ -169,21 +170,22 @@ def test_issue4725_1(): def test_issue4725_2(): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows), - # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - nlp = English(vocab=vocab) - nlp.add_pipe("ner") - nlp.initialize() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass + if isinstance(get_current_ops, NumpyOps): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), + # or because of issues with pickling the NER (cf test_issue4725_1) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + nlp.add_pipe("ner") + nlp.initialize() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass def test_issue4849(): @@ -204,10 +206,11 @@ def test_issue4849(): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) assert count_ents == 2 # USING 2 PROCESSES - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 + if isinstance(get_current_ops, NumpyOps): + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 @Language.factory("my_pipe") @@ -239,10 +242,11 @@ def test_issue4903(): nlp.add_pipe("sentencizer") nlp.add_pipe("my_pipe", after="sentencizer") text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." + if isinstance(get_current_ops(), NumpyOps): + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." 
def test_issue4924(): diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py index dbfe78679..bc9bcb982 100644 --- a/spacy/tests/regression/test_issue5001-5500.py +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -6,6 +6,7 @@ from spacy.language import Language from spacy.lang.en.syntax_iterators import noun_chunks from spacy.vocab import Vocab import spacy +from thinc.api import get_current_ops import pytest from ...util import make_tempdir @@ -54,22 +55,26 @@ def test_issue5082(): ruler.add_patterns(patterns) parsed_vectors_1 = [t.vector for t in nlp(text)] assert len(parsed_vectors_1) == 4 - numpy.testing.assert_array_equal(parsed_vectors_1[0], array1) - numpy.testing.assert_array_equal(parsed_vectors_1[1], array2) - numpy.testing.assert_array_equal(parsed_vectors_1[2], array3) - numpy.testing.assert_array_equal(parsed_vectors_1[3], array4) + ops = get_current_ops() + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4) nlp.add_pipe("merge_entities") parsed_vectors_2 = [t.vector for t in nlp(text)] assert len(parsed_vectors_2) == 3 - numpy.testing.assert_array_equal(parsed_vectors_2[0], array1) - numpy.testing.assert_array_equal(parsed_vectors_2[1], array2) - numpy.testing.assert_array_equal(parsed_vectors_2[2], array34) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) def test_issue5137(): - @Language.factory("my_component") + factory_name = "test_issue5137" + pipe_name = "my_component" + + @Language.factory(factory_name) class MyComponent: - def __init__(self, nlp, name="my_component", categories="all_categories"): + def __init__(self, nlp, name=pipe_name, categories="all_categories"): self.nlp = nlp self.categories = categories self.name = name @@ -84,17 +89,17 @@ def test_issue5137(): pass nlp = English() - my_component = nlp.add_pipe("my_component") + my_component = nlp.add_pipe(factory_name, name=pipe_name) assert my_component.categories == "all_categories" with make_tempdir() as tmpdir: nlp.to_disk(tmpdir) - overrides = {"components": {"my_component": {"categories": "my_categories"}}} + overrides = {"components": {pipe_name: {"categories": "my_categories"}}} nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe("my_component").categories == "my_categories" + assert nlp2.get_pipe(pipe_name).categories == "my_categories" def test_issue5141(en_vocab): - """ Ensure an empty DocBin does not crash on serialization """ + """Ensure an empty DocBin does not crash on serialization""" doc_bin = DocBin(attrs=["DEP", "HEAD"]) assert list(doc_bin.get_docs(en_vocab)) == [] doc_bin_bytes = doc_bin.to_bytes() diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index 8d1199e98..355ffffeb 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -1,5 +1,6 @@ import pytest -from thinc.api import Config, fix_random_seed +from numpy.testing import assert_almost_equal +from thinc.api import Config, fix_random_seed, get_current_ops from spacy.lang.en import English from 
spacy.pipeline.textcat import single_label_default_config, single_label_bow_config @@ -44,11 +45,12 @@ def test_issue5551(textcat_config): nlp.update([Example.from_dict(doc, annots)]) # Store the result of each iteration result = pipe.model.predict([doc]) - results.append(list(result[0])) + results.append(result[0]) # All results should be the same because of the fixed seed assert len(results) == 3 - assert results[0] == results[1] - assert results[0] == results[2] + ops = get_current_ops() + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) def test_issue5838(): diff --git a/spacy/tests/regression/test_issue6258.py b/spacy/tests/regression/test_issue6001-6500.py similarity index 55% rename from spacy/tests/regression/test_issue6258.py rename to spacy/tests/regression/test_issue6001-6500.py index 9ce9026c0..470b2f388 100644 --- a/spacy/tests/regression/test_issue6258.py +++ b/spacy/tests/regression/test_issue6001-6500.py @@ -1,6 +1,21 @@ -import pytest +from spacy.util import filter_spans from pydantic import ValidationError from spacy.schemas import TokenPattern, TokenPatternSchema +import pytest + + +def test_issue6207(en_tokenizer): + doc = en_tokenizer("zero one two three four five six") + + # Make spans + s1 = doc[:4] + s2 = doc[3:6] # overlaps with s1 + s3 = doc[5:7] # overlaps with s2, not s1 + + result = filter_spans((s1, s2, s3)) + assert s1 in result + assert s2 not in result + assert s3 in result def test_issue6258(): diff --git a/spacy/tests/regression/test_issue6207.py b/spacy/tests/regression/test_issue6207.py deleted file mode 100644 index 9d8b047bf..000000000 --- a/spacy/tests/regression/test_issue6207.py +++ /dev/null @@ -1,15 +0,0 @@ -from spacy.util import filter_spans - - -def test_issue6207(en_tokenizer): - doc = en_tokenizer("zero one two three four five six") - - # Make spans - s1 = doc[:4] - s2 = doc[3:6] # overlaps with s1 - s3 = doc[5:7] # overlaps with s2, not s1 - - result = filter_spans((s1, s2, s3)) - assert s1 in result - assert s2 not in result - assert s3 in result diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py new file mode 100644 index 000000000..f57e4085c --- /dev/null +++ b/spacy/tests/regression/test_issue6501-7000.py @@ -0,0 +1,230 @@ +import pytest +from spacy.lang.en import English +import numpy as np +import spacy +from spacy.tokens import Doc +from spacy.matcher import PhraseMatcher +from spacy.tokens import DocBin +from spacy.util import load_config_from_str +from spacy.training import Example +from spacy.training.initialize import init_nlp +import pickle + +from ..util import make_tempdir + + +def test_issue6730(en_vocab): + """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" + from spacy.kb import KnowledgeBase + + kb = KnowledgeBase(en_vocab, entity_vector_length=3) + kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) + + with pytest.raises(ValueError): + kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) + assert kb.contains_alias("") is False + + kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) + kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) + + with make_tempdir() as tmp_dir: + kb.to_disk(tmp_dir) + kb.from_disk(tmp_dir) + assert kb.get_size_aliases() == 2 + assert set(kb.get_alias_strings()) == {"x", "y"} + + +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent 
sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,label", + [("Welcome to Mumbai, my friend", 11, 17, "GPE")], +) +def test_issue6815_1(sentence, start_idx, end_idx, label): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, label=label) + assert span.label_ == label + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] +) +def test_issue6815_2(sentence, start_idx, end_idx, kb_id): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) + assert span.kb_id == kb_id + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,vector", + [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], +) +def test_issue6815_3(sentence, start_idx, end_idx, vector): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, vector=vector) + assert (span.vector == vector).all() + + +def test_issue6839(en_vocab): + """Ensure that PhraseMatcher accepts Span as input""" + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches = matcher(span) + assert matches + + +CONFIG_ISSUE_6908 = """ +[paths] +train = "TRAIN_PLACEHOLDER" +raw = null +init_tok2vec = null +vectors = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["textcat"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 + +[components] + +[components.textcat] +factory = "TEXTCAT_PLACEHOLDER" + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +frozen_components = [] +before_to_disk = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.components.textcat] +labels = ['label1', 'label2'] + +[initialize.tokenizer] +""" + + +@pytest.mark.parametrize( + "component_name", + ["textcat", "textcat_multilabel"], +) +def test_issue6908(component_name): + """Test intializing textcat with labels in a list""" + + def create_data(out_file): + nlp = spacy.blank("en") + doc = nlp.make_doc("Some text") + doc.cats = {"label1": 0, "label2": 1} + out_data = DocBin(docs=[doc]).to_bytes() + with out_file.open("wb") as file_: + file_.write(out_data) + + with make_tempdir() as tmp_path: + train_path = tmp_path / "train.spacy" + create_data(train_path) + config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) + config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) + config = load_config_from_str(config_str) + init_nlp(config) + + +CONFIG_ISSUE_6950 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + 
+[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +def test_issue6950(): + """Test that the nlp object with initialized tok2vec with listeners pickles + correctly (and doesn't have lambdas). + """ + nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) + nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) + pickle.dumps(nlp) + nlp("hello") + pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py new file mode 100644 index 000000000..5bb7cc08e --- /dev/null +++ b/spacy/tests/regression/test_issue7001-8000.py @@ -0,0 +1,281 @@ +from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type +from spacy.lang.en import English +from spacy.training import Example +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab +from spacy.kb import KnowledgeBase +from spacy.pipeline._parser_internals.arc_eager import ArcEager +from spacy.util import load_config_from_str, load_config +from spacy.cli.init_config import fill_config +from thinc.api import Config +from wasabi import msg + +from ..util import make_tempdir + + +def test_issue7019(): + scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} + print_textcats_auc_per_cat(msg, scores) + scores = { + "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, + "LABEL_B": {"p": None, "r": None, "f": None}, + } + print_prf_per_type(msg, scores, name="foo", type="bar") + + +CONFIG_7029 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch.""" + TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ] + nlp = English.from_config(load_config_from_str(CONFIG_7029)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + 
texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] + + +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"] + + +def test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() + + +def test_partial_links(): + # Test that having some entities on the doc without gold links, doesn't crash + TRAIN_DATA = [ + ( + "Russ Cochran his reprints include EC Comics.", + { + "links": {(0, 12): {"Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + 
ruler.add_patterns(patterns) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + assert "PERSON" in results["ents_per_type"] + assert "PERSON" in results["nel_f_per_type"] + assert "ORG" in results["ents_per_type"] + assert "ORG" not in results["nel_f_per_type"] + + +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 + + +def test_issue7065_b(): + # Test that the NEL doesn't crash when an entity crosses a sentence boundary + nlp = English() + vector_length = 3 + nlp.add_pipe("sentencizer") + text = "Mahler 's Symphony No. 8 was beautiful." + entities = [(0, 6, "PERSON"), (10, 24, "WORK")] + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } + sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + doc = nlp(text) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) + train_examples = [example] + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="No. 8", + entities=["Q270853"], + probabilities=[1.0], + ) + mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias( + alias="Mahler", + entities=["Q7304"], + probabilities=[1.0], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + # train the NEL pipe + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # Add a custom rule-based component to mimic NER + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, + { + "label": "WORK", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + }, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + # test the trained model - this should not throw E148 + doc = nlp(text) + assert doc diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py new file mode 100644 index 000000000..811952792 --- /dev/null +++ b/spacy/tests/regression/test_issue7716.py @@ -0,0 +1,54 @@ +import pytest +from thinc.api import Adam +from spacy.attrs import NORM +from spacy.vocab import Vocab +from spacy import registry +from spacy.training import Example +from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.tokens import Doc +from spacy.pipeline import DependencyParser + + +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, 
gold) + + +@pytest.fixture +def parser(vocab): + vocab.strings.add("ROOT") + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser = DependencyParser(vocab, model) + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") + parser.initialize(lambda: [_parser_example(parser)]) + sgd = Adam(0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=["a", "b", "c", "d"]) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) + return parser + + +@pytest.mark.xfail(reason="Not fixed yet") +def test_partial_annotation(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + doc[2].is_sent_start = False + # Note that if the following line is used, then doc[2].is_sent_start == False + # doc[3].is_sent_start = False + + doc = parser(doc) + assert doc[2].is_sent_start == False diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py new file mode 100644 index 000000000..e3f3b5cfa --- /dev/null +++ b/spacy/tests/regression/test_issue8168.py @@ -0,0 +1,24 @@ +import pytest +from spacy.lang.en import English + + +@pytest.mark.issue(8168) +def test_issue8168(): + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Apple"}, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], + "id": "san-francisco", + }, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], + "id": "san-francisco", + }, + ] + ruler.add_patterns(patterns) + + assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py new file mode 100644 index 000000000..6ddbe53e0 --- /dev/null +++ b/spacy/tests/regression/test_issue8190.py @@ -0,0 +1,21 @@ +import spacy +from spacy.lang.en import English +from ..util import make_tempdir + + +def test_issue8190(): + """Test that config overrides are not lost after load is complete.""" + source_cfg = { + "nlp": { + "lang": "en", + }, + "custom": {"key": "value"}, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) + + assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py new file mode 100644 index 000000000..00cd6da3b --- /dev/null +++ b/spacy/tests/regression/test_issue8216.py @@ -0,0 +1,33 @@ +import pytest + +from spacy import registry +from spacy.language import Language + + +@pytest.fixture +def nlp(): + return Language() + + +@pytest.fixture +@registry.misc("entity_ruler_patterns") +def patterns(): + return [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, + ] + + +def test_entity_ruler_fix8216(nlp, patterns): + """Test that patterns don't get added excessively.""" + ruler = 
nlp.add_pipe("entity_ruler", config={"validate": True}) + ruler.add_patterns(patterns) + pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert pattern_count > 0 + ruler.add_patterns([]) + after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert after_count == pattern_count diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ef650d7cd..b259fc8fb 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -4,7 +4,12 @@ import spacy from spacy.lang.en import English from spacy.lang.de import German from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH -from spacy.util import registry, load_model_from_config, load_config +from spacy.util import ( + registry, + load_model_from_config, + load_config, + load_config_from_str, +) from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder from spacy.schemas import ConfigSchema, ConfigSchemaPretrain @@ -138,7 +143,7 @@ subword_features = false """ -@registry.architectures.register("my_test_parser") +@registry.architectures("my_test_parser") def my_parser(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( @@ -210,7 +215,7 @@ def test_create_nlp_from_config_multiple_instances(): def test_serialize_nlp(): - """ Create a custom nlp pipeline from config and ensure it serializes it correctly """ + """Create a custom nlp pipeline from config and ensure it serializes it correctly""" nlp_config = Config().from_str(nlp_config_string) nlp = load_model_from_config(nlp_config, auto_fill=True) nlp.get_pipe("tagger").add_label("A") @@ -230,7 +235,7 @@ def test_serialize_nlp(): def test_serialize_custom_nlp(): - """ Create a custom nlp pipeline and ensure it serializes it correctly""" + """Create a custom nlp pipeline and ensure it serializes it correctly""" nlp = English() parser_cfg = dict() parser_cfg["model"] = {"@architectures": "my_test_parser"} @@ -250,7 +255,7 @@ def test_serialize_custom_nlp(): @pytest.mark.parametrize("parser_config_string", [parser_config_string_upper]) def test_serialize_parser(parser_config_string): - """ Create a non-default parser config to check nlp serializes it correctly """ + """Create a non-default parser config to check nlp serializes it correctly""" nlp = English() model_config = Config().from_str(parser_config_string) parser = nlp.add_pipe("parser", config=model_config) @@ -269,7 +274,7 @@ def test_serialize_parser(parser_config_string): def test_config_nlp_roundtrip(): - """Test that a config prduced by the nlp object passes training config + """Test that a config produced by the nlp object passes training config validation.""" nlp = English() nlp.add_pipe("entity_ruler") @@ -439,3 +444,32 @@ def test_config_only_resolve_relevant_blocks(): nlp.initialize() nlp.config["initialize"]["lookups"] = None nlp.initialize() + + +def test_hyphen_in_config(): + hyphen_config_str = """ + [nlp] + lang = "en" + pipeline = ["my_punctual_component"] + + [components] + + [components.my_punctual_component] + factory = "my_punctual_component" + punctuation = ["?","-"] + """ + + @spacy.Language.factory("my_punctual_component") + class MyPunctualComponent(object): + name = "my_punctual_component" + + def __init__( + self, + nlp, + name, + punctuation, + ): + self.punctuation = punctuation + + nlp = English.from_config(load_config_from_str(hyphen_config_str)) + assert 
nlp.get_pipe("my_punctual_component").punctuation == ["?", "-"] diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 837c128af..23afaf26c 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,5 +1,5 @@ import pytest -from spacy.tokens.doc import Underscore +from spacy.tokens.underscore import Underscore import spacy from spacy.lang.en import English @@ -64,13 +64,17 @@ def test_serialize_doc_span_groups(en_vocab): def test_serialize_doc_bin(): - doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) + doc_bin = DocBin( + attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True + ) texts = ["Some text", "Lots of texts...", "..."] cats = {"A": 0.5} nlp = English() for doc in nlp.pipe(texts): doc.cats = cats doc.spans["start"] = [doc[0:2]] + doc[0].norm_ = "UNUSUAL_TOKEN_NORM" + doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" doc_bin.add(doc) bytes_data = doc_bin.to_bytes() @@ -82,6 +86,8 @@ def test_serialize_doc_bin(): assert doc.text == texts[i] assert doc.cats == cats assert len(doc.spans) == 1 + assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" + assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" def test_serialize_doc_bin_unknown_spaces(en_vocab): diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 352c335ea..1e0ae3c76 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -3,6 +3,7 @@ from typing import Callable from spacy import util from spacy.util import ensure_path, registry, load_model_from_config from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab from thinc.api import Config from ..util import make_tempdir @@ -108,10 +109,10 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc.register("spacy.CustomKB.v1") + @registry.misc("spacy.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[["Vocab"], KnowledgeBase]: + ) -> Callable[[Vocab], KnowledgeBase]: def custom_kb_factory(vocab): kb = SubKnowledgeBase( vocab=vocab, diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 48c7082bb..05871a524 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,5 +1,5 @@ import pytest -from spacy import registry, Vocab +from spacy import registry, Vocab, load from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL @@ -60,18 +60,10 @@ def taggers(en_vocab): @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = Parser(en_vocab, model, **config) - new_parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) + new_parser = Parser(en_vocab, model) new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"])) bytes_2 = new_parser.to_bytes(exclude=["vocab"]) bytes_3 = 
parser.to_bytes(exclude=["vocab"]) @@ -84,43 +76,27 @@ def test_serialize_parser_strings(Parser): vocab1 = Vocab() label = "FunnyLabel" assert label not in vocab1.strings - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser1 = Parser(vocab1, model, **config) + parser1 = Parser(vocab1, model) parser1.add_label(label) assert label in parser1.vocab.strings vocab2 = Vocab() assert label not in vocab2.strings - parser2 = Parser(vocab2, model, **config) + parser2 = Parser(vocab2, model) parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"])) assert label in parser2.vocab.strings @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, Parser): - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) with make_tempdir() as d: file_path = d / "parser" parser.to_disk(file_path) - parser_d = Parser(en_vocab, model, **config) + parser_d = Parser(en_vocab, model) parser_d = parser_d.from_disk(file_path) parser_bytes = parser.to_bytes(exclude=["model", "vocab"]) parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"]) @@ -198,17 +174,12 @@ def test_serialize_textcat_empty(en_vocab): def test_serialize_pipe_exclude(en_vocab, Parser): cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - } def get_new_parser(): - new_parser = Parser(en_vocab, model, **config) + new_parser = Parser(en_vocab, model) return new_parser - parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) parser.cfg["foo"] = "bar" new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"])) assert "foo" in new_parser.cfg @@ -297,3 +268,21 @@ def test_serialize_custom_trainable_pipe(): pipe.to_disk(d) new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d) assert new_pipe.to_bytes() == pipe_bytes + + +def test_load_without_strings(): + nlp = spacy.blank("en") + orig_strings_length = len(nlp.vocab.strings) + word = "unlikely_word_" * 20 + nlp.vocab.strings.add(word) + assert len(nlp.vocab.strings) == orig_strings_length + 1 + with make_tempdir() as d: + nlp.to_disk(d) + # reload with strings + reloaded_nlp = load(d) + assert len(nlp.vocab.strings) == len(reloaded_nlp.vocab.strings) + assert word in reloaded_nlp.vocab.strings + # reload without strings + reloaded_nlp = load(d, exclude=["strings"]) + assert orig_strings_length == len(reloaded_nlp.vocab.strings) + assert word not in reloaded_nlp.vocab.strings diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index ae612114a..a9450cd04 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -26,10 +26,14 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): assert tokenizer.rules != {} assert tokenizer.token_match is not None assert tokenizer.url_match is not None + assert tokenizer.prefix_search is not None + assert 
tokenizer.infix_finditer is not None tokenizer.from_bytes(tokenizer_bytes) assert tokenizer.rules == {} assert tokenizer.token_match is None assert tokenizer.url_match is None + assert tokenizer.prefix_search is None + assert tokenizer.infix_finditer is None tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}) tokenizer.rules = {} diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 45a546203..3fe9363bf 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -49,9 +49,9 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab1_d = Vocab().from_disk(file_path1) vocab2_d = Vocab().from_disk(file_path2) # check strings rather than lexemes, which are only reloaded on demand - assert strings1 == [s for s in vocab1_d.strings] - assert strings2 == [s for s in vocab2_d.strings] - if strings1 == strings2: + assert set(strings1) == set([s for s in vocab1_d.strings]) + assert set(strings2) == set([s for s in vocab2_d.strings]) + if set(strings1) == set(strings2): assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings] else: assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings] @@ -96,7 +96,7 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2): sstore2 = StringStore(strings=strings2) sstore1_b = sstore1.to_bytes() sstore2_b = sstore2.to_bytes() - if strings1 == strings2: + if set(strings1) == set(strings2): assert sstore1_b == sstore2_b else: assert sstore1_b != sstore2_b @@ -104,7 +104,7 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2): assert sstore1.to_bytes() == sstore1_b new_sstore1 = StringStore().from_bytes(sstore1_b) assert new_sstore1.to_bytes() == sstore1_b - assert list(new_sstore1) == strings1 + assert set(new_sstore1) == set(strings1) @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -118,12 +118,12 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): sstore2.to_disk(file_path2) sstore1_d = StringStore().from_disk(file_path1) sstore2_d = StringStore().from_disk(file_path2) - assert list(sstore1_d) == list(sstore1) - assert list(sstore2_d) == list(sstore2) - if strings1 == strings2: - assert list(sstore1_d) == list(sstore2_d) + assert set(sstore1_d) == set(sstore1) + assert set(sstore2_d) == set(sstore2) + if set(strings1) == set(strings2): + assert set(sstore1_d) == set(sstore2_d) else: - assert list(sstore1_d) != list(sstore2_d) + assert set(sstore1_d) != set(sstore2_d) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index 31b2a2d2f..26eabd4e5 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -4,12 +4,11 @@ from thinc.api import Linear from catalogue import RegistryError -@registry.architectures.register("my_test_function") -def create_model(nr_in, nr_out): - return Linear(nr_in, nr_out) - - def test_get_architecture(): + @registry.architectures("my_test_function") + def create_model(nr_in, nr_out): + return Linear(nr_in, nr_out) + arch = registry.architectures.get("my_test_function") assert arch is create_model with pytest.raises(RegistryError): diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index bfbee677a..72bbe04e5 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -4,17 +4,24 @@ from spacy.training import docs_to_json, 
offsets_to_biluo_tags from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.lang.nl import Dutch -from spacy.util import ENV_VARS +from spacy.util import ENV_VARS, load_model_from_config from spacy.cli import info from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables +from spacy.cli._util import is_subpath_of from spacy.cli._util import string_to_list +from spacy import about +from spacy.util import get_minor_version +from spacy.cli.validate import get_model_pkgs +from spacy.cli.download import get_compatibility, get_version +from spacy.cli.package import get_third_party_dependencies from thinc.api import ConfigValidationError, Config import srsly import os from .util import make_tempdir +from ..cli.init_pipeline import _init_labels def test_cli_info(): @@ -307,8 +314,12 @@ def test_project_config_validation2(config, n_errors): assert len(errors) == n_errors -def test_project_config_interpolation(): - variables = {"a": 10, "b": {"c": "foo", "d": True}} +@pytest.mark.parametrize( + "int_value", + [10, pytest.param("10", marks=pytest.mark.xfail)], +) +def test_project_config_interpolation(int_value): + variables = {"a": int_value, "b": {"c": "foo", "d": True}} commands = [ {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]}, {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]}, @@ -317,6 +328,8 @@ def test_project_config_interpolation(): with make_tempdir() as d: srsly.write_yaml(d / "project.yml", project) cfg = load_project_config(d) + assert type(cfg) == dict + assert type(cfg["commands"]) == list assert cfg["commands"][0]["script"][0] == "hello 10 foo" assert cfg["commands"][1]["script"][0] == "foo true" commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}] @@ -325,6 +338,42 @@ def test_project_config_interpolation(): substitute_project_variables(project) +@pytest.mark.parametrize( + "greeting", + [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)], +) +def test_project_config_interpolation_override(greeting): + variables = {"a": "world"} + commands = [ + {"name": "x", "script": ["hello ${vars.a}"]}, + ] + overrides = {"vars.a": greeting} + project = {"commands": commands, "vars": variables} + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d, overrides=overrides) + assert type(cfg) == dict + assert type(cfg["commands"]) == list + assert cfg["commands"][0]["script"][0] == f"hello {greeting}" + + +def test_project_config_interpolation_env(): + variables = {"a": 10} + env_var = "SPACY_TEST_FOO" + env_vars = {"foo": env_var} + commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}] + project = {"commands": commands, "vars": variables, "env": env_vars} + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 " + os.environ[env_var] = "123" + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 123" + + @pytest.mark.parametrize( "args,expected", [ @@ -380,10 +429,20 @@ def test_parse_cli_overrides(): "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]] ) 
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"]) -def test_init_config(lang, pipeline, optimize): +@pytest.mark.parametrize("pretraining", [True, False]) +def test_init_config(lang, pipeline, optimize, pretraining): # TODO: add more tests and also check for GPU with transformers - config = init_config(lang=lang, pipeline=pipeline, optimize=optimize, gpu=False) + config = init_config( + lang=lang, + pipeline=pipeline, + optimize=optimize, + pretraining=pretraining, + gpu=False, + ) assert isinstance(config, Config) + if pretraining: + config["paths"]["raw_text"] = "my_data.jsonl" + load_model_from_config(config, auto_fill=True) def test_model_recommendations(): @@ -430,3 +489,88 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] + + +def test_download_compatibility(): + model_name = "en_core_web_sm" + compatibility = get_compatibility() + version = get_version(model_name, compatibility) + assert get_minor_version(about.__version__) == get_minor_version(version) + + +def test_validate_compatibility_table(): + model_pkgs, compat = get_model_pkgs() + spacy_version = get_minor_version(about.__version__) + current_compat = compat.get(spacy_version, {}) + assert len(current_compat) > 0 + assert "en_core_web_sm" in current_compat + + +@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"]) +def test_init_labels(component_name): + nlp = Dutch() + component = nlp.add_pipe(component_name) + for label in ["T1", "T2", "T3", "T4"]: + component.add_label(label) + assert len(nlp.get_pipe(component_name).labels) == 4 + + with make_tempdir() as tmp_dir: + _init_labels(nlp, tmp_dir) + + config = init_config( + lang="nl", + pipeline=[component_name], + optimize="efficiency", + gpu=False, + ) + config["initialize"]["components"][component_name] = { + "labels": { + "@readers": "spacy.read_labels.v1", + "path": f"{tmp_dir}/{component_name}.json", + } + } + + nlp2 = load_model_from_config(config, auto_fill=True) + assert len(nlp2.get_pipe(component_name).labels) == 0 + nlp2.initialize() + assert len(nlp2.get_pipe(component_name).labels) == 4 + + +def test_get_third_party_dependencies(): + # We can't easily test the detection of third-party packages here, but we + # can at least make sure that the function and its importlib magic runs. 
+ nlp = Dutch() + # Test with component factory based on Cython module + nlp.add_pipe("tagger") + assert get_third_party_dependencies(nlp.config) == [] + + # Test with legacy function + nlp = Dutch() + nlp.add_pipe( + "textcat", + config={ + "model": { + # Do not update from legacy architecture spacy.TextCatBOW.v1 + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 1, + "no_output_layer": False, + } + }, + ) + assert get_third_party_dependencies(nlp.config) == [] + + +@pytest.mark.parametrize( + "parent,child,expected", + [ + ("/tmp", "/tmp", True), + ("/tmp", "/", False), + ("/tmp", "/tmp/subdir", True), + ("/tmp", "/tmpdir", False), + ("/tmp", "/tmp/subdir/..", True), + ("/tmp", "/tmp/..", False), + ], +) +def test_is_subpath_of(parent, child, expected): + assert is_subpath_of(parent, child) == expected diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index d6efce32f..8dbb6fd75 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,4 +1,6 @@ import itertools +import logging +from unittest import mock import pytest from spacy.language import Language from spacy.tokens import Doc, Span @@ -6,12 +8,46 @@ from spacy.vocab import Vocab from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German -from spacy.util import registry +from spacy.util import registry, ignore_error, raise_error import spacy +from thinc.api import CupyOps, NumpyOps, get_current_ops from .util import add_vecs_to_vocab, assert_docs_equal +try: + import torch + + # Ensure that we don't deadlock in multiprocessing tests. + torch.set_num_threads(1) + torch.set_num_interop_threads(1) +except ImportError: + pass + + +def evil_component(doc): + if "2" in doc.text: + raise ValueError("no dice") + return doc + + +def perhaps_set_sentences(doc): + if not doc.text.startswith("4"): + doc[-1].is_sent_start = True + return doc + + +def assert_sents_error(doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("no sents") + return doc + + +def warn_error(proc_name, proc, docs, e): + logger = logging.getLogger("spacy") + logger.warning(f"Trouble with component {proc_name}.") + + @pytest.fixture def nlp(): nlp = Language(Vocab()) @@ -90,19 +126,16 @@ def test_evaluate_no_pipe(nlp): nlp.evaluate([Example.from_dict(doc, annots)]) -@Language.component("test_language_vector_modification_pipe") def vector_modification_pipe(doc): doc.vector += 1 return doc -@Language.component("test_language_userdata_pipe") def userdata_pipe(doc): doc.user_data["foo"] = "bar" return doc -@Language.component("test_language_ner_pipe") def ner_pipe(doc): span = Span(doc, 0, 1, label="FIRST") doc.ents += (span,) @@ -120,6 +153,11 @@ def sample_vectors(): @pytest.fixture def nlp2(nlp, sample_vectors): + Language.component( + "test_language_vector_modification_pipe", func=vector_modification_pipe + ) + Language.component("test_language_userdata_pipe", func=userdata_pipe) + Language.component("test_language_ner_pipe", func=ner_pipe) add_vecs_to_vocab(nlp.vocab, sample_vectors) nlp.add_pipe("test_language_vector_modification_pipe") nlp.add_pipe("test_language_ner_pipe") @@ -140,25 +178,140 @@ def texts(): @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe(nlp2, n_process, texts): - texts = texts * 10 - expecteds = [nlp2(text) for text in texts] - docs = nlp2.pipe(texts, n_process=n_process, batch_size=2) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + texts = texts * 10 + expecteds = 
[nlp2(text) for text in texts] + docs = nlp2.pipe(texts, n_process=n_process, batch_size=2) - for doc, expected_doc in zip(docs, expecteds): - assert_docs_equal(doc, expected_doc) + for doc, expected_doc in zip(docs, expecteds): + assert_docs_equal(doc, expected_doc) @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe_stream(nlp2, n_process, texts): - # check if nlp.pipe can handle infinite length iterator properly. - stream_texts = itertools.cycle(texts) - texts0, texts1 = itertools.tee(stream_texts) - expecteds = (nlp2(text) for text in texts0) - docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + # check if nlp.pipe can handle infinite length iterator properly. + stream_texts = itertools.cycle(texts) + texts0, texts1 = itertools.tee(stream_texts) + expecteds = (nlp2(text) for text in texts0) + docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2) - n_fetch = 20 - for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch): - assert_docs_equal(doc, expected_doc) + n_fetch = 20 + for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch): + assert_docs_equal(doc, expected_doc) + + +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler(n_process): + """Test that the error handling of nlp.pipe works well""" + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("merge_subtokens") + nlp.initialize() + texts = ["Curious to see what will happen to this text.", "And this one."] + # the pipeline fails because there's no parser + with pytest.raises(ValueError): + nlp(texts[0]) + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.set_error_handler(raise_error) + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + # set explicitly to ignoring + nlp.set_error_handler(ignore_error) + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 + nlp(texts[0]) + + +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_custom(en_vocab, n_process): + """Test the error handling of a custom component that has no pipe method""" + Language.component("my_evil_component", func=evil_component) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("my_evil_component") + texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"] + with pytest.raises(ValueError): + # the evil custom component throws an error + list(nlp.pipe(texts)) + + nlp.set_error_handler(warn_error) + logger = logging.getLogger("spacy") + with mock.patch.object(logger, "warning") as mock_warning: + # the errors by the evil custom component raise a warning for each + # bad doc + docs = list(nlp.pipe(texts, n_process=n_process)) + # HACK/TODO? 
the warnings in child processes don't seem to be + # detected by the mock logger + if n_process == 1: + mock_warning.assert_called() + assert mock_warning.call_count == 2 + assert len(docs) + mock_warning.call_count == len(texts) + assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] + + +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_pipe(en_vocab, n_process): + """Test the error handling of a component's pipe method""" + Language.component("my_perhaps_sentences", func=perhaps_set_sentences) + Language.component("assert_sents_error", func=assert_sents_error) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + texts = [f"{str(i)} is enough. Done" for i in range(100)] + nlp = English() + nlp.add_pipe("my_perhaps_sentences") + nlp.add_pipe("assert_sents_error") + nlp.initialize() + with pytest.raises(ValueError): + # assert_sents_error requires sentence boundaries, will throw an error otherwise + docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10)) + nlp.set_error_handler(ignore_error) + docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10)) + # we lose/ignore the failing 4,40-49 docs + assert len(docs) == 89 + + +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_make_doc_actual(n_process): + """Test the error handling for make_doc""" + # TODO: fix so that the following test is the actual behavior + + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.max_length = 10 + texts = ["12345678901234567890", "12345"] * 10 + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.default_error_handler = ignore_error + if n_process == 1: + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + else: + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 + + +@pytest.mark.xfail +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_make_doc_preferred(n_process): + """Test the error handling for make_doc""" + + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.max_length = 10 + texts = ["12345678901234567890", "12345"] * 10 + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.default_error_handler = ignore_error + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 def test_language_from_config_before_after_init(): @@ -278,6 +431,37 @@ def test_language_from_config_before_after_init_invalid(): English.from_config(config) +def test_language_whitespace_tokenizer(): + """Test the custom whitespace tokenizer from the docs.""" + + class WhitespaceTokenizer: + def __init__(self, vocab): + self.vocab = vocab + + def __call__(self, text): + words = text.split(" ") + spaces = [True] * len(words) + # Avoid zero-length tokens + for i, word in enumerate(words): + if word == "": + words[i] = " " + spaces[i] = False + # Remove the final trailing space + if words[-1] == " ": + words = words[0:-1] + spaces = spaces[0:-1] + else: + spaces[-1] = False + + return Doc(self.vocab, words=words, spaces=spaces) + + nlp = spacy.blank("en") + nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) + text = " What's happened to me? he thought. It wasn't a dream. 
" + doc = nlp(text) + assert doc.text == text + + def test_language_custom_tokenizer(): """Test that a fully custom tokenizer can be plugged in via the registry.""" name = "test_language_custom_tokenizer" @@ -334,3 +518,37 @@ def test_language_init_invalid_vocab(value): with pytest.raises(ValueError) as e: Language(value) assert err_fragment in str(e.value) + + +def test_language_source_and_vectors(nlp2): + nlp = Language(Vocab()) + textcat = nlp.add_pipe("textcat") + for label in ("POSITIVE", "NEGATIVE"): + textcat.add_label(label) + nlp.initialize() + long_string = "thisisalongstring" + assert long_string not in nlp.vocab.strings + assert long_string not in nlp2.vocab.strings + nlp.vocab.strings.add(long_string) + assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes() + vectors_bytes = nlp.vocab.vectors.to_bytes() + with pytest.warns(UserWarning): + nlp2.add_pipe("textcat", name="textcat2", source=nlp) + # strings should be added + assert long_string in nlp2.vocab.strings + # vectors should remain unmodified + assert nlp.vocab.vectors.to_bytes() == vectors_bytes + + +@pytest.mark.skipif( + not isinstance(get_current_ops(), CupyOps), reason="test requires GPU" +) +def test_multiprocessing_gpu_warning(nlp2, texts): + texts = texts * 10 + docs = nlp2.pipe(texts, n_process=2, batch_size=2) + + with pytest.warns(UserWarning, match="multiprocessing with GPU models"): + with pytest.raises(ValueError): + # Trigger multi-processing. + for _ in docs: + pass diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 587365bfe..4dd56a4a5 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,6 +7,7 @@ from spacy import util from spacy import prefer_gpu, require_gpu, require_cpu from spacy.util import dot_to_object, SimpleFrozenList from thinc.api import Config, Optimizer, ConfigValidationError +from thinc.api import set_current_ops from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -15,7 +16,7 @@ from spacy.schemas import ConfigSchemaTraining from thinc.api import get_current_ops, NumpyOps, CupyOps -from .util import get_random_doc +from .util import get_random_doc, make_tempdir @pytest.fixture @@ -80,6 +81,7 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): def test_prefer_gpu(): + current_ops = get_current_ops() try: import cupy # noqa: F401 @@ -87,9 +89,11 @@ def test_prefer_gpu(): assert isinstance(get_current_ops(), CupyOps) except ImportError: assert not prefer_gpu() + set_current_ops(current_ops) def test_require_gpu(): + current_ops = get_current_ops() try: import cupy # noqa: F401 @@ -98,9 +102,11 @@ def test_require_gpu(): except ImportError: with pytest.raises(ValueError): require_gpu() + set_current_ops(current_ops) def test_require_cpu(): + current_ops = get_current_ops() require_cpu() assert isinstance(get_current_ops(), NumpyOps) try: @@ -112,6 +118,7 @@ def test_require_cpu(): pass require_cpu() assert isinstance(get_current_ops(), NumpyOps) + set_current_ops(current_ops) def test_ascii_filenames(): @@ -204,6 +211,25 @@ def test_dot_to_dict(dot_notation, expected): assert util.dict_to_dot(result) == dot_notation +def test_set_dot_to_object(): + config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}} + with pytest.raises(KeyError): + util.set_dot_to_object(config, "foo.bar.baz", 100) + with pytest.raises(KeyError): + util.set_dot_to_object(config, "hello.world", 100) + with pytest.raises(KeyError): + util.set_dot_to_object(config, 
"test.a.b.c", 100) + util.set_dot_to_object(config, "foo.bar", 100) + assert config["foo"]["bar"] == 100 + util.set_dot_to_object(config, "foo.baz.x", {"hello": "world"}) + assert config["foo"]["baz"]["x"]["hello"] == "world" + assert config["test"]["a"]["b"] == "c" + util.set_dot_to_object(config, "foo", 123) + assert config["foo"] == 123 + util.set_dot_to_object(config, "test", "hello") + assert dict(config) == {"foo": 123, "test": "hello"} + + @pytest.mark.parametrize( "doc_sizes, expected_batches", [ @@ -247,7 +273,7 @@ def test_util_minibatch(doc_sizes, expected_batches): ], ) def test_util_minibatch_oversize(doc_sizes, expected_batches): - """ Test that oversized documents are returned in their own batch""" + """Test that oversized documents are returned in their own batch""" docs = [get_random_doc(doc_size) for doc_size in doc_sizes] tol = 0.2 batch_size = 1000 @@ -269,7 +295,7 @@ def test_util_dot_section(): factory = "textcat" [components.textcat.model] - @architectures = "spacy.TextCatBOW.v1" + @architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -327,3 +353,50 @@ def test_resolve_dot_names(): errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ["training", "xyz"] + + +def test_import_code(): + code_str = """ +from spacy import Language + +class DummyComponent: + def __init__(self, vocab, name): + pass + + def initialize(self, get_examples, *, nlp, dummy_param: int): + pass + +@Language.factory( + "dummy_component", +) +def make_dummy_component( + nlp: Language, name: str +): + return DummyComponent(nlp.vocab, name) +""" + + with make_tempdir() as temp_dir: + code_path = os.path.join(temp_dir, "code.py") + with open(code_path, "w") as fileh: + fileh.write(code_str) + + import_file("python_code", code_path) + config = {"initialize": {"components": {"dummy_component": {"dummy_param": 1}}}} + nlp = English.from_config(config) + nlp.add_pipe("dummy_component") + nlp.initialize() + + +def test_to_ternary_int(): + assert to_ternary_int(True) == 1 + assert to_ternary_int(None) == 0 + assert to_ternary_int(False) == -1 + assert to_ternary_int(1) == 1 + assert to_ternary_int(1.0) == 1 + assert to_ternary_int(0) == 0 + assert to_ternary_int(0.0) == 0 + assert to_ternary_int(-1) == -1 + assert to_ternary_int(5) == -1 + assert to_ternary_int(-10) == -1 + assert to_ternary_int("string") == -1 + assert to_ternary_int([0, "string"]) == -1 diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 200d7dcfd..2306cabb7 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -1,11 +1,14 @@ from typing import List import pytest from thinc.api import fix_random_seed, Adam, set_dropout_rate -from numpy.testing import assert_array_equal +from thinc.api import Ragged, reduce_mean, Logistic, chain, Relu +from numpy.testing import assert_array_equal, assert_array_almost_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.models import build_spancat_model from spacy.ml.staticvectors import StaticVectors +from spacy.ml.extract_spans import extract_spans, _get_span_indices from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES @@ -109,7 +112,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs): model2.initialize() params1 = get_all_params(model1) params2 = get_all_params(model2) - 
assert_array_equal(params1, params2) + assert_array_equal(model1.ops.to_numpy(params1), model2.ops.to_numpy(params2)) @pytest.mark.parametrize( @@ -134,14 +137,25 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): for i in range(len(tok2vec1)): for j in range(len(tok2vec1[i])): assert_array_equal( - numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j]) + numpy.asarray(model1.ops.to_numpy(tok2vec1[i][j])), + numpy.asarray(model2.ops.to_numpy(tok2vec2[i][j])), ) + try: + Y1 = model1.ops.to_numpy(Y1) + Y2 = model2.ops.to_numpy(Y2) + except Exception: + pass if isinstance(Y1, numpy.ndarray): assert_array_equal(Y1, Y2) elif isinstance(Y1, List): assert len(Y1) == len(Y2) for y1, y2 in zip(Y1, Y2): + try: + y1 = model1.ops.to_numpy(y1) + y2 = model2.ops.to_numpy(y2) + except Exception: + pass assert_array_equal(y1, y2) else: raise ValueError(f"Could not compare type {type(Y1)}") @@ -169,12 +183,18 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): model.finish_update(optimizer) updated_params = get_all_params(model) with pytest.raises(AssertionError): - assert_array_equal(initial_params, updated_params) + assert_array_equal( + model.ops.to_numpy(initial_params), model.ops.to_numpy(updated_params) + ) return model model1 = get_updated_model() model2 = get_updated_model() - assert_array_equal(get_all_params(model1), get_all_params(model2)) + assert_array_almost_equal( + model1.ops.to_numpy(get_all_params(model1)), + model2.ops.to_numpy(get_all_params(model2)), + decimal=5, + ) @pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})]) @@ -189,3 +209,63 @@ def test_empty_docs(model_func, kwargs): # Test backprop output, backprop = model.begin_update(docs) backprop(output) + + +def test_init_extract_spans(): + extract_spans().initialize() + + +def test_extract_spans_span_indices(): + model = extract_spans().initialize() + spans = Ragged( + model.ops.asarray([[0, 3], [2, 3], [5, 7]], dtype="i"), + model.ops.asarray([2, 1], dtype="i"), + ) + x_lengths = model.ops.asarray([5, 10], dtype="i") + indices = _get_span_indices(model.ops, spans, x_lengths) + assert list(indices) == [0, 1, 2, 2, 10, 11] + + +def test_extract_spans_forward_backward(): + model = extract_spans().initialize() + X = Ragged(model.ops.alloc2f(15, 4), model.ops.asarray([5, 10], dtype="i")) + spans = Ragged( + model.ops.asarray([[0, 3], [2, 3], [5, 7]], dtype="i"), + model.ops.asarray([2, 1], dtype="i"), + ) + Y, backprop = model.begin_update((X, spans)) + assert list(Y.lengths) == [3, 1, 2] + assert Y.dataXd.shape == (6, 4) + dX, spans2 = backprop(Y) + assert spans2 is spans + assert dX.dataXd.shape == X.dataXd.shape + assert list(dX.lengths) == list(X.lengths) + + +def test_spancat_model_init(): + model = build_spancat_model( + build_Tok2Vec_model(**get_tok2vec_kwargs()), reduce_mean(), Logistic() + ) + model.initialize() + + +def test_spancat_model_forward_backward(nO=5): + tok2vec = build_Tok2Vec_model(**get_tok2vec_kwargs()) + docs = get_docs() + spans_list = [] + lengths = [] + for doc in docs: + spans_list.append(doc[:2]) + spans_list.append(doc[1:4]) + lengths.append(2) + spans = Ragged( + tok2vec.ops.asarray([[s.start, s.end] for s in spans_list], dtype="i"), + tok2vec.ops.asarray(lengths, dtype="i"), + ) + model = build_spancat_model( + tok2vec, reduce_mean(), chain(Relu(nO=nO), Logistic()) + ).initialize(X=(docs, spans)) + + Y, backprop = model((docs, spans), is_train=True) + assert Y.shape == (spans.dataXd.shape[0], nO) + backprop(Y) 
diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index e4c67b672..0c56ae0d2 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,7 +1,9 @@ import pytest import numpy import srsly +from spacy.lang.en import English from spacy.strings import StringStore +from spacy.tokens import Doc from spacy.vocab import Vocab from spacy.attrs import NORM @@ -20,7 +22,10 @@ def test_pickle_string_store(text1, text2): @pytest.mark.parametrize("text1,text2", [("dog", "cat")]) def test_pickle_vocab(text1, text2): - vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) + vocab = Vocab( + lex_attr_getters={int(NORM): lambda string: string[:-1]}, + get_noun_chunks=English.Defaults.syntax_iterators.get("noun_chunks"), + ) vocab.set_vector("dog", numpy.ones((5,), dtype="f")) lex1 = vocab[text1] lex2 = vocab[text2] @@ -34,4 +39,23 @@ def test_pickle_vocab(text1, text2): assert unpickled[text2].norm == lex2.norm assert unpickled[text1].norm != unpickled[text2].norm assert unpickled.vectors is not None + assert unpickled.get_noun_chunks is not None assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0] + + +def test_pickle_doc(en_vocab): + words = ["a", "b", "c"] + deps = ["dep"] * len(words) + heads = [0] * len(words) + doc = Doc( + en_vocab, + words=words, + deps=deps, + heads=heads, + ) + data = srsly.pickle_dumps(doc) + unpickled = srsly.pickle_loads(data) + assert [t.text for t in unpickled] == words + assert [t.dep_ for t in unpickled] == deps + assert [t.head.i for t in unpickled] == heads + assert list(doc.noun_chunks) == [] diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 4dddca404..16cc97f6d 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -3,10 +3,10 @@ import pytest from pytest import approx from spacy.training import Example from spacy.training.iob_utils import offsets_to_biluo_tags -from spacy.scorer import Scorer, ROCAUCScore +from spacy.scorer import Scorer, ROCAUCScore, PRFScore from spacy.scorer import _roc_auc_score, _roc_curve from spacy.lang.en import English -from spacy.tokens import Doc +from spacy.tokens import Doc, Span test_las_apple = [ @@ -403,3 +403,72 @@ def test_roc_auc_score(): score.score_set(0.75, 1) with pytest.raises(ValueError): _ = score.score # noqa: F841 + + +def test_score_spans(): + nlp = English() + text = "This is just a random sentence." 
+ key = "my_spans" + gold = nlp.make_doc(text) + pred = nlp.make_doc(text) + spans = [] + spans.append(gold.char_span(0, 4, label="PERSON")) + spans.append(gold.char_span(0, 7, label="ORG")) + spans.append(gold.char_span(8, 12, label="ORG")) + gold.spans[key] = spans + + def span_getter(doc, span_key): + return doc.spans[span_key] + + # Predict exactly the same, but overlapping spans will be discarded + pred.spans[key] = spans + eg = Example(pred, gold) + scores = Scorer.score_spans([eg], attr=key, getter=span_getter) + assert scores[f"{key}_p"] == 1.0 + assert scores[f"{key}_r"] < 1.0 + + # Allow overlapping, now both precision and recall should be 100% + pred.spans[key] = spans + eg = Example(pred, gold) + scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True) + assert scores[f"{key}_p"] == 1.0 + assert scores[f"{key}_r"] == 1.0 + + # Change the predicted labels + new_spans = [Span(pred, span.start, span.end, label="WRONG") for span in spans] + pred.spans[key] = new_spans + eg = Example(pred, gold) + scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True) + assert scores[f"{key}_p"] == 0.0 + assert scores[f"{key}_r"] == 0.0 + assert f"{key}_per_type" in scores + + # Discard labels from the evaluation + scores = Scorer.score_spans( + [eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False + ) + assert scores[f"{key}_p"] == 1.0 + assert scores[f"{key}_r"] == 1.0 + assert f"{key}_per_type" not in scores + + +def test_prf_score(): + cand = {"hi", "ho"} + gold1 = {"yo", "hi"} + gold2 = set() + + a = PRFScore() + a.score_set(cand=cand, gold=gold1) + assert (a.precision, a.recall, a.fscore) == approx((0.5, 0.5, 0.5)) + + b = PRFScore() + b.score_set(cand=cand, gold=gold2) + assert (b.precision, b.recall, b.fscore) == approx((0.0, 0.0, 0.0)) + + c = a + b + assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333)) + + a += b + assert (a.precision, a.recall, a.fscore) == approx( + (c.precision, c.recall, c.fscore) + ) diff --git a/spacy/tests/test_ty.py b/spacy/tests/test_ty.py new file mode 100644 index 000000000..2037520df --- /dev/null +++ b/spacy/tests/test_ty.py @@ -0,0 +1,18 @@ +import spacy +from spacy import ty + + +def test_component_types(): + nlp = spacy.blank("en") + tok2vec = nlp.create_pipe("tok2vec") + tagger = nlp.create_pipe("tagger") + entity_ruler = nlp.create_pipe("entity_ruler") + assert isinstance(tok2vec, ty.TrainableComponent) + assert isinstance(tagger, ty.TrainableComponent) + assert not isinstance(entity_ruler, ty.TrainableComponent) + assert isinstance(tok2vec, ty.InitializableComponent) + assert isinstance(tagger, ty.InitializableComponent) + assert isinstance(entity_ruler, ty.InitializableComponent) + assert isinstance(tok2vec, ty.ListenedToComponent) + assert not isinstance(tagger, ty.ListenedToComponent) + assert not isinstance(entity_ruler, ty.ListenedToComponent) diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index ea6cf91be..0a10ae67d 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -1,5 +1,7 @@ import pytest +import re from spacy.util import get_lang_class +from spacy.tokenizer import Tokenizer # Only include languages with no external dependencies # "is" seems to confuse importlib, so we're also excluding it for now @@ -60,3 +62,18 @@ def test_tokenizer_explain(lang): tokens = [t.text for t in tokenizer(sentence) if not t.is_space] debug_tokens = [t[1] for t in 
tokenizer.explain(sentence)] assert tokens == debug_tokens + + +def test_tokenizer_explain_special_matcher(en_vocab): + suffix_re = re.compile(r"[\.]$") + infix_re = re.compile(r"[/]") + rules = {"a.": [{"ORTH": "a."}]} + tokenizer = Tokenizer( + en_vocab, + rules=rules, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + ) + tokens = [t.text for t in tokenizer("a/a.")] + explain_tokens = [t[1] for t in tokenizer.explain("a/a.")] + assert tokens == explain_tokens diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 4f5eddb95..7d0c16745 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,4 +1,5 @@ import pytest +import re from spacy.vocab import Vocab from spacy.tokenizer import Tokenizer from spacy.util import ensure_path @@ -83,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize("file_name", ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): loc = ensure_path(__file__).parent / file_name - text = loc.open("r", encoding="utf8").read() + with loc.open("r", encoding="utf8") as infile: + text = infile.read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 @@ -186,3 +188,27 @@ def test_tokenizer_special_cases_spaces(tokenizer): assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"] tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}]) assert [t.text for t in tokenizer("a b c")] == ["a b c"] + + +def test_tokenizer_flush_cache(en_vocab): + suffix_re = re.compile(r"[\.]$") + tokenizer = Tokenizer( + en_vocab, + suffix_search=suffix_re.search, + ) + assert [t.text for t in tokenizer("a.")] == ["a", "."] + tokenizer.suffix_search = None + assert [t.text for t in tokenizer("a.")] == ["a."] + + +def test_tokenizer_flush_specials(en_vocab): + suffix_re = re.compile(r"[\.]$") + rules = {"a a": [{"ORTH": "a a"}]} + tokenizer1 = Tokenizer( + en_vocab, + suffix_search=suffix_re.search, + rules=rules, + ) + assert [t.text for t in tokenizer1("a a.")] == ["a a", "."] + tokenizer1.rules = {} + assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."] diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py index 0bd4d5ef2..43a78e4b0 100644 --- a/spacy/tests/training/test_augmenters.py +++ b/spacy/tests/training/test_augmenters.py @@ -38,19 +38,59 @@ def doc(nlp): @pytest.mark.filterwarnings("ignore::UserWarning") -def test_make_orth_variants(nlp, doc): +def test_make_orth_variants(nlp): single = [ {"tags": ["NFP"], "variants": ["…", "..."]}, {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, ] + # fmt: off + words = ["\n\n", "A", "\t", "B", "a", "b", "…", "...", "-", "—", "–", "--", "---", "——"] + tags = ["_SP", "NN", "\t", "NN", "NN", "NN", "NFP", "NFP", ":", ":", ":", ":", ":", ":"] + # fmt: on + spaces = [True] * len(words) + spaces[0] = False + spaces[2] = False + doc = Doc(nlp.vocab, words=words, spaces=spaces, tags=tags) augmenter = create_orth_variants_augmenter( level=0.2, lower=0.5, orth_variants={"single": single} ) - with make_docbin([doc]) as output_file: + with make_docbin([doc] * 10) as output_file: reader = Corpus(output_file, augmenter=augmenter) - # Due to randomness, only test that it works without errors for now + # Due to randomness, only test that it works without errors list(reader(nlp)) + # check that the following settings lowercase everything + augmenter = 
create_orth_variants_augmenter( + level=1.0, lower=1.0, orth_variants={"single": single} + ) + with make_docbin([doc] * 10) as output_file: + reader = Corpus(output_file, augmenter=augmenter) + for example in reader(nlp): + for token in example.reference: + assert token.text == token.text.lower() + + # check that lowercasing is applied without tags + doc = Doc(nlp.vocab, words=words, spaces=[True] * len(words)) + augmenter = create_orth_variants_augmenter( + level=1.0, lower=1.0, orth_variants={"single": single} + ) + with make_docbin([doc] * 10) as output_file: + reader = Corpus(output_file, augmenter=augmenter) + for example in reader(nlp): + for ex_token, doc_token in zip(example.reference, doc): + assert ex_token.text == doc_token.text.lower() + + # check that no lowercasing is applied with lower=0.0 + doc = Doc(nlp.vocab, words=words, spaces=[True] * len(words)) + augmenter = create_orth_variants_augmenter( + level=1.0, lower=0.0, orth_variants={"single": single} + ) + with make_docbin([doc] * 10) as output_file: + reader = Corpus(output_file, augmenter=augmenter) + for example in reader(nlp): + for ex_token, doc_token in zip(example.reference, doc): + assert ex_token.text == doc_token.text + def test_lowercase_augmenter(nlp, doc): augmenter = create_lower_casing_augmenter(level=1.0) @@ -66,6 +106,21 @@ def test_lowercase_augmenter(nlp, doc): assert ref_ent.text == orig_ent.text.lower() assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc] + # check that augmentation works when lowercasing leads to different + # predicted tokenization + words = ["A", "B", "CCC."] + doc = Doc(nlp.vocab, words=words) + with make_docbin([doc]) as output_file: + reader = Corpus(output_file, augmenter=augmenter) + corpus = list(reader(nlp)) + eg = corpus[0] + assert eg.reference.text == doc.text.lower() + assert eg.predicted.text == doc.text.lower() + assert [t.text for t in eg.reference] == [t.lower() for t in words] + assert [t.text for t in eg.predicted] == [ + t.text for t in nlp.make_doc(doc.text.lower()) + ] + @pytest.mark.filterwarnings("ignore::UserWarning") def test_custom_data_augmentation(nlp, doc): diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index be3419b82..4dd90f416 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -2,6 +2,7 @@ import pytest from spacy.training.example import Example from spacy.tokens import Doc from spacy.vocab import Vocab +from spacy.util import to_ternary_int def test_Example_init_requires_doc_objects(): @@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots): [ { "words": ["This", "is", "one", "sentence", "this", "is", "another"], - "sent_starts": [1, 0, 0, 0, 1, 0, 0], + "sent_starts": [1, False, 0, None, True, -1, -5.7], } ], ) @@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots): example = Example.from_dict(predicted, annots) assert len(list(example.reference.sents)) == 2 for i, token in enumerate(example.reference): - assert bool(token.is_sent_start) == bool(annots["sent_starts"][i]) + if to_ternary_int(annots["sent_starts"][i]) == 1: + assert token.is_sent_start is True + elif to_ternary_int(annots["sent_starts"][i]) == 0: + assert token.is_sent_start is None + else: + assert token.is_sent_start is False @pytest.mark.parametrize( @@ -176,6 +182,27 @@ def test_Example_from_dict_with_entities(annots): assert example.reference[5].ent_type_ == "LOC" +def test_Example_from_dict_with_empty_entities(): + annots = { + "words": 
["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [], + } + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + # entities as empty list sets everything to O + assert example.reference.has_annotation("ENT_IOB") + assert len(list(example.reference.ents)) == 0 + assert all(token.ent_iob_ == "O" for token in example.reference) + # various unset/missing entities leaves entities unset + annots["entities"] = None + example = Example.from_dict(predicted, annots) + assert not example.reference.has_annotation("ENT_IOB") + annots.pop("entities", None) + example = Example.from_dict(predicted, annots) + assert not example.reference.has_annotation("ENT_IOB") + + @pytest.mark.parametrize( "annots", [ @@ -196,6 +223,104 @@ def test_Example_from_dict_with_entities_invalid(annots): assert len(list(example.reference.ents)) == 0 +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [ + (7, 15, "LOC"), + (11, 15, "LOC"), + (20, 26, "LOC"), + ], # overlapping + } + ], +) +def test_Example_from_dict_with_entities_overlapping(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": { + "cities": [(7, 15, "LOC"), (20, 26, "LOC")], + "people": [(0, 1, "PERSON")], + }, + } + ], +) +def test_Example_from_dict_with_spans(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.ents)) == 0 + assert len(list(example.reference.spans["cities"])) == 2 + assert len(list(example.reference.spans["people"])) == 1 + for span in example.reference.spans["cities"]: + assert span.label_ == "LOC" + for span in example.reference.spans["people"]: + assert span.label_ == "PERSON" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": { + "cities": [(7, 15, "LOC"), (11, 15, "LOC"), (20, 26, "LOC")], + "people": [(0, 1, "PERSON")], + }, + } + ], +) +def test_Example_from_dict_with_spans_overlapping(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.ents)) == 0 + assert len(list(example.reference.spans["cities"])) == 3 + assert len(list(example.reference.spans["people"])) == 1 + for span in example.reference.spans["cities"]: + assert span.label_ == "LOC" + for span in example.reference.spans["people"]: + assert span.label_ == "PERSON" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": [(0, 1, "PERSON")], + }, + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": {"cities": (7, 15, "LOC")}, + }, + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": {"cities": [7, 11]}, + }, + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": {"cities": [[7]]}, + }, + ], +) +def test_Example_from_dict_with_spans_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) + + @pytest.mark.parametrize( "annots", [ diff --git a/spacy/tests/training/test_pretraining.py 
b/spacy/tests/training/test_pretraining.py new file mode 100644 index 000000000..8ee54b544 --- /dev/null +++ b/spacy/tests/training/test_pretraining.py @@ -0,0 +1,348 @@ +from pathlib import Path +import numpy as np +import pytest +import srsly +from spacy.vocab import Vocab +from thinc.api import Config + +from ..util import make_tempdir +from ... import util +from ...lang.en import English +from ...training.initialize import init_nlp +from ...training.loop import train +from ...training.pretrain import pretrain +from ...tokens import Doc, DocBin +from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH + +pretrain_string_listener = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.width} + +[pretraining] +max_epochs = 5 + +[training] +max_epochs = 5 +""" + +pretrain_string_internal = """ +[nlp] +lang = "en" +pipeline = ["tagger"] + +[components] + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" + +[components.tagger.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[pretraining] +max_epochs = 5 + +[training] +max_epochs = 5 +""" + + +pretrain_string_vectors = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.width} + +[pretraining] +max_epochs = 5 + +[pretraining.objective] +@architectures = spacy.PretrainVectors.v1 +maxout_pieces = 3 +hidden_size = 300 +loss = cosine + +[training] +max_epochs = 5 +""" + +CHAR_OBJECTIVES = [ + {}, + {"@architectures": "spacy.PretrainCharacters.v1"}, + { + "@architectures": "spacy.PretrainCharacters.v1", + "maxout_pieces": 5, + "hidden_size": 42, + "n_characters": 2, + }, +] + +VECTOR_OBJECTIVES = [ + { + "@architectures": "spacy.PretrainVectors.v1", + "maxout_pieces": 3, + "hidden_size": 300, + "loss": "cosine", + }, + { + "@architectures": "spacy.PretrainVectors.v1", + "maxout_pieces": 2, + "hidden_size": 200, + "loss": "L2", + }, +] + + +def test_pretraining_default(): + """Test that pretraining defaults to a character objective""" + config = Config().from_str(pretrain_string_internal) + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + assert "PretrainCharacters" in filled["pretraining"]["objective"]["@architectures"] + + +@pytest.mark.parametrize("objective", CHAR_OBJECTIVES) +def 
test_pretraining_tok2vec_characters(objective): + """Test that pretraining works with the character objective""" + config = Config().from_str(pretrain_string_listener) + config["pretraining"]["objective"] = objective + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + with make_tempdir() as tmp_dir: + file_path = write_sample_jsonl(tmp_dir) + filled["paths"]["raw_text"] = file_path + filled = filled.interpolate() + assert filled["pretraining"]["component"] == "tok2vec" + pretrain(filled, tmp_dir) + assert Path(tmp_dir / "model0.bin").exists() + assert Path(tmp_dir / "model4.bin").exists() + assert not Path(tmp_dir / "model5.bin").exists() + + +@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES) +def test_pretraining_tok2vec_vectors_fail(objective): + """Test that pretraining doesn't work with the vectors objective if there are no static vectors""" + config = Config().from_str(pretrain_string_listener) + config["pretraining"]["objective"] = objective + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + with make_tempdir() as tmp_dir: + file_path = write_sample_jsonl(tmp_dir) + filled["paths"]["raw_text"] = file_path + filled = filled.interpolate() + assert filled["initialize"]["vectors"] is None + with pytest.raises(ValueError): + pretrain(filled, tmp_dir) + + +@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES) +def test_pretraining_tok2vec_vectors(objective): + """Test that pretraining works with the vectors objective and static vectors defined""" + config = Config().from_str(pretrain_string_listener) + config["pretraining"]["objective"] = objective + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + with make_tempdir() as tmp_dir: + file_path = write_sample_jsonl(tmp_dir) + filled["paths"]["raw_text"] = file_path + nlp_path = write_vectors_model(tmp_dir) + filled["initialize"]["vectors"] = nlp_path + filled = filled.interpolate() + pretrain(filled, tmp_dir) + + +@pytest.mark.parametrize("config", [pretrain_string_internal, pretrain_string_listener]) +def test_pretraining_tagger_tok2vec(config): + """Test pretraining of the tagger's tok2vec layer (via a listener)""" + config = Config().from_str(pretrain_string_listener) + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + with make_tempdir() as tmp_dir: + file_path = write_sample_jsonl(tmp_dir) + filled["paths"]["raw_text"] = file_path + filled["pretraining"]["component"] = "tagger" + filled["pretraining"]["layer"] = "tok2vec" + filled = filled.interpolate() + pretrain(filled, tmp_dir) + assert Path(tmp_dir / "model0.bin").exists() + assert Path(tmp_dir / "model4.bin").exists() + assert not Path(tmp_dir / "model5.bin").exists() + + +def test_pretraining_tagger(): + """Test that pretraining the tagger itself throws an error (not an appropriate tok2vec layer)""" + config = Config().from_str(pretrain_string_internal) + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config +
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + with make_tempdir() as tmp_dir: + file_path = write_sample_jsonl(tmp_dir) + filled["paths"]["raw_text"] = file_path + filled["pretraining"]["component"] = "tagger" + filled = filled.interpolate() + with pytest.raises(ValueError): + pretrain(filled, tmp_dir) + + +def test_pretraining_training(): + """Test that training can use a pretrained Tok2Vec model""" + config = Config().from_str(pretrain_string_internal) + nlp = util.load_model_from_config(config, auto_fill=True, validate=False) + filled = nlp.config + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) + train_config = util.load_config(DEFAULT_CONFIG_PATH) + filled = train_config.merge(filled) + with make_tempdir() as tmp_dir: + pretrain_dir = tmp_dir / "pretrain" + pretrain_dir.mkdir() + file_path = write_sample_jsonl(pretrain_dir) + filled["paths"]["raw_text"] = file_path + filled["pretraining"]["component"] = "tagger" + filled["pretraining"]["layer"] = "tok2vec" + train_dir = tmp_dir / "train" + train_dir.mkdir() + train_path, dev_path = write_sample_training(train_dir) + filled["paths"]["train"] = train_path + filled["paths"]["dev"] = dev_path + filled = filled.interpolate() + P = filled["pretraining"] + nlp_base = init_nlp(filled) + model_base = ( + nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") + ) + embed_base = None + for node in model_base.walk(): + if node.name == "hashembed": + embed_base = node + pretrain(filled, pretrain_dir) + pretrained_model = Path(pretrain_dir / "model3.bin") + assert pretrained_model.exists() + filled["initialize"]["init_tok2vec"] = str(pretrained_model) + nlp = init_nlp(filled) + model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") + embed = None + for node in model.walk(): + if node.name == "hashembed": + embed = node + # ensure that the tok2vec weights are actually changed by the pretraining + assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E"))) + train(nlp, train_dir) + + +def write_sample_jsonl(tmp_dir): + data = [ + { + "meta": {"id": "1"}, + "text": "This is the best TV you'll ever buy!", + "cats": {"pos": 1, "neg": 0}, + }, + { + "meta": {"id": "2"}, + "text": "I wouldn't buy this again.", + "cats": {"pos": 0, "neg": 1}, + }, + ] + file_path = f"{tmp_dir}/text.jsonl" + srsly.write_jsonl(file_path, data) + return file_path + + +def write_sample_training(tmp_dir): + words = ["The", "players", "start", "."] + tags = ["DT", "NN", "VBZ", "."] + doc = Doc(English().vocab, words=words, tags=tags) + doc_bin = DocBin() + doc_bin.add(doc) + train_path = f"{tmp_dir}/train.spacy" + dev_path = f"{tmp_dir}/dev.spacy" + doc_bin.to_disk(train_path) + doc_bin.to_disk(dev_path) + return train_path, dev_path + + +def write_vectors_model(tmp_dir): + import numpy + + vocab = Vocab() + vector_data = { + "dog": numpy.random.uniform(-1, 1, (300,)), + "cat": numpy.random.uniform(-1, 1, (300,)), + "orange": numpy.random.uniform(-1, 1, (300,)), + } + for word, vector in vector_data.items(): + vocab.set_vector(word, vector) + nlp_path = tmp_dir / "vectors_model" + nlp = English(vocab) + nlp.to_disk(nlp_path) + return str(nlp_path) diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 1c698abcc..8c5c81625 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,6 +1,6 @@ from typing import Dict, 
Iterable, Callable import pytest -from thinc.api import Config +from thinc.api import Config, fix_random_seed from spacy import Language from spacy.util import load_model_from_config, registry, resolve_dot_names from spacy.schemas import ConfigSchemaTraining @@ -27,8 +27,8 @@ def test_readers(): factory = "textcat" """ - @registry.readers.register("myreader.v1") - def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: + @registry.readers("myreader.v1") + def myreader() -> Dict[str, Callable[[Language], Iterable[Example]]]: annots = {"cats": {"POS": 1.0, "NEG": 0.0}} def reader(nlp: Language): @@ -64,8 +64,8 @@ def test_readers(): @pytest.mark.parametrize( "reader,additional_config", [ - ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}), - ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}), + ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}), ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}), ], ) @@ -82,17 +82,18 @@ def test_cat_readers(reader, additional_config): [nlp] lang = "en" - pipeline = ["tok2vec", "textcat"] + pipeline = ["tok2vec", "textcat_multilabel"] [components] [components.tok2vec] factory = "tok2vec" - [components.textcat] - factory = "textcat" + [components.textcat_multilabel] + factory = "textcat_multilabel" """ config = Config().from_str(nlp_config_string) + fix_random_seed(config["training"]["seed"]) config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index c7a85bf87..cd428be15 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): def test_gold_biluo_4791(en_vocab, en_tokenizer): - doc = en_tokenizer("I'll return the ₹54 amount") - gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] + doc = en_tokenizer("I'll return the A54 amount") + gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] entities = [(16, 19, "MONEY")] example = Example.from_dict( @@ -426,6 +426,37 @@ def test_aligned_spans_x2y(en_vocab, en_tokenizer): assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)] +def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer): + text = "I flew to San Francisco Valley" + nlp = English() + doc = nlp(text) + # the reference doc has overlapping spans + gold_doc = nlp.make_doc(text) + spans = [] + prefix = "I flew to " + spans.append( + gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY") + ) + spans.append( + gold_doc.char_span( + len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY" + ) + ) + spans_key = "overlap_ents" + gold_doc.spans[spans_key] = spans + example = Example(doc, gold_doc) + spans_gold = example.reference.spans[spans_key] + assert [(ent.start, ent.end) for ent in spans_gold] == [(3, 5), (3, 6)] + + # Ensure that 'get_aligned_spans_y2x' has the aligned entities correct + spans_y2x_no_overlap = example.get_aligned_spans_y2x( + spans_gold, allow_overlap=False + ) + assert [(ent.start, ent.end) for ent in spans_y2x_no_overlap] == [(3, 5)] + spans_y2x_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=True) + assert [(ent.start, ent.end) for ent 
in spans_y2x_overlap] == [(3, 5), (3, 6)] + + def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] diff --git a/spacy/tests/util.py b/spacy/tests/util.py index ef7b4d00d..365ea4349 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -5,6 +5,7 @@ import srsly from spacy.tokens import Doc from spacy.vocab import Vocab from spacy.util import make_tempdir # noqa: F401 +from thinc.api import get_current_ops @contextlib.contextmanager @@ -58,7 +59,10 @@ def add_vecs_to_vocab(vocab, vectors): def get_cosine(vec1, vec2): """Get cosine for two given vectors""" - return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)) + OPS = get_current_ops() + v1 = OPS.to_numpy(OPS.asarray(vec1)) + v2 = OPS.to_numpy(OPS.asarray(vec2)) + return numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)) def assert_docs_equal(doc1, doc2): diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index 4288f427c..b6fee6628 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -55,6 +55,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab["199"].check_flag(IS_DIGIT) is False assert en_vocab["the"].check_flag(is_len4) is False assert en_vocab["dogs"].check_flag(is_len4) is True + en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT) def test_vocab_lexeme_oov_rank(en_vocab): diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 4257022ea..23597455f 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,14 +1,17 @@ import pytest import numpy from numpy.testing import assert_allclose, assert_equal +from thinc.api import get_current_ops from spacy.vocab import Vocab from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer -from spacy.strings import hash_string +from spacy.strings import hash_string # type: ignore from spacy.tokens import Doc from ..util import add_vecs_to_vocab, get_cosine, make_tempdir +OPS = get_current_ops() + @pytest.fixture def strings(): @@ -18,21 +21,21 @@ def strings(): @pytest.fixture def vectors(): return [ - ("apple", [1, 2, 3]), - ("orange", [-1, -2, -3]), - ("and", [-1, -1, -1]), - ("juice", [5, 5, 10]), - ("pie", [7, 6.3, 8.9]), + ("apple", OPS.asarray([1, 2, 3])), + ("orange", OPS.asarray([-1, -2, -3])), + ("and", OPS.asarray([-1, -1, -1])), + ("juice", OPS.asarray([5, 5, 10])), + ("pie", OPS.asarray([7, 6.3, 8.9])), ] @pytest.fixture def ngrams_vectors(): return [ - ("apple", [1, 2, 3]), - ("app", [-0.1, -0.2, -0.3]), - ("ppl", [-0.2, -0.3, -0.4]), - ("pl", [0.7, 0.8, 0.9]), + ("apple", OPS.asarray([1, 2, 3])), + ("app", OPS.asarray([-0.1, -0.2, -0.3])), + ("ppl", OPS.asarray([-0.2, -0.3, -0.4])), + ("pl", OPS.asarray([0.7, 0.8, 0.9])), ] @@ -171,8 +174,10 @@ def test_vectors_most_similar_identical(): @pytest.mark.parametrize("text", ["apple and orange"]) def test_vectors_token_vector(tokenizer_v, vectors, text): doc = tokenizer_v(text) - assert vectors[0] == (doc[0].text, list(doc[0].vector)) - assert vectors[1] == (doc[2].text, list(doc[2].vector)) + assert vectors[0][0] == doc[0].text + assert all([a == b for a, b in zip(vectors[0][1], doc[0].vector)]) + assert vectors[1][0] == doc[2].text + assert all([a == b for a, b in zip(vectors[1][1], doc[2].vector)]) 
@pytest.mark.parametrize("text", ["apple"]) @@ -301,7 +306,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2): def test_vocab_add_vector(): vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") + data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 vocab.set_vector("cat", data[0]) @@ -320,10 +325,10 @@ def test_vocab_prune_vectors(): _ = vocab["cat"] # noqa: F841 _ = vocab["dog"] # noqa: F841 _ = vocab["kitten"] # noqa: F841 - data = numpy.ndarray((5, 3), dtype="f") - data[0] = [1.0, 1.2, 1.1] - data[1] = [0.3, 1.3, 1.0] - data[2] = [0.9, 1.22, 1.05] + data = OPS.xp.ndarray((5, 3), dtype="f") + data[0] = OPS.asarray([1.0, 1.2, 1.1]) + data[1] = OPS.asarray([0.3, 1.3, 1.0]) + data[2] = OPS.asarray([0.9, 1.22, 1.05]) vocab.set_vector("cat", data[0]) vocab.set_vector("dog", data[1]) vocab.set_vector("kitten", data[2]) @@ -332,40 +337,41 @@ def test_vocab_prune_vectors(): assert list(remap.keys()) == ["kitten"] neighbour, similarity = list(remap.values())[0] assert neighbour == "cat", remap - assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) + cosine = get_cosine(data[0], data[2]) + assert_allclose(float(similarity), cosine, atol=1e-4, rtol=1e-3) def test_vectors_serialize(): - data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") v = Vectors(data=data, keys=["A", "B", "C"]) b = v.to_bytes() v_r = Vectors() v_r.from_bytes(b) - assert_equal(v.data, v_r.data) + assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data)) assert v.key2row == v_r.key2row v.resize((5, 4)) v_r.resize((5, 4)) - row = v.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f")) - row_r = v_r.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f")) + row = v.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f")) + row_r = v_r.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f")) assert row == row_r - assert_equal(v.data, v_r.data) + assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data)) assert v.is_full == v_r.is_full with make_tempdir() as d: v.to_disk(d) v_r.from_disk(d) - assert_equal(v.data, v_r.data) + assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data)) assert v.key2row == v_r.key2row v.resize((5, 4)) v_r.resize((5, 4)) - row = v.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f")) - row_r = v_r.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f")) + row = v.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f")) + row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f")) assert row == row_r - assert_equal(v.data, v_r.data) + assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data)) def test_vector_is_oov(): vocab = Vocab(vectors_name="test_vocab_is_oov") - data = numpy.ndarray((5, 3), dtype="f") + data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 vocab.set_vector("cat", data[0]) diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index a687059be..56ef1d108 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,5 +1,5 @@ import pytest -from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA +from spacy.attrs import LEMMA, ORTH, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB @@ -30,7 +30,6 @@ def test_vocab_api_shape_attr(en_vocab, text): ("VERB", VERB), ("LEMMA", LEMMA), ("ORTH", ORTH), - ("PROB", PROB), ], ) def test_vocab_api_symbols(en_vocab, string, symbol): 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 9c1398a17..719e8e6f5 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -14,7 +14,7 @@ cdef class Tokenizer: cdef Pool mem cdef PreshMap _cache cdef PreshMap _specials - cpdef readonly Vocab vocab + cdef readonly Vocab vocab cdef object _token_match cdef object _url_match @@ -23,8 +23,8 @@ cdef class Tokenizer: cdef object _infix_finditer cdef object _rules cdef PhraseMatcher _special_matcher - cdef int _property_init_count - cdef int _property_init_max + cdef int _property_init_count # TODO: unused, remove in v3.1 + cdef int _property_init_max # TODO: unused, remove in v3.1 cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 40b7edbcb..5a89e5a17 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -20,18 +20,19 @@ from .attrs import intify_attrs from .symbols import ORTH, NORM from .errors import Errors, Warnings from . import util -from .util import registry +from .util import registry, get_words_and_spaces from .attrs import intify_attrs from .symbols import ORTH from .scorer import Scorer from .training import validate_examples +from .tokens import Span cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries. - DOCS: https://nightly.spacy.io/api/tokenizer + DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None, @@ -54,7 +55,7 @@ cdef class Tokenizer: EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) - DOCS: https://nightly.spacy.io/api/tokenizer#init + DOCS: https://spacy.io/api/tokenizer#init """ self.mem = Pool() self._cache = PreshMap() @@ -68,8 +69,6 @@ cdef class Tokenizer: self._rules = {} self._special_matcher = PhraseMatcher(self.vocab) self._load_special_cases(rules) - self._property_init_count = 0 - self._property_init_max = 4 property token_match: def __get__(self): @@ -78,8 +77,6 @@ cdef class Tokenizer: def __set__(self, token_match): self._token_match = token_match self._reload_special_cases() - if self._property_init_count <= self._property_init_max: - self._property_init_count += 1 property url_match: def __get__(self): @@ -87,7 +84,7 @@ cdef class Tokenizer: def __set__(self, url_match): self._url_match = url_match - self._flush_cache() + self._reload_special_cases() property prefix_search: def __get__(self): @@ -96,8 +93,6 @@ cdef class Tokenizer: def __set__(self, prefix_search): self._prefix_search = prefix_search self._reload_special_cases() - if self._property_init_count <= self._property_init_max: - self._property_init_count += 1 property suffix_search: def __get__(self): @@ -106,8 +101,6 @@ cdef class Tokenizer: def __set__(self, suffix_search): self._suffix_search = suffix_search self._reload_special_cases() - if self._property_init_count <= self._property_init_max: - self._property_init_count += 1 property infix_finditer: def __get__(self): @@ -116,8 +109,6 @@ cdef class Tokenizer: def __set__(self, infix_finditer): self._infix_finditer = infix_finditer self._reload_special_cases() - if self._property_init_count <= self._property_init_max: - self._property_init_count += 1 property rules: def __get__(self): @@ -125,7 +116,7 @@ cdef class Tokenizer: def __set__(self, rules): self._rules = {} - self._reset_cache([key for key in self._cache]) + self._flush_cache() self._flush_specials() self._cache = 
PreshMap() self._specials = PreshMap() @@ -147,7 +138,7 @@ cdef class Tokenizer: string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. - DOCS: https://nightly.spacy.io/api/tokenizer#call + DOCS: https://spacy.io/api/tokenizer#call """ doc = self._tokenize_affixes(string, True) self._apply_special_cases(doc) @@ -209,7 +200,7 @@ cdef class Tokenizer: Defaults to 1000. YIELDS (Doc): A sequence of Doc objects, in order. - DOCS: https://nightly.spacy.io/api/tokenizer#pipe + DOCS: https://spacy.io/api/tokenizer#pipe """ for text in texts: yield self(text) @@ -225,6 +216,7 @@ cdef class Tokenizer: self.mem.free(cached) def _flush_specials(self): + self._special_matcher = PhraseMatcher(self.vocab) for k in self._specials: cached = <_Cached*>self._specials.get(k) del self._specials[k] @@ -245,7 +237,7 @@ cdef class Tokenizer: cdef int offset cdef int modified_doc_length # Find matches for special cases - self._special_matcher.find_matches(doc, &c_matches) + self._special_matcher.find_matches(doc, 0, doc.length, &c_matches) # Skip processing if no matches if c_matches.size() == 0: return True @@ -529,7 +521,7 @@ cdef class Tokenizer: and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. - DOCS: https://nightly.spacy.io/api/tokenizer#find_infix + DOCS: https://spacy.io/api/tokenizer#find_infix """ if self.infix_finditer is None: return 0 @@ -542,7 +534,7 @@ cdef class Tokenizer: string (str): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. - DOCS: https://nightly.spacy.io/api/tokenizer#find_prefix + DOCS: https://spacy.io/api/tokenizer#find_prefix """ if self.prefix_search is None: return 0 @@ -556,7 +548,7 @@ cdef class Tokenizer: string (str): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. - DOCS: https://nightly.spacy.io/api/tokenizer#find_suffix + DOCS: https://spacy.io/api/tokenizer#find_suffix """ if self.suffix_search is None: return 0 @@ -567,7 +559,6 @@ cdef class Tokenizer: """Add special-case tokenization rules.""" if special_cases is not None: for chunk, substrings in sorted(special_cases.items()): - self._validate_special_case(chunk, substrings) self.add_special_case(chunk, substrings) def _validate_special_case(self, chunk, substrings): @@ -596,7 +587,7 @@ cdef class Tokenizer: a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. - DOCS: https://nightly.spacy.io/api/tokenizer#add_special_case + DOCS: https://spacy.io/api/tokenizer#add_special_case """ self._validate_special_case(string, substrings) substrings = list(substrings) @@ -615,16 +606,9 @@ cdef class Tokenizer: self._special_matcher.add(string, None, self._tokenize_affixes(string, False)) def _reload_special_cases(self): - try: - self._property_init_count - except AttributeError: - return - # only reload if all 4 of prefix, suffix, infix, token_match have - # have been initialized - if self.vocab is not None and self._property_init_count >= self._property_init_max: - self._flush_cache() - self._flush_specials() - self._load_special_cases(self._rules) + self._flush_cache() + self._flush_specials() + self._load_special_cases(self._rules) def explain(self, text): """A debugging tokenizer that provides information about which @@ -635,11 +619,17 @@ cdef class Tokenizer: string (str): The string to tokenize. 
RETURNS (list): A list of (pattern_string, token_string) tuples - DOCS: https://nightly.spacy.io/api/tokenizer#explain + DOCS: https://spacy.io/api/tokenizer#explain """ prefix_search = self.prefix_search + if prefix_search is None: + prefix_search = re.compile("a^").search suffix_search = self.suffix_search + if suffix_search is None: + suffix_search = re.compile("a^").search infix_finditer = self.infix_finditer + if infix_finditer is None: + infix_finditer = re.compile("a^").finditer token_match = self.token_match if token_match is None: token_match = re.compile("a^").match @@ -687,7 +677,7 @@ cdef class Tokenizer: tokens.append(("URL_MATCH", substring)) substring = '' elif substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) @@ -705,7 +695,33 @@ cdef class Tokenizer: tokens.append(("TOKEN", substring)) substring = '' tokens.extend(reversed(suffixes)) - return tokens + # Find matches for special cases handled by special matcher + words, spaces = get_words_and_spaces([t[1] for t in tokens], text) + t_words = [] + t_spaces = [] + for word, space in zip(words, spaces): + if not word.isspace(): + t_words.append(word) + t_spaces.append(space) + doc = Doc(self.vocab, words=t_words, spaces=t_spaces) + matches = self._special_matcher(doc) + spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches] + spans = util.filter_spans(spans) + # Replace matched tokens with their exceptions + i = 0 + final_tokens = [] + spans_by_start = {s.start: s for s in spans} + while i < len(tokens): + if i in spans_by_start: + span = spans_by_start[i] + exc = [d[ORTH] for d in special_cases[span.label_]] + for j, orth in enumerate(exc): + final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth])) + i += len(span) + else: + final_tokens.append(tokens[i]) + i += 1 + return final_tokens def score(self, examples, **kwargs): validate_examples(examples, "Tokenizer.score") @@ -718,7 +734,7 @@ cdef class Tokenizer: it doesn't exist. exclude (list): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/tokenizer#to_disk + DOCS: https://spacy.io/api/tokenizer#to_disk """ path = util.ensure_path(path) with path.open("wb") as file_: @@ -732,7 +748,7 @@ cdef class Tokenizer: exclude (list): String names of serialization fields to exclude. RETURNS (Tokenizer): The modified `Tokenizer` object. - DOCS: https://nightly.spacy.io/api/tokenizer#from_disk + DOCS: https://spacy.io/api/tokenizer#from_disk """ path = util.ensure_path(path) with path.open("rb") as file_: @@ -746,10 +762,10 @@ cdef class Tokenizer: exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized form of the `Tokenizer` object. - DOCS: https://nightly.spacy.io/api/tokenizer#to_bytes + DOCS: https://spacy.io/api/tokenizer#to_bytes """ serializers = { - "vocab": lambda: self.vocab.to_bytes(), + "vocab": lambda: self.vocab.to_bytes(exclude=exclude), "prefix_search": lambda: _get_regex_pattern(self.prefix_search), "suffix_search": lambda: _get_regex_pattern(self.suffix_search), "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer), @@ -766,11 +782,11 @@ cdef class Tokenizer: exclude (list): String names of serialization fields to exclude. 
RETURNS (Tokenizer): The `Tokenizer` object. - DOCS: https://nightly.spacy.io/api/tokenizer#from_bytes + DOCS: https://spacy.io/api/tokenizer#from_bytes """ data = {} deserializers = { - "vocab": lambda b: self.vocab.from_bytes(b), + "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude), "prefix_search": lambda b: data.setdefault("prefix_search", b), "suffix_search": lambda b: data.setdefault("suffix_search", b), "infix_finditer": lambda b: data.setdefault("infix_finditer", b), @@ -778,6 +794,15 @@ cdef class Tokenizer: "url_match": lambda b: data.setdefault("url_match", b), "exceptions": lambda b: data.setdefault("rules", b) } + # reset all properties and flush all caches (through rules), + # reset rules first so that _reload_special_cases is trivial/fast as + # the other properties are reset + self.rules = {} + self.prefix_search = None + self.suffix_search = None + self.infix_finditer = None + self.token_match = None + self.url_match = None msg = util.from_bytes(bytes_data, deserializers, exclude) if "prefix_search" in data and isinstance(data["prefix_search"], str): self.prefix_search = re.compile(data["prefix_search"]).search @@ -785,22 +810,12 @@ cdef class Tokenizer: self.suffix_search = re.compile(data["suffix_search"]).search if "infix_finditer" in data and isinstance(data["infix_finditer"], str): self.infix_finditer = re.compile(data["infix_finditer"]).finditer - # for token_match and url_match, set to None to override the language - # defaults if no regex is provided if "token_match" in data and isinstance(data["token_match"], str): self.token_match = re.compile(data["token_match"]).match - else: - self.token_match = None if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match - else: - self.url_match = None if "rules" in data and isinstance(data["rules"], dict): - # make sure to hard reset the cache to remove data from the default exceptions - self._rules = {} - self._flush_cache() - self._flush_specials() - self._load_special_cases(data["rules"]) + self.rules = data["rules"] return self diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 1aefa2b7c..64090925d 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,7 +1,8 @@ from .doc import Doc from .token import Token from .span import Span +from .span_group import SpanGroup from ._serialize import DocBin from .morphanalysis import MorphAnalysis -__all__ = ["Doc", "Token", "Span", "DocBin", "MorphAnalysis"] +__all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"] diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 7b2d2d5b5..83399eafa 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -1,9 +1,10 @@ -from typing import Iterable, Tuple, Union, TYPE_CHECKING +from typing import Iterable, Tuple, Union, Optional, TYPE_CHECKING import weakref from collections import UserDict import srsly from .span_group import SpanGroup +from ..errors import Errors if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports @@ -13,7 +14,7 @@ if TYPE_CHECKING: # Why inherit from UserDict instead of dict here? # Well, the 'dict' class doesn't necessarily delegate everything nicely, -# for performance reasons. The UserDict is slower by better behaved. +# for performance reasons. The UserDict is slower but better behaved. 
# See https://treyhunner.com/2019/04/why-you-shouldnt-inherit-from-list-and-dict-in-python/0ww class SpanGroups(UserDict): """A dict-like proxy held by the Doc, to control access to span groups.""" @@ -22,7 +23,7 @@ class SpanGroups(UserDict): self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple() ) -> None: self.doc_ref = weakref.ref(doc) - UserDict.__init__(self, items) + UserDict.__init__(self, items) # type: ignore[arg-type] def __setitem__(self, key: str, value: Union[SpanGroup, Iterable["Span"]]) -> None: if not isinstance(value, SpanGroup): @@ -31,7 +32,13 @@ class SpanGroups(UserDict): UserDict.__setitem__(self, key, value) def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup: - return SpanGroup(self.doc_ref(), name=name, spans=spans) + doc = self._ensure_doc() + return SpanGroup(doc, name=name, spans=spans) + + def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups": + if doc is None: + doc = self._ensure_doc() + return SpanGroups(doc).from_bytes(self.to_bytes()) def copy(self) -> "SpanGroups": return SpanGroups(self.doc_ref()).from_bytes(self.to_bytes()) @@ -45,8 +52,14 @@ class SpanGroups(UserDict): def from_bytes(self, bytes_data: bytes) -> "SpanGroups": msg = srsly.msgpack_loads(bytes_data) self.clear() - doc = self.doc_ref() + doc = self._ensure_doc() for value_bytes in msg: group = SpanGroup(doc).from_bytes(value_bytes) self[group.name] = group return self + + def _ensure_doc(self) -> "Doc": + doc = self.doc_ref() + if doc is None: + raise ValueError(Errors.E866) + return doc diff --git a/spacy/tokens/_retokenize.pyi b/spacy/tokens/_retokenize.pyi new file mode 100644 index 000000000..8834d38c0 --- /dev/null +++ b/spacy/tokens/_retokenize.pyi @@ -0,0 +1,21 @@ +from typing import Dict, Any, Union, List, Tuple +from .doc import Doc +from .span import Span +from .token import Token +from .. import Vocab + +class Retokenizer: + def __init__(self, doc: Doc) -> None: ... + def merge(self, span: Span, attrs: Dict[Union[str, int], Any] = ...) -> None: ... + def split( + self, + token: Token, + orths: List[str], + heads: List[Union[Token, Tuple[Token, int]]], + attrs: Dict[Union[str, int], List[Any]] = ..., + ) -> None: ... + def __enter__(self) -> Retokenizer: ... + def __exit__(self, *args: Any) -> None: ... + +def normalize_token_attrs(vocab: Vocab, attrs: Dict): ... +def set_token_attrs(py_token: Token, attrs: Dict): ... diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index ed8c4323e..43e6d4aa7 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -24,8 +24,8 @@ from ..strings import get_string_id cdef class Retokenizer: """Helper class for doc.retokenize() context manager. - DOCS: https://nightly.spacy.io/api/doc#retokenize - USAGE: https://nightly.spacy.io/usage/linguistic-features#retokenization + DOCS: https://spacy.io/api/doc#retokenize + USAGE: https://spacy.io/usage/linguistic-features#retokenization """ cdef Doc doc cdef list merges @@ -47,7 +47,7 @@ cdef class Retokenizer: span (Span): The span to merge. attrs (dict): Attributes to set on the merged token. - DOCS: https://nightly.spacy.io/api/doc#retokenizer.merge + DOCS: https://spacy.io/api/doc#retokenizer.merge """ if (span.start, span.end) in self._spans_to_merge: return @@ -73,7 +73,7 @@ cdef class Retokenizer: attrs (dict): Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. 
- DOCS: https://nightly.spacy.io/api/doc#retokenizer.split + DOCS: https://spacy.io/api/doc#retokenizer.split """ if ''.join(orths) != token.text: raise ValueError(Errors.E117.format(new=''.join(orths), old=token.text)) @@ -281,7 +281,8 @@ def _merge(Doc doc, merges): for i in range(doc.length): doc.c[i].head -= i # Set the left/right children, left/right edges - set_children_from_heads(doc.c, 0, doc.length) + if doc.has_annotation("DEP"): + set_children_from_heads(doc.c, 0, doc.length) # Make sure ent_iob remains consistent make_iob_consistent(doc.c, doc.length) # Return the merged Python object @@ -294,7 +295,19 @@ def _resize_tensor(tensor, ranges): for i in range(start, end-1): delete.append(i) xp = get_array_module(tensor) - return xp.delete(tensor, delete, axis=0) + if xp is numpy: + return xp.delete(tensor, delete, axis=0) + else: + offset = 0 + copy_start = 0 + resized_shape = (tensor.shape[0] - len(delete), tensor.shape[1]) + for start, end in ranges: + if copy_start > 0: + tensor[copy_start - offset:start - offset] = tensor[copy_start: start] + offset += end - start - 1 + copy_start = end - 1 + tensor[copy_start - offset:resized_shape[0]] = tensor[copy_start:] + return xp.asarray(tensor[:resized_shape[0]]) def _split(Doc doc, int token_index, orths, heads, attrs): @@ -331,7 +344,13 @@ def _split(Doc doc, int token_index, orths, heads, attrs): to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0) if to_process_tensor: xp = get_array_module(doc.tensor) - doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0) + if xp is numpy: + doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0) + else: + shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1]) + resized_array = xp.zeros(shape, dtype="float32") + resized_array[:doc.tensor.shape[0]] = doc.tensor[:doc.tensor.shape[0]] + doc.tensor = resized_array for token_to_move in range(orig_length - 1, token_index, -1): doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move] if to_process_tensor: @@ -348,7 +367,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): token.norm = 0 # reset norm if to_process_tensor: # setting the tensors of the split tokens to array of zeros - doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32") + doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32") # Update the character offset of the subtokens if i != 0: token.idx = orig_token.idx + idx_offset @@ -392,7 +411,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs): for i in range(doc.length): doc.c[i].head -= i # set children from head - set_children_from_heads(doc.c, 0, doc.length) + if doc.has_annotation("DEP"): + set_children_from_heads(doc.c, 0, doc.length) def _validate_extensions(extensions): diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index bb1f515ec..510a2ea71 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,6 +1,7 @@ -from typing import Iterable, Iterator, Union +from typing import List, Dict, Set, Iterable, Iterator, Union, Optional from pathlib import Path import numpy +from numpy import ndarray import zlib import srsly from thinc.api import NumpyOps @@ -8,7 +9,7 @@ from thinc.api import NumpyOps from .doc import Doc from ..vocab import Vocab from ..compat import copy_reg -from ..attrs import SPACY, ORTH, intify_attr +from ..attrs import SPACY, ORTH, intify_attr, IDS from 
..errors import Errors from ..util import ensure_path, SimpleFrozenList @@ -62,19 +63,25 @@ class DocBin: store_user_data (bool): Whether to write the `Doc.user_data` to bytes/file. docs (Iterable[Doc]): Docs to add. - DOCS: https://nightly.spacy.io/api/docbin#init + DOCS: https://spacy.io/api/docbin#init """ - attrs = sorted([intify_attr(attr) for attr in attrs]) + int_attrs = [intify_attr(attr) for attr in attrs] + if None in int_attrs: + non_valid = [attr for attr in attrs if intify_attr(attr) is None] + raise KeyError( + Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys()) + ) from None + attrs = sorted(int_attrs) self.version = "0.1" self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] - self.tokens = [] - self.spaces = [] - self.cats = [] - self.span_groups = [] - self.user_data = [] - self.flags = [] - self.strings = set() + self.tokens: List[ndarray] = [] + self.spaces: List[ndarray] = [] + self.cats: List[Dict] = [] + self.span_groups: List[bytes] = [] + self.user_data: List[Optional[bytes]] = [] + self.flags: List[Dict] = [] + self.strings: Set[str] = set() self.store_user_data = store_user_data for doc in docs: self.add(doc) @@ -88,7 +95,7 @@ class DocBin: doc (Doc): The Doc object to add. - DOCS: https://nightly.spacy.io/api/docbin#add + DOCS: https://spacy.io/api/docbin#add """ array = doc.to_array(self.attrs) if len(array.shape) == 1: @@ -103,10 +110,12 @@ class DocBin: self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) + self.strings.add(token.norm_) self.strings.add(str(token.morph)) self.strings.add(token.dep_) self.strings.add(token.ent_type_) self.strings.add(token.ent_kb_id_) + self.strings.add(token.ent_id_) self.cats.append(doc.cats) self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.span_groups.append(doc.spans.to_bytes()) @@ -122,7 +131,7 @@ class DocBin: vocab (Vocab): The shared vocab. YIELDS (Doc): The Doc objects. - DOCS: https://nightly.spacy.io/api/docbin#get_docs + DOCS: https://spacy.io/api/docbin#get_docs """ for string in self.strings: vocab[string] @@ -130,11 +139,11 @@ class DocBin: for i in range(len(self.tokens)): flags = self.flags[i] tokens = self.tokens[i] - spaces = self.spaces[i] + spaces: Optional[ndarray] = self.spaces[i] if flags.get("has_unknown_spaces"): spaces = None - doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) - doc = doc.from_array(self.attrs, tokens) + doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) # type: ignore + doc = doc.from_array(self.attrs, tokens) # type: ignore doc.cats = self.cats[i] if self.span_groups[i]: doc.spans.from_bytes(self.span_groups[i]) @@ -153,7 +162,7 @@ class DocBin: other (DocBin): The DocBin to merge into the current bin. - DOCS: https://nightly.spacy.io/api/docbin#merge + DOCS: https://spacy.io/api/docbin#merge """ if self.attrs != other.attrs: raise ValueError( @@ -180,7 +189,7 @@ class DocBin: RETURNS (bytes): The serialized DocBin. - DOCS: https://nightly.spacy.io/api/docbin#to_bytes + DOCS: https://spacy.io/api/docbin#to_bytes """ for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape # this should never happen @@ -208,7 +217,7 @@ class DocBin: bytes_data (bytes): The data to load from. RETURNS (DocBin): The loaded DocBin. 
- DOCS: https://nightly.spacy.io/api/docbin#from_bytes + DOCS: https://spacy.io/api/docbin#from_bytes """ try: msg = srsly.msgpack_loads(zlib.decompress(bytes_data)) @@ -240,11 +249,14 @@ class DocBin: path (str / Path): The file path. - DOCS: https://nightly.spacy.io/api/docbin#to_disk + DOCS: https://spacy.io/api/docbin#to_disk """ path = ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes()) + try: + file_.write(self.to_bytes()) + except ValueError: + raise ValueError(Errors.E870) def from_disk(self, path: Union[str, Path]) -> "DocBin": """Load the DocBin from a file (typically called .spacy). @@ -252,7 +264,7 @@ class DocBin: path (str / Path): The file path. RETURNS (DocBin): The loaded DocBin. - DOCS: https://nightly.spacy.io/api/docbin#to_disk + DOCS: https://spacy.io/api/docbin#to_disk """ path = ensure_path(path) with path.open("rb") as file_: diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi new file mode 100644 index 000000000..2b18cee7a --- /dev/null +++ b/spacy/tokens/doc.pyi @@ -0,0 +1,170 @@ +from typing import Callable, Protocol, Iterable, Iterator, Optional +from typing import Union, Tuple, List, Dict, Any, overload +from cymem.cymem import Pool +from thinc.types import Floats1d, Floats2d, Ints2d +from .span import Span +from .token import Token +from ._dict_proxies import SpanGroups +from ._retokenize import Retokenizer +from ..lexeme import Lexeme +from ..vocab import Vocab +from .underscore import Underscore +from pathlib import Path +import numpy + +class DocMethod(Protocol): + def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] + +class Doc: + vocab: Vocab + mem: Pool + spans: SpanGroups + max_length: int + length: int + sentiment: float + cats: Dict[str, float] + user_hooks: Dict[str, Callable[..., Any]] + user_token_hooks: Dict[str, Callable[..., Any]] + user_span_hooks: Dict[str, Callable[..., Any]] + tensor: numpy.ndarray + user_data: Dict[str, Any] + has_unknown_spaces: bool + @classmethod + def set_extension( + cls, + name: str, + default: Optional[Any] = ..., + getter: Optional[Callable[[Doc], Any]] = ..., + setter: Optional[Callable[[Doc, Any], None]] = ..., + method: Optional[DocMethod] = ..., + force: bool = ..., + ) -> None: ... + @classmethod + def get_extension( + cls, name: str + ) -> Tuple[ + Optional[Any], + Optional[DocMethod], + Optional[Callable[[Doc], Any]], + Optional[Callable[[Doc, Any], None]], + ]: ... + @classmethod + def has_extension(cls, name: str) -> bool: ... + @classmethod + def remove_extension( + cls, name: str + ) -> Tuple[ + Optional[Any], + Optional[DocMethod], + Optional[Callable[[Doc], Any]], + Optional[Callable[[Doc, Any], None]], + ]: ... + def __init__( + self, + vocab: Vocab, + words: Optional[List[str]] = ..., + spaces: Optional[List[bool]] = ..., + user_data: Optional[Dict[Any, Any]] = ..., + tags: Optional[List[str]] = ..., + pos: Optional[List[str]] = ..., + morphs: Optional[List[str]] = ..., + lemmas: Optional[List[str]] = ..., + heads: Optional[List[int]] = ..., + deps: Optional[List[str]] = ..., + sent_starts: Optional[List[Union[bool, None]]] = ..., + ents: Optional[List[str]] = ..., + ) -> None: ... + @property + def _(self) -> Underscore: ... + @property + def is_tagged(self) -> bool: ... + @property + def is_parsed(self) -> bool: ... + @property + def is_nered(self) -> bool: ... + @property + def is_sentenced(self) -> bool: ... + def has_annotation( + self, attr: Union[int, str], *, require_complete: bool = ... + ) -> bool: ... 
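For context on the new `doc.pyi` stub above: `has_annotation` is the replacement for the deprecated `is_tagged`/`is_parsed`/`is_nered`/`is_sentenced` flags it sits next to. A quick sketch with a blank pipeline:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("No tagger has run on this text")
print(doc.has_annotation("TAG"))  # False: nothing in the pipeline set tags
print(doc.has_annotation("DEP"))  # False: no parse either
```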
+ @overload + def __getitem__(self, i: int) -> Token: ... + @overload + def __getitem__(self, i: slice) -> Span: ... + def __iter__(self) -> Iterator[Token]: ... + def __len__(self) -> int: ... + def __unicode__(self) -> str: ... + def __bytes__(self) -> bytes: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + @property + def doc(self) -> Doc: ... + def char_span( + self, + start_idx: int, + end_idx: int, + label: Union[int, str] = ..., + kb_id: Union[int, str] = ..., + vector: Optional[Floats1d] = ..., + alignment_mode: str = ..., + ) -> Span: ... + def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... + @property + def has_vector(self) -> bool: ... + vector: Floats1d + vector_norm: float + @property + def text(self) -> str: ... + @property + def text_with_ws(self) -> str: ... + ents: Tuple[Span] + def set_ents( + self, + entities: List[Span], + *, + blocked: Optional[List[Span]] = ..., + missing: Optional[List[Span]] = ..., + outside: Optional[List[Span]] = ..., + default: str = ... + ) -> None: ... + @property + def noun_chunks(self) -> Iterator[Span]: ... + @property + def sents(self) -> Iterator[Span]: ... + @property + def lang(self) -> int: ... + @property + def lang_(self) -> str: ... + def count_by( + self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ... + ) -> Dict[Any, int]: ... + def from_array(self, attrs: List[int], array: Ints2d) -> Doc: ... + def to_array(self, py_attr_ids: List[int]) -> numpy.ndarray: ... + @staticmethod + def from_docs( + docs: List[Doc], + ensure_whitespace: bool = ..., + attrs: Optional[Union[Tuple[Union[str, int]], List[Union[int, str]]]] = ..., + ) -> Doc: ... + def get_lca_matrix(self) -> Ints2d: ... + def copy(self) -> Doc: ... + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = ... + ) -> None: ... + def from_disk( + self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ... + ) -> Doc: ... + def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... + def from_bytes( + self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ... + ) -> Doc: ... + def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... + def from_dict( + self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ... + ) -> Doc: ... + def extend_tensor(self, tensor: Floats2d) -> None: ... + def retokenize(self) -> Retokenizer: ... + def to_json(self, underscore: Optional[List[str]] = ...) -> Dict[str, Any]: ... + def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ... + @staticmethod + def _get_array_attrs() -> Tuple[Any]: ... diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 66ad722b7..ee3fa8906 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -6,7 +6,7 @@ from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t import copy -from collections import Counter +from collections import Counter, defaultdict from enum import Enum import itertools import numpy @@ -30,6 +30,7 @@ from ..compat import copy_reg, pickle from ..errors import Errors, Warnings from ..morphology import Morphology from .. import util +from .. 
import parts_of_speech from .underscore import Underscore, get_ext_args from ._retokenize import Retokenizer from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS @@ -116,7 +117,7 @@ cdef class Doc: >>> from spacy.tokens import Doc >>> doc = Doc(nlp.vocab, words=["hello", "world", "!"], spaces=[True, False, False]) - DOCS: https://nightly.spacy.io/api/doc + DOCS: https://spacy.io/api/doc """ @classmethod @@ -130,8 +131,8 @@ cdef class Doc: method (callable): Optional method for method extension. force (bool): Force overwriting existing attribute. - DOCS: https://nightly.spacy.io/api/doc#set_extension - USAGE: https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes + DOCS: https://spacy.io/api/doc#set_extension + USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes """ if cls.has_extension(name) and not kwargs.get("force", False): raise ValueError(Errors.E090.format(name=name, obj="Doc")) @@ -144,7 +145,7 @@ cdef class Doc: name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. - DOCS: https://nightly.spacy.io/api/doc#get_extension + DOCS: https://spacy.io/api/doc#get_extension """ return Underscore.doc_extensions.get(name) @@ -155,7 +156,7 @@ cdef class Doc: name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. - DOCS: https://nightly.spacy.io/api/doc#has_extension + DOCS: https://spacy.io/api/doc#has_extension """ return name in Underscore.doc_extensions @@ -167,7 +168,7 @@ cdef class Doc: RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. - DOCS: https://nightly.spacy.io/api/doc#remove_extension + DOCS: https://spacy.io/api/doc#remove_extension """ if not cls.has_extension(name): raise ValueError(Errors.E046.format(name=name)) @@ -193,11 +194,12 @@ cdef class Doc: vocab (Vocab): A vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). - words (Optional[List[str]]): A list of unicode strings to add to the document - as words. If `None`, defaults to empty list. - spaces (Optional[List[bool]]): A list of boolean values, of the same length as - words. True means that the word is followed by a space, False means - it is not. If `None`, defaults to `[True]*len(words)` + words (Optional[List[Union[str, int]]]): A list of unicode strings or + hash values to add to the document as words. If `None`, defaults to + empty list. + spaces (Optional[List[bool]]): A list of boolean values, of the same + length as `words`. `True` means that the word is followed by a space, + `False` means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None. @@ -219,7 +221,7 @@ cdef class Doc: length as words, as IOB tags to assign as token.ent_iob and token.ent_type. Defaults to None. 
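The `Doc` constructor documented above also gains stricter validation further down in this hunk (the new E1017 and E1021 errors). Roughly, assuming a blank English vocab:

```python
import spacy
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
words = ["She", "runs"]

# Fully specified parse information is accepted as before:
doc = Doc(vocab, words=words, pos=["PRON", "VERB"],
          heads=[1, 1], deps=["nsubj", "ROOT"])

try:
    Doc(vocab, words=words, heads=[1, 1])  # heads without deps
except ValueError:
    pass  # new E1017

try:
    Doc(vocab, words=words, pos=["PRON", "VRB"])  # "VRB" is not a valid UPOS tag
except ValueError:
    pass  # new E1021
```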
- DOCS: https://nightly.spacy.io/api/doc#init + DOCS: https://spacy.io/api/doc#init """ self.vocab = vocab size = max(20, (len(words) if words is not None else 0)) @@ -265,7 +267,10 @@ cdef class Doc: elif isinstance(word, bytes): raise ValueError(Errors.E028.format(value=word)) else: - lexeme = self.vocab.get_by_orth(self.vocab.mem, word) + try: + lexeme = self.vocab.get_by_orth(self.vocab.mem, word) + except TypeError: + raise TypeError(Errors.E1022.format(wtype=type(word))) self.push_back(lexeme, has_space) if heads is not None: @@ -275,6 +280,8 @@ cdef class Doc: deps = [dep if dep is not None else MISSING_DEP_ for dep in deps] if deps and not heads: heads = [0] * len(deps) + if heads and not deps: + raise ValueError(Errors.E1017) if sent_starts is not None: for i in range(len(sent_starts)): if sent_starts[i] is True: @@ -283,6 +290,10 @@ cdef class Doc: sent_starts[i] = -1 elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]: sent_starts[i] = 0 + if pos is not None: + for pp in set(pos): + if pp not in parts_of_speech.IDS: + raise ValueError(Errors.E1021.format(pp=pp)) ent_iobs = None ent_types = None if ents is not None: @@ -399,7 +410,7 @@ cdef class Doc: every token in the doc. RETURNS (bool): Whether annotation is present. - DOCS: https://nightly.spacy.io/api/doc#has_annotation + DOCS: https://spacy.io/api/doc#has_annotation """ # empty docs are always annotated @@ -450,7 +461,7 @@ cdef class Doc: You can use negative indices and open-ended ranges, which have their normal Python semantics. - DOCS: https://nightly.spacy.io/api/doc#getitem + DOCS: https://spacy.io/api/doc#getitem """ if isinstance(i, slice): start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step) @@ -467,7 +478,7 @@ cdef class Doc: than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython. - DOCS: https://nightly.spacy.io/api/doc#iter + DOCS: https://spacy.io/api/doc#iter """ cdef int i for i in range(self.length): @@ -478,7 +489,7 @@ cdef class Doc: RETURNS (int): The number of tokens in the document. - DOCS: https://nightly.spacy.io/api/doc#len + DOCS: https://spacy.io/api/doc#len """ return self.length @@ -519,14 +530,15 @@ cdef class Doc: partially covered by the character span). Defaults to "strict". RETURNS (Span): The newly constructed object. - DOCS: https://nightly.spacy.io/api/doc#char_span + DOCS: https://spacy.io/api/doc#char_span """ if not isinstance(label, int): label = self.vocab.strings.add(label) if not isinstance(kb_id, int): kb_id = self.vocab.strings.add(kb_id) - if alignment_mode not in ("strict", "contract", "expand"): - alignment_mode = "strict" + alignment_modes = ("strict", "contract", "expand") + if alignment_mode not in alignment_modes: + raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes))) cdef int start = token_by_char(self.c, self.length, start_idx) if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx): return None @@ -561,7 +573,7 @@ cdef class Doc: `Span`, `Token` and `Lexeme` objects. RETURNS (float): A scalar similarity score. Higher is more similar. - DOCS: https://nightly.spacy.io/api/doc#similarity + DOCS: https://spacy.io/api/doc#similarity """ if "similarity" in self.user_hooks: return self.user_hooks["similarity"](self, other) @@ -594,7 +606,7 @@ cdef class Doc: RETURNS (bool): Whether a word vector is associated with the object. 
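`Doc.char_span` above now rejects unknown alignment modes instead of silently falling back to "strict". For example:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big.")
print(doc.char_span(0, 8, label="GPE"))  # New York

try:
    doc.char_span(0, 8, alignment_mode="fuzzy")  # not one of strict/contract/expand
except ValueError:
    pass  # new E202
```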
- DOCS: https://nightly.spacy.io/api/doc#has_vector + DOCS: https://spacy.io/api/doc#has_vector """ if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) @@ -612,7 +624,7 @@ cdef class Doc: RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the document's semantics. - DOCS: https://nightly.spacy.io/api/doc#vector + DOCS: https://spacy.io/api/doc#vector """ def __get__(self): if "vector" in self.user_hooks: @@ -640,7 +652,7 @@ cdef class Doc: RETURNS (float): The L2 norm of the vector representation. - DOCS: https://nightly.spacy.io/api/doc#vector_norm + DOCS: https://spacy.io/api/doc#vector_norm """ def __get__(self): if "vector_norm" in self.user_hooks: @@ -680,7 +692,7 @@ cdef class Doc: RETURNS (tuple): Entities in the document, one `Span` per entity. - DOCS: https://nightly.spacy.io/api/doc#ents + DOCS: https://spacy.io/api/doc#ents """ def __get__(self): cdef int i @@ -826,7 +838,7 @@ cdef class Doc: YIELDS (Span): Noun chunks in the document. - DOCS: https://nightly.spacy.io/api/doc#noun_chunks + DOCS: https://spacy.io/api/doc#noun_chunks """ if self.noun_chunks_iterator is None: raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang)) @@ -849,7 +861,7 @@ cdef class Doc: YIELDS (Span): Sentences in the document. - DOCS: https://nightly.spacy.io/api/doc#sents + DOCS: https://spacy.io/api/doc#sents """ if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) @@ -906,7 +918,7 @@ cdef class Doc: can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or string name (e.g. 'LEMMA' or 'lemma'). - attr_ids (list[]): A list of attributes (int IDs or string names). + py_attr_ids (list[]): A list of attributes (int IDs or string names). RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. @@ -958,7 +970,7 @@ cdef class Doc: attr_id (int): The attribute ID to key the counts. RETURNS (dict): A dictionary mapping attributes to integer counts. - DOCS: https://nightly.spacy.io/api/doc#count_by + DOCS: https://spacy.io/api/doc#count_by """ cdef int i cdef attr_t attr @@ -1005,7 +1017,7 @@ cdef class Doc: array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values. RETURNS (Doc): Itself. - DOCS: https://nightly.spacy.io/api/doc#from_array + DOCS: https://spacy.io/api/doc#from_array """ # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 @@ -1097,7 +1109,7 @@ cdef class Doc: attrs (list): Optional list of attribute ID ints or attribute name strings. RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given. 
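`Doc.from_docs` is the other method touched heavily here: besides concatenating tokens, the hunk that follows also carries `doc.spans` over into the merged doc by character offset. A rough usage sketch:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Berlin is nice.")
doc1.spans["cities"] = [doc1[0:1]]
doc2 = nlp("So is Paris.")
doc2.spans["cities"] = [doc2[2:3]]

merged = Doc.from_docs([doc1, doc2])
print(merged.text)                               # Berlin is nice. So is Paris.
print([s.text for s in merged.spans["cities"]])  # ['Berlin', 'Paris']
```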
- DOCS: https://nightly.spacy.io/api/doc#from_docs + DOCS: https://spacy.io/api/doc#from_docs """ if not docs: return None @@ -1119,13 +1131,14 @@ cdef class Doc: concat_words = [] concat_spaces = [] concat_user_data = {} + concat_spans = defaultdict(list) char_offset = 0 for doc in docs: concat_words.extend(t.text for t in doc) concat_spaces.extend(bool(t.whitespace_) for t in doc) for key, value in doc.user_data.items(): - if isinstance(key, tuple) and len(key) == 4: + if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.": data_type, name, start, end = key if start is not None or end is not None: start += char_offset @@ -1136,8 +1149,21 @@ cdef class Doc: warnings.warn(Warnings.W101.format(name=name)) else: warnings.warn(Warnings.W102.format(key=key, value=value)) + for key in doc.spans: + # if a spans key is in any doc, include it in the merged doc + # even if it is empty + if key not in concat_spans: + concat_spans[key] = [] + for span in doc.spans[key]: + concat_spans[key].append(( + span.start_char + char_offset, + span.end_char + char_offset, + span.label, + span.kb_id, + span.text, # included as a check + )) char_offset += len(doc.text) - if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space): + if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_): char_offset += 1 arrays = [doc.to_array(attrs) for doc in docs] @@ -1147,11 +1173,12 @@ cdef class Doc: for i, array in enumerate(arrays[:-1]): if len(array) > 0 and not docs[i][-1].is_space: array[-1][spacy_index] = 1 - token_offset = -1 - for doc in docs[:-1]: - token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): - concat_spaces[token_offset] = True + if len(concat_spaces) > 0: + token_offset = -1 + for doc in docs[:-1]: + token_offset += len(doc) + if not (len(doc) > 0 and doc[-1].is_space): + concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays) @@ -1159,6 +1186,22 @@ cdef class Doc: concat_doc.from_array(attrs, concat_array) + for key in concat_spans: + if key not in concat_doc.spans: + concat_doc.spans[key] = [] + for span_tuple in concat_spans[key]: + span = concat_doc.char_span( + span_tuple[0], + span_tuple[1], + label=span_tuple[2], + kb_id=span_tuple[3], + ) + text = span_tuple[4] + if span is not None and span.text == text: + concat_doc.spans[key].append(span) + else: + raise ValueError(Errors.E873.format(key=key, text=text)) + return concat_doc def get_lca_matrix(self): @@ -1169,7 +1212,7 @@ cdef class Doc: RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape (n, n), where n = len(self). - DOCS: https://nightly.spacy.io/api/doc#get_lca_matrix + DOCS: https://spacy.io/api/doc#get_lca_matrix """ return numpy.asarray(_get_lca_matrix(self, 0, len(self))) @@ -1188,6 +1231,7 @@ cdef class Doc: other.user_span_hooks = dict(self.user_span_hooks) other.length = self.length other.max_length = self.max_length + other.spans = self.spans.copy(doc=other) buff_size = other.max_length + (PADDING*2) assert buff_size > 0 tokens = other.mem.alloc(buff_size, sizeof(TokenC)) @@ -1202,7 +1246,7 @@ cdef class Doc: it doesn't exist. Paths may be either strings or Path-like objects. exclude (Iterable[str]): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/doc#to_disk + DOCS: https://spacy.io/api/doc#to_disk """ path = util.ensure_path(path) with path.open("wb") as file_: @@ -1217,7 +1261,7 @@ cdef class Doc: exclude (list): String names of serialization fields to exclude. 
RETURNS (Doc): The modified `Doc` object. - DOCS: https://nightly.spacy.io/api/doc#from_disk + DOCS: https://spacy.io/api/doc#from_disk """ path = util.ensure_path(path) with path.open("rb") as file_: @@ -1231,7 +1275,7 @@ cdef class Doc: RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. - DOCS: https://nightly.spacy.io/api/doc#to_bytes + DOCS: https://spacy.io/api/doc#to_bytes """ return srsly.msgpack_dumps(self.to_dict(exclude=exclude)) @@ -1242,7 +1286,7 @@ cdef class Doc: exclude (list): String names of serialization fields to exclude. RETURNS (Doc): Itself. - DOCS: https://nightly.spacy.io/api/doc#from_bytes + DOCS: https://spacy.io/api/doc#from_bytes """ return self.from_dict(srsly.msgpack_loads(bytes_data), exclude=exclude) @@ -1253,7 +1297,7 @@ cdef class Doc: RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. - DOCS: https://nightly.spacy.io/api/doc#to_bytes + DOCS: https://spacy.io/api/doc#to_bytes """ array_head = Doc._get_array_attrs() strings = set() @@ -1291,7 +1335,7 @@ cdef class Doc: if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)): - util.logger.warning(Warnings.W109) + warnings.warn(Warnings.W109) return util.to_dict(serializers, exclude) def from_dict(self, msg, *, exclude=tuple()): @@ -1301,7 +1345,7 @@ cdef class Doc: exclude (list): String names of serialization fields to exclude. RETURNS (Doc): Itself. - DOCS: https://nightly.spacy.io/api/doc#from_dict + DOCS: https://spacy.io/api/doc#from_dict """ if self.length != 0: raise ValueError(Errors.E033.format(length=self.length)) @@ -1372,8 +1416,8 @@ cdef class Doc: retokenization are invalidated, although they may accidentally continue to work. - DOCS: https://nightly.spacy.io/api/doc#retokenize - USAGE: https://nightly.spacy.io/usage/linguistic-features#retokenization + DOCS: https://spacy.io/api/doc#retokenize + USAGE: https://spacy.io/usage/linguistic-features#retokenization """ return Retokenizer(self) @@ -1645,7 +1689,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): j_idx_in_sent = start + j - sent_start n_missing_tokens_in_sent = len(sent) - j_idx_in_sent # make sure we do not go past `end`, in cases where `end` < sent.end - max_range = min(j + n_missing_tokens_in_sent, end) + max_range = min(j + n_missing_tokens_in_sent, end - start) for k in range(j + 1, max_range): lca = _get_tokens_lca(token_j, doc[start + k]) # if lca is outside of span, we set it to -1 diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi new file mode 100644 index 000000000..b86203cc4 --- /dev/null +++ b/spacy/tokens/morphanalysis.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, Iterator, List, Union +from ..vocab import Vocab + +class MorphAnalysis: + def __init__( + self, vocab: Vocab, features: Union[Dict[str, str], str] = ... + ) -> None: ... + @classmethod + def from_id(cls, vocab: Vocab, key: Any) -> MorphAnalysis: ... + def __contains__(self, feature: str) -> bool: ... + def __iter__(self) -> Iterator[str]: ... + def __len__(self) -> int: ... + def __hash__(self) -> int: ... + def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] + def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] + def get(self, field: Any) -> List[str]: ... + def to_json(self) -> str: ... + def to_dict(self) -> Dict[str, str]: ... 
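The new `morphanalysis.pyi` stub above only describes the existing `MorphAnalysis` API; typical usage looks roughly like this (features set manually for illustration):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
doc[0].set_morph("Number=Plur")
print(doc[0].morph.get("Number"))  # ['Plur']
print(doc[0].morph.to_dict())      # {'Number': 'Plur'}
```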
+ def __str__(self) -> str: ... + def __repr__(self) -> str: ... diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi new file mode 100644 index 000000000..697051e81 --- /dev/null +++ b/spacy/tokens/span.pyi @@ -0,0 +1,126 @@ +from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload +from thinc.types import Floats1d, Ints2d, FloatsXd +from .doc import Doc +from .token import Token +from .underscore import Underscore +from ..lexeme import Lexeme +from ..vocab import Vocab + +class SpanMethod(Protocol): + def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] + +class Span: + @classmethod + def set_extension( + cls, + name: str, + default: Optional[Any] = ..., + getter: Optional[Callable[[Span], Any]] = ..., + setter: Optional[Callable[[Span, Any], None]] = ..., + method: Optional[SpanMethod] = ..., + force: bool = ..., + ) -> None: ... + @classmethod + def get_extension( + cls, name: str + ) -> Tuple[ + Optional[Any], + Optional[SpanMethod], + Optional[Callable[[Span], Any]], + Optional[Callable[[Span, Any], None]], + ]: ... + @classmethod + def has_extension(cls, name: str) -> bool: ... + @classmethod + def remove_extension( + cls, name: str + ) -> Tuple[ + Optional[Any], + Optional[SpanMethod], + Optional[Callable[[Span], Any]], + Optional[Callable[[Span, Any], None]], + ]: ... + def __init__( + self, + doc: Doc, + start: int, + end: int, + label: Union[str, int] = ..., + vector: Optional[Floats1d] = ..., + vector_norm: Optional[float] = ..., + kb_id: Optional[int] = ..., + ) -> None: ... + def __richcmp__(self, other: Span, op: int) -> bool: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + def __repr__(self) -> str: ... + @overload + def __getitem__(self, i: int) -> Token: ... + @overload + def __getitem__(self, i: slice) -> Span: ... + def __iter__(self) -> Iterator[Token]: ... + @property + def _(self) -> Underscore: ... + def as_doc(self, *, copy_user_data: bool = ...) -> Doc: ... + def get_lca_matrix(self) -> Ints2d: ... + def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... + @property + def doc(self) -> Doc: ... + @property + def vocab(self) -> Vocab: ... + @property + def sent(self) -> Span: ... + @property + def ents(self) -> Tuple[Span]: ... + @property + def has_vector(self) -> bool: ... + @property + def vector(self) -> Floats1d: ... + @property + def vector_norm(self) -> float: ... + @property + def tensor(self) -> FloatsXd: ... + @property + def sentiment(self) -> float: ... + @property + def text(self) -> str: ... + @property + def text_with_ws(self) -> str: ... + @property + def noun_chunks(self) -> Iterator[Span]: ... + @property + def root(self) -> Token: ... + def char_span( + self, + start_idx: int, + end_idx: int, + label: int = ..., + kb_id: int = ..., + vector: Optional[Floats1d] = ..., + ) -> Span: ... + @property + def conjuncts(self) -> Tuple[Token]: ... + @property + def lefts(self) -> Iterator[Token]: ... + @property + def rights(self) -> Iterator[Token]: ... + @property + def n_lefts(self) -> int: ... + @property + def n_rights(self) -> int: ... + @property + def subtree(self) -> Iterator[Token]: ... + start: int + end: int + start_char: int + end_char: int + label: int + kb_id: int + ent_id: int + ent_id_: str + @property + def orth_(self) -> str: ... + @property + def lemma_(self) -> str: ... 
+ label_: str + kb_id_: str diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 4e6fb84f5..c9c807d7d 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -6,6 +6,7 @@ from libc.math cimport sqrt import numpy from thinc.api import get_array_module import warnings +import copy from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix from ..structs cimport TokenC, LexemeC @@ -24,7 +25,7 @@ from .underscore import Underscore, get_ext_args cdef class Span: """A slice from a Doc object. - DOCS: https://nightly.spacy.io/api/span + DOCS: https://spacy.io/api/span """ @classmethod def set_extension(cls, name, **kwargs): @@ -37,8 +38,8 @@ cdef class Span: method (callable): Optional method for method extension. force (bool): Force overwriting existing attribute. - DOCS: https://nightly.spacy.io/api/span#set_extension - USAGE: https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes + DOCS: https://spacy.io/api/span#set_extension + USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes """ if cls.has_extension(name) and not kwargs.get("force", False): raise ValueError(Errors.E090.format(name=name, obj="Span")) @@ -51,7 +52,7 @@ cdef class Span: name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. - DOCS: https://nightly.spacy.io/api/span#get_extension + DOCS: https://spacy.io/api/span#get_extension """ return Underscore.span_extensions.get(name) @@ -62,7 +63,7 @@ cdef class Span: name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. - DOCS: https://nightly.spacy.io/api/span#has_extension + DOCS: https://spacy.io/api/span#has_extension """ return name in Underscore.span_extensions @@ -74,7 +75,7 @@ cdef class Span: RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. - DOCS: https://nightly.spacy.io/api/span#remove_extension + DOCS: https://spacy.io/api/span#remove_extension """ if not cls.has_extension(name): raise ValueError(Errors.E046.format(name=name)) @@ -87,12 +88,13 @@ cdef class Span: doc (Doc): The parent document. start (int): The index of the first token of the span. end (int): The index of the first token after the span. - label (uint64): A label to attach to the Span, e.g. for named entities. - kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. + label (int or str): A label to attach to the Span, e.g. for named entities. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + vector_norm (float): The L2 norm of the span's vector representation. + kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. 
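To go with the reordered `Span.__init__` docstring above, a minimal construction example (the label and KB ID are illustrative):

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Ada Lovelace wrote programs.")
ent = Span(doc, 0, 2, label="PERSON", kb_id="Q7259")
doc.ents = [ent]
print(ent.text, ent.label_, ent.kb_id_)  # Ada Lovelace PERSON Q7259
```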
- DOCS: https://nightly.spacy.io/api/span#init + DOCS: https://spacy.io/api/span#init """ if not (0 <= start <= end <= len(doc)): raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc))) @@ -104,13 +106,18 @@ cdef class Span: if label not in doc.vocab.strings: raise ValueError(Errors.E084.format(label=label)) + start_char = doc[start].idx if start < doc.length else len(doc.text) + if start == end: + end_char = start_char + else: + end_char = doc[end - 1].idx + len(doc[end - 1]) self.c = SpanC( label=label, kb_id=kb_id, start=start, end=end, - start_char=doc[start].idx if start < doc.length else 0, - end_char=doc[end - 1].idx + len(doc[end - 1]) if end >= 1 else 0, + start_char=start_char, + end_char=end_char, ) self._vector = vector self._vector_norm = vector_norm @@ -162,7 +169,7 @@ cdef class Span: RETURNS (int): The number of tokens in the span. - DOCS: https://nightly.spacy.io/api/span#len + DOCS: https://spacy.io/api/span#len """ if self.c.end < self.c.start: return 0 @@ -178,7 +185,7 @@ cdef class Span: the span to get. RETURNS (Token or Span): The token at `span[i]`. - DOCS: https://nightly.spacy.io/api/span#getitem + DOCS: https://spacy.io/api/span#getitem """ if isinstance(i, slice): start, end = normalize_slice(len(self), i.start, i.stop, i.step) @@ -198,7 +205,7 @@ cdef class Span: YIELDS (Token): A `Token` object. - DOCS: https://nightly.spacy.io/api/span#iter + DOCS: https://spacy.io/api/span#iter """ for i in range(self.c.start, self.c.end): yield self.doc[i] @@ -212,22 +219,44 @@ cdef class Span: return Underscore(Underscore.span_extensions, self, start=self.c.start_char, end=self.c.end_char) - def as_doc(self, *, bint copy_user_data=False): + def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None): """Create a `Doc` object with a copy of the `Span`'s data. copy_user_data (bool): Whether or not to copy the original doc's user data. + array_head (tuple): `Doc` array attrs, can be passed in to speed up computation. + array (ndarray): `Doc` as array, can be passed in to speed up computation. RETURNS (Doc): The `Doc` copy of the span. - DOCS: https://nightly.spacy.io/api/span#as_doc + DOCS: https://spacy.io/api/span#as_doc """ words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = self.doc._get_array_attrs() - array = self.doc.to_array(array_head) + if array_head is None: + array_head = self.doc._get_array_attrs() + if array is None: + array = self.doc.to_array(array_head) array = array[self.start : self.end] self._fix_dep_copy(array_head, array) + # Fix initial IOB so the entities are valid for doc.ents below. + if len(array) > 0 and ENT_IOB in array_head: + ent_iob_col = array_head.index(ENT_IOB) + if array[0][ent_iob_col] == 1: + array[0][ent_iob_col] = 3 doc.from_array(array_head, array) + # Set partial entities at the beginning or end of the span to have + # missing entity annotation. Note: the initial partial entity could be + # detected from the IOB annotation but the final partial entity can't, + # so detect and remove both in the same way by checking self.ents. 
+ span_ents = {(ent.start, ent.end) for ent in self.ents} + doc_ents = doc.ents + if len(doc_ents) > 0: + # Remove initial partial ent + if (doc_ents[0].start + self.start, doc_ents[0].end + self.start) not in span_ents: + doc.set_ents([], missing=[doc_ents[0]], default="unmodified") + # Remove final partial ent + if (doc_ents[-1].start + self.start, doc_ents[-1].end + self.start) not in span_ents: + doc.set_ents([], missing=[doc_ents[-1]], default="unmodified") doc.noun_chunks_iterator = self.doc.noun_chunks_iterator doc.user_hooks = self.doc.user_hooks doc.user_span_hooks = self.doc.user_span_hooks @@ -241,7 +270,19 @@ cdef class Span: if cat_start == self.start_char and cat_end == self.end_char: doc.cats[cat_label] = value if copy_user_data: - doc.user_data = self.doc.user_data + user_data = {} + char_offset = self.start_char + for key, value in self.doc.user_data.items(): + if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.": + data_type, name, start, end = key + if start is not None or end is not None: + start -= char_offset + if end is not None: + end -= char_offset + user_data[(data_type, name, start, end)] = copy.copy(value) + else: + user_data[key] = copy.copy(value) + doc.user_data = user_data return doc def _fix_dep_copy(self, attrs, array): @@ -291,7 +332,7 @@ cdef class Span: RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape (n, n), where n = len(self). - DOCS: https://nightly.spacy.io/api/span#get_lca_matrix + DOCS: https://spacy.io/api/span#get_lca_matrix """ return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end)) @@ -303,7 +344,7 @@ cdef class Span: `Span`, `Token` and `Lexeme` objects. RETURNS (float): A scalar similarity score. Higher is more similar. - DOCS: https://nightly.spacy.io/api/span#similarity + DOCS: https://spacy.io/api/span#similarity """ if "similarity" in self.doc.user_span_hooks: return self.doc.user_span_hooks["similarity"](self, other) @@ -357,7 +398,12 @@ cdef class Span: @property def sent(self): - """RETURNS (Span): The sentence span that the span is a part of.""" + """Obtain the sentence that contains this span. If the given span + crosses sentence boundaries, return only the first sentence + to which it belongs. + + RETURNS (Span): The sentence span that the span is a part of. + """ if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) # Use `sent_start` token attribute to find sentence boundaries @@ -367,8 +413,8 @@ cdef class Span: start = self.start while self.doc.c[start].sent_start != 1 and start > 0: start += -1 - # Find end of the sentence - end = self.end + # Find end of the sentence - can be within the entity + end = self.start + 1 while end < self.doc.length and self.doc.c[end].sent_start != 1: end += 1 n += 1 @@ -385,7 +431,7 @@ cdef class Span: RETURNS (tuple): Entities in the span, one `Span` per entity. - DOCS: https://nightly.spacy.io/api/span#ents + DOCS: https://spacy.io/api/span#ents """ cdef Span ent ents = [] @@ -404,7 +450,7 @@ cdef class Span: RETURNS (bool): Whether a word vector is associated with the object. - DOCS: https://nightly.spacy.io/api/span#has_vector + DOCS: https://spacy.io/api/span#has_vector """ if "has_vector" in self.doc.user_span_hooks: return self.doc.user_span_hooks["has_vector"](self) @@ -423,12 +469,16 @@ cdef class Span: RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the span's semantics. 
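The `Span.sent` docstring above now spells out what happens when a span crosses sentence boundaries: only the first containing sentence is returned. For example, with a plain sentencizer:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("First sentence here. Second one follows.")
span = doc[2:6]        # "here. Second one" crosses the boundary
print(span.sent.text)  # First sentence here.
```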
- DOCS: https://nightly.spacy.io/api/span#vector + DOCS: https://spacy.io/api/span#vector """ if "vector" in self.doc.user_span_hooks: return self.doc.user_span_hooks["vector"](self) if self._vector is None: - self._vector = sum(t.vector for t in self) / len(self) + if not len(self): + xp = get_array_module(self.vocab.vectors.data) + self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") + else: + self._vector = sum(t.vector for t in self) / len(self) return self._vector @property @@ -437,14 +487,14 @@ cdef class Span: RETURNS (float): The L2 norm of the vector representation. - DOCS: https://nightly.spacy.io/api/span#vector_norm + DOCS: https://spacy.io/api/span#vector_norm """ if "vector_norm" in self.doc.user_span_hooks: return self.doc.user_span_hooks["vector"](self) - vector = self.vector - xp = get_array_module(vector) if self._vector_norm is None: + vector = self.vector total = (vector*vector).sum() + xp = get_array_module(vector) self._vector_norm = xp.sqrt(total) if total != 0. else 0. return self._vector_norm @@ -473,7 +523,7 @@ cdef class Span: def text(self): """RETURNS (str): The original verbatim text of the span.""" text = self.text_with_ws - if self[-1].whitespace_: + if len(self) > 0 and self[-1].whitespace_: text = text[:-1] return text @@ -501,7 +551,7 @@ cdef class Span: YIELDS (Span): Noun chunks in the span. - DOCS: https://nightly.spacy.io/api/span#noun_chunks + DOCS: https://spacy.io/api/span#noun_chunks """ for span in self.doc.noun_chunks: if span.start >= self.start and span.end <= self.end: @@ -515,7 +565,7 @@ cdef class Span: RETURNS (Token): The root token. - DOCS: https://nightly.spacy.io/api/span#root + DOCS: https://spacy.io/api/span#root """ if "root" in self.doc.user_span_hooks: return self.doc.user_span_hooks["root"](self) @@ -563,7 +613,7 @@ cdef class Span: """ start_idx += self.c.start_char end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) @property def conjuncts(self): @@ -571,7 +621,7 @@ cdef class Span: RETURNS (tuple): A tuple of Token objects. - DOCS: https://nightly.spacy.io/api/span#lefts + DOCS: https://spacy.io/api/span#lefts """ return self.root.conjuncts @@ -582,7 +632,7 @@ cdef class Span: YIELDS (Token):A left-child of a token of the span. - DOCS: https://nightly.spacy.io/api/span#lefts + DOCS: https://spacy.io/api/span#lefts """ for token in reversed(self): # Reverse, so we get tokens in order for left in token.lefts: @@ -596,7 +646,7 @@ cdef class Span: YIELDS (Token): A right-child of a token of the span. - DOCS: https://nightly.spacy.io/api/span#rights + DOCS: https://spacy.io/api/span#rights """ for token in self: for right in token.rights: @@ -611,7 +661,7 @@ cdef class Span: RETURNS (int): The number of leftward immediate children of the span, in the syntactic dependency parse. - DOCS: https://nightly.spacy.io/api/span#n_lefts + DOCS: https://spacy.io/api/span#n_lefts """ return len(list(self.lefts)) @@ -623,7 +673,7 @@ cdef class Span: RETURNS (int): The number of rightward immediate children of the span, in the syntactic dependency parse. - DOCS: https://nightly.spacy.io/api/span#n_rights + DOCS: https://spacy.io/api/span#n_rights """ return len(list(self.rights)) @@ -633,7 +683,7 @@ cdef class Span: YIELDS (Token): A token within the span, or a descendant from it. 
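One easy-to-miss fix in the hunk above: `Span.char_span` now forwards `label`, `kb_id` and `vector` to `Doc.char_span` instead of dropping them. Roughly:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("The quick brown fox")
span = doc[1:4]                              # "quick brown fox"
sub = span.char_span(6, 15, label="ANIMAL")  # offsets are relative to the span
print(sub.text, sub.label_)                  # brown fox ANIMAL
```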
- DOCS: https://nightly.spacy.io/api/span#subtree + DOCS: https://spacy.io/api/span#subtree """ for word in self.lefts: yield from word.subtree @@ -704,7 +754,7 @@ cdef class Span: def __get__(self): return self.root.ent_id_ - def __set__(self, hash_t key): + def __set__(self, unicode key): raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property @@ -718,7 +768,7 @@ cdef class Span: @property def lemma_(self): """RETURNS (str): The span's lemma.""" - return " ".join([t.lemma_ for t in self]).strip() + return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() property label_: """RETURNS (str): The span's label.""" @@ -726,9 +776,7 @@ cdef class Span: return self.doc.vocab.strings[self.label] def __set__(self, unicode label_): - if not label_: - label_ = '' - raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) + self.label = self.doc.vocab.strings.add(label_) property kb_id_: """RETURNS (str): The named entity's KB ID.""" @@ -736,13 +784,7 @@ cdef class Span: return self.doc.vocab.strings[self.kb_id] def __set__(self, unicode kb_id_): - if not kb_id_: - kb_id_ = '' - current_label = self.label_ - if not current_label: - current_label = '' - raise NotImplementedError(Errors.E131.format(start=self.start, end=self.end, - label=current_label, kb_id=kb_id_)) + self.kb_id = self.doc.vocab.strings.add(kb_id_) cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi new file mode 100644 index 000000000..26efc3ba0 --- /dev/null +++ b/spacy/tokens/span_group.pyi @@ -0,0 +1,26 @@ +from typing import Any, Dict, Iterable +from .doc import Doc +from .span import Span + +class SpanGroup: + name: str + attrs: Dict[str, Any] + def __init__( + self, + doc: Doc, + *, + name: str = ..., + attrs: Dict[str, Any] = ..., + spans: Iterable[Span] = ... + ) -> None: ... + def __repr__(self) -> str: ... + @property + def doc(self) -> Doc: ... + @property + def has_overlap(self) -> bool: ... + def __len__(self) -> int: ... + def append(self, span: Span) -> None: ... + def extend(self, spans: Iterable[Span]) -> None: ... + def __getitem__(self, i: int) -> Span: ... + def to_bytes(self) -> bytes: ... + def from_bytes(self, bytes_data: bytes) -> SpanGroup: ... diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx index 5b768994e..eb9221584 100644 --- a/spacy/tokens/span_group.pyx +++ b/spacy/tokens/span_group.pyx @@ -1,6 +1,8 @@ import weakref import struct import srsly + +from spacy.errors import Errors from .span cimport Span from libc.stdint cimport uint64_t, uint32_t, int32_t @@ -27,7 +29,7 @@ cdef class SpanGroup: >>> doc.spans["errors"] = [doc[0:1], doc[2:4]] >>> assert isinstance(doc.spans["errors"], SpanGroup) - DOCS: https://nightly.spacy.io/api/spangroup + DOCS: https://spacy.io/api/spangroup """ def __init__(self, doc, *, name="", attrs={}, spans=[]): """Create a SpanGroup. @@ -37,7 +39,7 @@ cdef class SpanGroup: attrs (Dict[str, Any]): Optional JSON-serializable attributes to attach. spans (Iterable[Span]): The spans to add to the group. - DOCS: https://nightly.spacy.io/api/spangroup#init + DOCS: https://spacy.io/api/spangroup#init """ # We need to make this a weak reference, so that the Doc object can # own the SpanGroup without circular references. We do want to get @@ -56,15 +58,19 @@ cdef class SpanGroup: def doc(self): """RETURNS (Doc): The reference document. 
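With the setter changes above, `Span.label_` and `Span.kb_id_` become writable instead of raising `NotImplementedError`. For instance:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy is written in Cython.")
span = doc[0:1]
span.label_ = "TECH"   # previously not settable on an existing span
span.kb_id_ = "Q123"   # illustrative KB identifier
print(span.label_, span.kb_id_)  # TECH Q123
```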
- DOCS: https://nightly.spacy.io/api/spangroup#doc + DOCS: https://spacy.io/api/spangroup#doc """ - return self._doc_ref() + doc = self._doc_ref() + if doc is None: + # referent has been garbage collected + raise RuntimeError(Errors.E866) + return doc @property def has_overlap(self): """RETURNS (bool): Whether the group contains overlapping spans. - DOCS: https://nightly.spacy.io/api/spangroup#has_overlap + DOCS: https://spacy.io/api/spangroup#has_overlap """ if not len(self): return False @@ -79,7 +85,7 @@ cdef class SpanGroup: def __len__(self): """RETURNS (int): The number of spans in the group. - DOCS: https://nightly.spacy.io/api/spangroup#len + DOCS: https://spacy.io/api/spangroup#len """ return self.c.size() @@ -89,7 +95,7 @@ cdef class SpanGroup: span (Span): The span to append. - DOCS: https://nightly.spacy.io/api/spangroup#append + DOCS: https://spacy.io/api/spangroup#append """ if span.doc is not self.doc: raise ValueError("Cannot add span to group: refers to different Doc.") @@ -101,7 +107,7 @@ cdef class SpanGroup: spans (Iterable[Span]): The spans to add. - DOCS: https://nightly.spacy.io/api/spangroup#extend + DOCS: https://spacy.io/api/spangroup#extend """ cdef Span span for span in spans: @@ -113,7 +119,7 @@ cdef class SpanGroup: i (int): The item index. RETURNS (Span): The span at the given index. - DOCS: https://nightly.spacy.io/api/spangroup#getitem + DOCS: https://spacy.io/api/spangroup#getitem """ cdef int size = self.c.size() if i < -size or i >= size: @@ -127,7 +133,7 @@ cdef class SpanGroup: RETURNS (bytes): The serialized span group. - DOCS: https://nightly.spacy.io/api/spangroup#to_bytes + DOCS: https://spacy.io/api/spangroup#to_bytes """ output = {"name": self.name, "attrs": self.attrs, "spans": []} for i in range(self.c.size()): @@ -159,7 +165,7 @@ cdef class SpanGroup: bytes_data (bytes): The span group to load. RETURNS (SpanGroup): The deserialized span group. - DOCS: https://nightly.spacy.io/api/spangroup#from_bytes + DOCS: https://spacy.io/api/spangroup#from_bytes """ msg = srsly.msgpack_loads(bytes_data) self.name = msg["name"] diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi new file mode 100644 index 000000000..bd585d034 --- /dev/null +++ b/spacy/tokens/token.pyi @@ -0,0 +1,208 @@ +from typing import ( + Callable, + Protocol, + Iterator, + Optional, + Union, + Tuple, + Any, +) +from thinc.types import Floats1d, FloatsXd +from .doc import Doc +from .span import Span +from .morphanalysis import MorphAnalysis +from ..lexeme import Lexeme +from ..vocab import Vocab +from .underscore import Underscore + +class TokenMethod(Protocol): + def __call__(self: Token, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] + +class Token: + i: int + doc: Doc + vocab: Vocab + @classmethod + def set_extension( + cls, + name: str, + default: Optional[Any] = ..., + getter: Optional[Callable[[Token], Any]] = ..., + setter: Optional[Callable[[Token, Any], None]] = ..., + method: Optional[TokenMethod] = ..., + force: bool = ..., + ) -> None: ... + @classmethod + def get_extension( + cls, name: str + ) -> Tuple[ + Optional[Any], + Optional[TokenMethod], + Optional[Callable[[Token], Any]], + Optional[Callable[[Token, Any], None]], + ]: ... + @classmethod + def has_extension(cls, name: str) -> bool: ... + @classmethod + def remove_extension( + cls, name: str + ) -> Tuple[ + Optional[Any], + Optional[TokenMethod], + Optional[Callable[[Token], Any]], + Optional[Callable[[Token, Any], None]], + ]: ... 
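For the `SpanGroup` changes earlier in this hunk (the weak `Doc` reference that now raises E866 once the doc is gone, plus `append`/`extend` and the overlap check), usage stays along these lines:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Paris and Berlin are capitals.")
doc.spans["cities"] = [doc[0:1]]        # assigning a list creates a SpanGroup
doc.spans["cities"].extend([doc[2:3]])
print(len(doc.spans["cities"]))         # 2
print(doc.spans["cities"].has_overlap)  # False
```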
+ def __init__(self, vocab: Vocab, doc: Doc, offset: int) -> None: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + def __unicode__(self) -> str: ... + def __bytes__(self) -> bytes: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + def __richcmp__(self, other: Token, op: int) -> bool: ... + @property + def _(self) -> Underscore: ... + def nbor(self, i: int = ...) -> Token: ... + def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... + def has_morph(self) -> bool: ... + morph: MorphAnalysis + @property + def lex(self) -> Lexeme: ... + @property + def lex_id(self) -> int: ... + @property + def rank(self) -> int: ... + @property + def text(self) -> str: ... + @property + def text_with_ws(self) -> str: ... + @property + def prob(self) -> float: ... + @property + def sentiment(self) -> float: ... + @property + def lang(self) -> int: ... + @property + def idx(self) -> int: ... + @property + def cluster(self) -> int: ... + @property + def orth(self) -> int: ... + @property + def lower(self) -> int: ... + @property + def norm(self) -> int: ... + @property + def shape(self) -> int: ... + @property + def prefix(self) -> int: ... + @property + def suffix(self) -> int: ... + lemma: int + pos: int + tag: int + dep: int + @property + def has_vector(self) -> bool: ... + @property + def vector(self) -> Floats1d: ... + @property + def vector_norm(self) -> float: ... + @property + def tensor(self) -> Optional[FloatsXd]: ... + @property + def n_lefts(self) -> int: ... + @property + def n_rights(self) -> int: ... + @property + def sent(self) -> Span: ... + sent_start: bool + is_sent_start: Optional[bool] + is_sent_end: Optional[bool] + @property + def lefts(self) -> Iterator[Token]: ... + @property + def rights(self) -> Iterator[Token]: ... + @property + def children(self) -> Iterator[Token]: ... + @property + def subtree(self) -> Iterator[Token]: ... + @property + def left_edge(self) -> Token: ... + @property + def right_edge(self) -> Token: ... + @property + def ancestors(self) -> Iterator[Token]: ... + def is_ancestor(self, descendant: Token) -> bool: ... + def has_head(self) -> bool: ... + head: Token + @property + def conjuncts(self) -> Tuple[Token]: ... + ent_type: int + ent_type_: str + @property + def ent_iob(self) -> int: ... + @classmethod + def iob_strings(cls) -> Tuple[str]: ... + @property + def ent_iob_(self) -> str: ... + ent_id: int + ent_id_: str + ent_kb_id: int + ent_kb_id_: str + @property + def whitespace_(self) -> str: ... + @property + def orth_(self) -> str: ... + @property + def lower_(self) -> str: ... + norm_: str + @property + def shape_(self) -> str: ... + @property + def prefix_(self) -> str: ... + @property + def suffix_(self) -> str: ... + @property + def lang_(self) -> str: ... + lemma_: str + pos_: str + tag_: str + def has_dep(self) -> bool: ... + dep_: str + @property + def is_oov(self) -> bool: ... + @property + def is_stop(self) -> bool: ... + @property + def is_alpha(self) -> bool: ... + @property + def is_ascii(self) -> bool: ... + @property + def is_digit(self) -> bool: ... + @property + def is_lower(self) -> bool: ... + @property + def is_upper(self) -> bool: ... + @property + def is_title(self) -> bool: ... + @property + def is_punct(self) -> bool: ... + @property + def is_space(self) -> bool: ... + @property + def is_bracket(self) -> bool: ... + @property + def is_quote(self) -> bool: ... + @property + def is_left_punct(self) -> bool: ... + @property + def is_right_punct(self) -> bool: ... 
+ @property + def is_currency(self) -> bool: ... + @property + def like_url(self) -> bool: ... + @property + def like_num(self) -> bool: ... + @property + def like_email(self) -> bool: ... diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 71c0baf63..c5baae510 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -27,7 +27,7 @@ cdef class Token: """An individual token – i.e. a word, punctuation symbol, whitespace, etc. - DOCS: https://nightly.spacy.io/api/token + DOCS: https://spacy.io/api/token """ @classmethod def set_extension(cls, name, **kwargs): @@ -40,8 +40,8 @@ cdef class Token: method (callable): Optional method for method extension. force (bool): Force overwriting existing attribute. - DOCS: https://nightly.spacy.io/api/token#set_extension - USAGE: https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes + DOCS: https://spacy.io/api/token#set_extension + USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes """ if cls.has_extension(name) and not kwargs.get("force", False): raise ValueError(Errors.E090.format(name=name, obj="Token")) @@ -54,7 +54,7 @@ cdef class Token: name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. - DOCS: https://nightly.spacy.io/api/token#get_extension + DOCS: https://spacy.io/api/token#get_extension """ return Underscore.token_extensions.get(name) @@ -65,7 +65,7 @@ cdef class Token: name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. - DOCS: https://nightly.spacy.io/api/token#has_extension + DOCS: https://spacy.io/api/token#has_extension """ return name in Underscore.token_extensions @@ -77,7 +77,7 @@ cdef class Token: RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. - DOCS: https://nightly.spacy.io/api/token#remove_extension + DOCS: https://spacy.io/api/token#remove_extension """ if not cls.has_extension(name): raise ValueError(Errors.E046.format(name=name)) @@ -90,7 +90,7 @@ cdef class Token: doc (Doc): The parent document. offset (int): The index of the token within the document. - DOCS: https://nightly.spacy.io/api/token#init + DOCS: https://spacy.io/api/token#init """ self.vocab = vocab self.doc = doc @@ -105,7 +105,7 @@ cdef class Token: RETURNS (int): The number of unicode characters in the token. - DOCS: https://nightly.spacy.io/api/token#len + DOCS: https://spacy.io/api/token#len """ return self.c.lex.length @@ -168,7 +168,7 @@ cdef class Token: flag_id (int): The ID of the flag attribute. RETURNS (bool): Whether the flag is set. - DOCS: https://nightly.spacy.io/api/token#check_flag + DOCS: https://spacy.io/api/token#check_flag """ return Lexeme.c_check_flag(self.c.lex, flag_id) @@ -178,7 +178,7 @@ cdef class Token: i (int): The relative position of the token to get. Defaults to 1. RETURNS (Token): The token at position `self.doc[self.i+i]`. - DOCS: https://nightly.spacy.io/api/token#nbor + DOCS: https://spacy.io/api/token#nbor """ if self.i+i < 0 or (self.i+i >= len(self.doc)): raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc))) @@ -192,7 +192,7 @@ cdef class Token: `Span`, `Token` and `Lexeme` objects. RETURNS (float): A scalar similarity score. Higher is more similar. 
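Most of the `token.pyx` edits above are doc-URL updates; as a reminder of the extension API those docstrings describe, a custom token attribute is registered like so:

```python
import spacy
from spacy.tokens import Token

Token.set_extension("is_fruit", default=False, force=True)

nlp = spacy.blank("en")
doc = nlp("I like apples")
doc[2]._.is_fruit = True
print(doc[2]._.is_fruit)  # True
```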
- DOCS: https://nightly.spacy.io/api/token#similarity + DOCS: https://spacy.io/api/token#similarity """ if "similarity" in self.doc.user_token_hooks: return self.doc.user_token_hooks["similarity"](self, other) @@ -329,7 +329,7 @@ cdef class Token: @property def shape(self): """RETURNS (uint64): ID of the token's shape, a transform of the - tokens's string, to show orthographic features (e.g. "Xxxx", "dd"). + token's string, to show orthographic features (e.g. "Xxxx", "dd"). """ return self.c.lex.shape @@ -388,7 +388,7 @@ cdef class Token: RETURNS (bool): Whether a word vector is associated with the object. - DOCS: https://nightly.spacy.io/api/token#has_vector + DOCS: https://spacy.io/api/token#has_vector """ if "has_vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["has_vector"](self) @@ -403,7 +403,7 @@ cdef class Token: RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the token's semantics. - DOCS: https://nightly.spacy.io/api/token#vector + DOCS: https://spacy.io/api/token#vector """ if "vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector"](self) @@ -418,7 +418,7 @@ cdef class Token: RETURNS (float): The L2 norm of the vector representation. - DOCS: https://nightly.spacy.io/api/token#vector_norm + DOCS: https://spacy.io/api/token#vector_norm """ if "vector_norm" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector_norm"](self) @@ -441,7 +441,7 @@ cdef class Token: RETURNS (int): The number of leftward immediate children of the word, in the syntactic dependency parse. - DOCS: https://nightly.spacy.io/api/token#n_lefts + DOCS: https://spacy.io/api/token#n_lefts """ return self.c.l_kids @@ -453,7 +453,7 @@ cdef class Token: RETURNS (int): The number of rightward immediate children of the word, in the syntactic dependency parse. - DOCS: https://nightly.spacy.io/api/token#n_rights + DOCS: https://spacy.io/api/token#n_rights """ return self.c.r_kids @@ -485,7 +485,7 @@ cdef class Token: RETURNS (bool / None): Whether the token starts a sentence. None if unknown. - DOCS: https://nightly.spacy.io/api/token#is_sent_start + DOCS: https://spacy.io/api/token#is_sent_start """ def __get__(self): if self.c.sent_start == 0: @@ -514,7 +514,7 @@ cdef class Token: RETURNS (bool / None): Whether the token ends a sentence. None if unknown. - DOCS: https://nightly.spacy.io/api/token#is_sent_end + DOCS: https://spacy.io/api/token#is_sent_end """ def __get__(self): if self.i + 1 == len(self.doc): @@ -536,7 +536,7 @@ cdef class Token: YIELDS (Token): A left-child of the token. - DOCS: https://nightly.spacy.io/api/token#lefts + DOCS: https://spacy.io/api/token#lefts """ cdef int nr_iter = 0 cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) @@ -556,7 +556,7 @@ cdef class Token: YIELDS (Token): A right-child of the token. - DOCS: https://nightly.spacy.io/api/token#rights + DOCS: https://spacy.io/api/token#rights """ cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] @@ -578,7 +578,7 @@ cdef class Token: YIELDS (Token): A child token such that `child.head==self`. - DOCS: https://nightly.spacy.io/api/token#children + DOCS: https://spacy.io/api/token#children """ yield from self.lefts yield from self.rights @@ -591,7 +591,7 @@ cdef class Token: YIELDS (Token): A descendent token such that `self.is_ancestor(descendent) or token == self`. 
- DOCS: https://nightly.spacy.io/api/token#subtree + DOCS: https://spacy.io/api/token#subtree """ for word in self.lefts: yield from word.subtree @@ -600,7 +600,7 @@ cdef class Token: yield from word.subtree @property - def left_edge(self): + def left_edge(self) -> int: """The leftmost token of this token's syntactic descendents. RETURNS (Token): The first token such that `self.is_ancestor(token)`. @@ -608,7 +608,7 @@ cdef class Token: return self.doc[self.c.l_edge] @property - def right_edge(self): + def right_edge(self) -> int: """The rightmost token of this token's syntactic descendents. RETURNS (Token): The last token such that `self.is_ancestor(token)`. @@ -622,7 +622,7 @@ cdef class Token: YIELDS (Token): A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. - DOCS: https://nightly.spacy.io/api/token#ancestors + DOCS: https://spacy.io/api/token#ancestors """ cdef const TokenC* head_ptr = self.c # Guard against infinite loop, no token can have @@ -640,7 +640,7 @@ cdef class Token: descendant (Token): Another token. RETURNS (bool): Whether this token is the ancestor of the descendant. - DOCS: https://nightly.spacy.io/api/token#is_ancestor + DOCS: https://spacy.io/api/token#is_ancestor """ if self.doc is not descendant.doc: return False @@ -655,8 +655,8 @@ cdef class Token: return not Token.missing_head(self.c) property head: - """The syntactic parent, or "governor", of this token. - If token.has_head() is `False`, this method will return itself. + """The syntactic parent, or "governor", of this token. + If token.has_head() is `False`, this method will return itself. RETURNS (Token): The token predicted by the parser to be the head of the current token. @@ -696,7 +696,7 @@ cdef class Token: RETURNS (tuple): The coordinated tokens. - DOCS: https://nightly.spacy.io/api/token#conjuncts + DOCS: https://spacy.io/api/token#conjuncts """ cdef Token word, child if "conjuncts" in self.doc.user_token_hooks: @@ -825,7 +825,7 @@ cdef class Token: @property def shape_(self): - """RETURNS (str): Transform of the tokens's string, to show + """RETURNS (str): Transform of the token's string, to show orthographic features. For example, "Xxxx" or "dd". 
""" return self.vocab.strings[self.c.lex.shape] @@ -867,6 +867,8 @@ cdef class Token: return parts_of_speech.NAMES[self.c.pos] def __set__(self, pos_name): + if pos_name not in parts_of_speech.IDS: + raise ValueError(Errors.E1021.format(pp=pos_name)) self.c.pos = parts_of_speech.IDS[pos_name] property tag_: diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index b7966fd6e..7fa7bf095 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,3 +1,4 @@ +from typing import Dict, Any import functools import copy @@ -6,9 +7,9 @@ from ..errors import Errors class Underscore: mutable_types = (dict, list, set) - doc_extensions = {} - span_extensions = {} - token_extensions = {} + doc_extensions: Dict[Any, Any] = {} + span_extensions: Dict[Any, Any] = {} + token_extensions: Dict[Any, Any] = {} def __init__(self, extensions, obj, start=None, end=None): object.__setattr__(self, "_extensions", extensions) diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 5111b80dc..99fe7c19f 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,4 +1,4 @@ -from .corpus import Corpus # noqa: F401 +from .corpus import Corpus, JsonlCorpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 @@ -7,4 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40 from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 -from .loggers import console_logger, wandb_logger # noqa: F401 +from .loggers import console_logger, wandb_logger_v3 as wandb_logger # noqa: F401 +from .callbacks import create_copy_from_base_model # noqa: F401 diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 13ae45bd2..63b54034c 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,12 +1,10 @@ from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING import random import itertools -import copy from functools import partial from pydantic import BaseModel, StrictStr from ..util import registry -from ..tokens import Doc from .example import Example if TYPE_CHECKING: @@ -24,8 +22,8 @@ class OrthVariantsPaired(BaseModel): class OrthVariants(BaseModel): - paired: List[OrthVariantsPaired] = {} - single: List[OrthVariantsSingle] = {} + paired: List[OrthVariantsPaired] = [] + single: List[OrthVariantsSingle] = [] @registry.augmenters("spacy.orth_variants.v1") @@ -71,14 +69,14 @@ def lower_casing_augmenter( else: example_dict = example.to_dict() doc = nlp.make_doc(example.text.lower()) - example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in doc] + example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] yield example.from_dict(doc, example_dict) def orth_variants_augmenter( nlp: "Language", example: Example, - orth_variants: dict, + orth_variants: Dict, *, level: float = 0.0, lower: float = 0.0, @@ -88,24 +86,15 @@ def orth_variants_augmenter( else: raw_text = example.text orig_dict = example.to_dict() - if not orig_dict["token_annotation"]: - yield example - else: - variant_text, variant_token_annot = make_orth_variants( - nlp, - raw_text, - orig_dict["token_annotation"], - orth_variants, - lower=raw_text is not None and random.random() < lower, - 
) - if variant_text: - doc = nlp.make_doc(variant_text) - else: - doc = Doc(nlp.vocab, words=variant_token_annot["ORTH"]) - variant_token_annot["ORTH"] = [w.text for w in doc] - variant_token_annot["SPACY"] = [w.whitespace_ for w in doc] - orig_dict["token_annotation"] = variant_token_annot - yield example.from_dict(doc, orig_dict) + variant_text, variant_token_annot = make_orth_variants( + nlp, + raw_text, + orig_dict["token_annotation"], + orth_variants, + lower=raw_text is not None and random.random() < lower, + ) + orig_dict["token_annotation"] = variant_token_annot + yield example.from_dict(nlp.make_doc(variant_text), orig_dict) def make_orth_variants( @@ -116,88 +105,53 @@ def make_orth_variants( *, lower: bool = False, ) -> Tuple[str, Dict[str, List[str]]]: - orig_token_dict = copy.deepcopy(token_dict) - ndsv = orth_variants.get("single", []) - ndpv = orth_variants.get("paired", []) words = token_dict.get("ORTH", []) tags = token_dict.get("TAG", []) - # keep unmodified if words or tags are not defined - if words and tags: - if lower: - words = [w.lower() for w in words] - # single variants - punct_choices = [random.choice(x["variants"]) for x in ndsv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndsv)): - if ( - tags[word_idx] in ndsv[punct_idx]["tags"] - and words[word_idx] in ndsv[punct_idx]["variants"] - ): - words[word_idx] = punct_choices[punct_idx] - # paired variants - punct_choices = [random.choice(x["variants"]) for x in ndpv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ - word_idx - ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): - # backup option: random left vs. right from pair - pair_idx = random.choice([0, 1]) - # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) - # next best option: rely on position in variants - # (may not be unambiguous, so order of variants matters) - else: - for pair in ndpv[punct_idx]["variants"]: - if words[word_idx] in pair: - pair_idx = pair.index(words[word_idx]) - words[word_idx] = punct_choices[punct_idx][pair_idx] + # keep unmodified if words are not defined + if not words: + return raw, token_dict + if lower: + words = [w.lower() for w in words] + raw = raw.lower() + # if no tags, only lowercase + if not tags: token_dict["ORTH"] = words - token_dict["TAG"] = tags - # modify raw - if raw is not None: - variants = [] - for single_variants in ndsv: - variants.extend(single_variants["variants"]) - for paired_variants in ndpv: - variants.extend( - list(itertools.chain.from_iterable(paired_variants["variants"])) - ) - # store variants in reverse length order to be able to prioritize - # longer matches (e.g., "---" before "--") - variants = sorted(variants, key=lambda x: len(x)) - variants.reverse() - variant_raw = "" - raw_idx = 0 - # add initial whitespace - while raw_idx < len(raw) and raw[raw_idx].isspace(): - variant_raw += raw[raw_idx] - raw_idx += 1 - for word in words: - match_found = False - # skip whitespace words - if word.isspace(): - match_found = True - # add identical word - elif word not in variants and raw[raw_idx:].startswith(word): - variant_raw += word - raw_idx += len(word) - match_found = True - # add variant word - else: - for variant in variants: - if not match_found and raw[raw_idx:].startswith(variant): - raw_idx += len(variant) - variant_raw += word - match_found = True - # something went wrong, 
abort - # (add a warning message?) - if not match_found: - return raw, orig_token_dict - # add following whitespace - while raw_idx < len(raw) and raw[raw_idx].isspace(): - variant_raw += raw[raw_idx] - raw_idx += 1 - raw = variant_raw + return raw, token_dict + # single variants + ndsv = orth_variants.get("single", []) + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if ( + tags[word_idx] in ndsv[punct_idx]["tags"] + and words[word_idx] in ndsv[punct_idx]["variants"] + ): + words[word_idx] = punct_choices[punct_idx] + # paired variants + ndpv = orth_variants.get("paired", []) + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ + word_idx + ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] + token_dict["ORTH"] = words + # construct modified raw text from words and spaces + raw = "" + for orth, spacy in zip(token_dict["ORTH"], token_dict["SPACY"]): + raw += orth + if spacy: + raw += " " return raw, token_dict diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index c54242eae..f0b6c3123 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -1,4 +1,4 @@ -from typing import Union, Iterable, Sequence, TypeVar, List, Callable +from typing import Union, Iterable, Sequence, TypeVar, List, Callable, Iterator from typing import Optional, Any from functools import partial import itertools @@ -6,7 +6,7 @@ import itertools from ..util import registry, minibatch -Sizing = Union[Iterable[int], int] +Sizing = Union[Sequence[int], int] ItemT = TypeVar("ItemT") BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]] @@ -24,7 +24,7 @@ def configure_minibatch_by_padded_size( The padded size is defined as the maximum length of sequences within the batch multiplied by the number of sequences in the batch. - size (int or Iterable[int]): The largest padded size to batch sequences into. + size (int or Sequence[int]): The largest padded size to batch sequences into. Can be a single integer, or a sequence, allowing for variable batch sizes. buffer (int): The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is @@ -56,7 +56,7 @@ def configure_minibatch_by_words( ) -> BatcherT: """Create a batcher that uses the "minibatch by words" strategy. - size (int or Iterable[int]): The target number of words per batch. + size (int or Sequence[int]): The target number of words per batch. Can be a single integer, or a sequence, allowing for variable batch sizes. tolerance (float): What percentage of the size to allow batches to exceed. 
discard_oversize (bool): Whether to discard sequences that by themselves @@ -66,7 +66,11 @@ def configure_minibatch_by_words( """ optionals = {"get_length": get_length} if get_length is not None else {} return partial( - minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals + minibatch_by_words, + size=size, + tolerance=tolerance, + discard_oversize=discard_oversize, + **optionals ) @@ -76,7 +80,7 @@ def configure_minibatch( ) -> BatcherT: """Create a batcher that creates batches of the specified size. - size (int or Iterable[int]): The target number of items per batch. + size (int or Sequence[int]): The target number of items per batch. Can be a single integer, or a sequence, allowing for variable batch sizes. """ optionals = {"get_length": get_length} if get_length is not None else {} @@ -96,7 +100,7 @@ def minibatch_by_padded_size( The padded size is defined as the maximum length of sequences within the batch multiplied by the number of sequences in the batch. - size (int): The largest padded size to batch sequences into. + size (int or Sequence[int]): The largest padded size to batch sequences into. buffer (int): The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result @@ -107,9 +111,9 @@ def minibatch_by_padded_size( The `len` function is used by default. """ if isinstance(size, int): - size_ = itertools.repeat(size) + size_ = itertools.repeat(size) # type: Iterator[int] else: - size_ = size + size_ = iter(size) for outer_batch in minibatch(seqs, size=buffer): outer_batch = list(outer_batch) target_size = next(size_) @@ -134,7 +138,7 @@ def minibatch_by_words( themselves, or be discarded if discard_oversize=True. seqs (Iterable[Sequence]): The sequences to minibatch. - size (int or Iterable[int]): The target number of words per batch. + size (int or Sequence[int]): The target number of words per batch. Can be a single integer, or a sequence, allowing for variable batch sizes. tolerance (float): What percentage of the size to allow batches to exceed. discard_oversize (bool): Whether to discard sequences that by themselves @@ -143,11 +147,9 @@ def minibatch_by_words( item. The `len` function is used by default. 
""" if isinstance(size, int): - size_ = itertools.repeat(size) - elif isinstance(size, List): - size_ = iter(size) + size_ = itertools.repeat(size) # type: Iterator[int] else: - size_ = size + size_ = iter(size) target_size = next(size_) tol_size = target_size * tolerance batch = [] @@ -212,7 +214,7 @@ def _batch_by_length( lengths_indices = [(get_length(seq), i) for i, seq in enumerate(seqs)] lengths_indices.sort() batches = [] - batch = [] + batch: List[int] = [] for length, i in lengths_indices: if not batch: batch.append(i) diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py new file mode 100644 index 000000000..426fddf90 --- /dev/null +++ b/spacy/training/callbacks.py @@ -0,0 +1,32 @@ +from typing import Callable, Optional +from ..errors import Errors +from ..language import Language +from ..util import load_model, registry, logger + + +@registry.callbacks("spacy.copy_from_base_model.v1") +def create_copy_from_base_model( + tokenizer: Optional[str] = None, + vocab: Optional[str] = None, +) -> Callable[[Language], Language]: + def copy_from_base_model(nlp): + if tokenizer: + logger.info(f"Copying tokenizer from: {tokenizer}") + base_nlp = load_model(tokenizer) + if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: + nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) + else: + raise ValueError( + Errors.E872.format( + curr_config=nlp.config["nlp"]["tokenizer"], + base_config=base_nlp.config["nlp"]["tokenizer"], + ) + ) + if vocab: + logger.info(f"Copying vocab from: {vocab}") + # only reload if the vocab is from a different model + if tokenizer != vocab: + base_nlp = load_model(vocab) + nlp.vocab.from_bytes(base_nlp.vocab.to_bytes()) + + return copy_from_base_model diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 8c1bad9ea..28b21c5f0 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -124,6 +124,9 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): nlp = load_model(model) if "parser" in nlp.pipe_names: msg.info(f"Segmenting sentences with parser from model '{model}'.") + for name, proc in nlp.pipeline: + if "parser" in getattr(proc, "listening_components", []): + nlp.replace_listeners(name, "parser", ["model.tok2vec"]) sentencizer = nlp.get_pipe("parser") if not sentencizer: msg.info( diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 356021a1d..66156b6e5 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -69,7 +69,7 @@ def read_conllx( ner_tag_pattern="", ner_map=None, ): - """ Yield docs, one for each sentence """ + """Yield docs, one for each sentence""" vocab = Vocab() # need vocab to make a minimal Doc for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index b3ff30e66..b30d918fd 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -2,6 +2,7 @@ import warnings from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable from typing import Optional from pathlib import Path +import random import srsly from .. 
import util @@ -40,8 +41,8 @@ def create_docbin_reader( @util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( - path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0 -) -> Callable[["Language"], Iterable[Doc]]: + path: Union[str, Path], min_length: int = 0, max_length: int = 0, limit: int = 0 +) -> Callable[["Language"], Iterable[Example]]: return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) @@ -96,8 +97,9 @@ class Corpus: Defaults to 0, which indicates no limit. augment (Callable[Example, Iterable[Example]]): Optional data augmentation function, to extrapolate additional examples from your annotations. + shuffle (bool): Whether to shuffle the examples. - DOCS: https://nightly.spacy.io/api/corpus + DOCS: https://spacy.io/api/corpus """ def __init__( @@ -108,12 +110,14 @@ class Corpus: gold_preproc: bool = False, max_length: int = 0, augmenter: Optional[Callable] = None, + shuffle: bool = False, ) -> None: self.path = util.ensure_path(path) self.gold_preproc = gold_preproc self.max_length = max_length self.limit = limit self.augmenter = augmenter if augmenter is not None else dont_augment + self.shuffle = shuffle def __call__(self, nlp: "Language") -> Iterator[Example]: """Yield examples from the data. @@ -121,15 +125,19 @@ class Corpus: nlp (Language): The current nlp object. YIELDS (Example): The examples. - DOCS: https://nightly.spacy.io/api/corpus#call + DOCS: https://spacy.io/api/corpus#call """ ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE)) + if self.shuffle: + ref_docs = list(ref_docs) # type: ignore + random.shuffle(ref_docs) # type: ignore + if self.gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) else: examples = self.make_examples(nlp, ref_docs) for real_eg in examples: - for augmented_eg in self.augmenter(nlp, real_eg): + for augmented_eg in self.augmenter(nlp, real_eg): # type: ignore[operator] yield augmented_eg def _make_example( @@ -155,7 +163,7 @@ class Corpus: continue elif self.max_length == 0 or len(reference) < self.max_length: yield self._make_example(nlp, reference, False) - elif reference.is_sentenced: + elif reference.has_annotation("SENT_START"): for ref_sent in reference.sents: if len(ref_sent) == 0: continue @@ -166,7 +174,7 @@ class Corpus: self, nlp: "Language", reference_docs: Iterable[Doc] ) -> Iterator[Example]: for reference in reference_docs: - if reference.is_sentenced: + if reference.has_annotation("SENT_START"): ref_sents = [sent.as_doc() for sent in reference.sents] else: ref_sents = [reference] @@ -178,11 +186,11 @@ class Corpus: def read_docbin( self, vocab: Vocab, locs: Iterable[Union[str, Path]] ) -> Iterator[Doc]: - """ Yield training examples as example dicts """ + """Yield training examples as example dicts""" i = 0 for loc in locs: loc = util.ensure_path(loc) - if loc.parts[-1].endswith(FILE_TYPE): + if loc.parts[-1].endswith(FILE_TYPE): # type: ignore[union-attr] doc_bin = DocBin().from_disk(loc) docs = doc_bin.get_docs(vocab) for doc in docs: @@ -194,7 +202,7 @@ class Corpus: class JsonlCorpus: - """Iterate Doc objects from a file or directory of jsonl + """Iterate Example objects from a file or directory of jsonl formatted raw text files. path (Path): The directory or filename to read from. @@ -206,7 +214,7 @@ class JsonlCorpus: limit (int): Limit corpus to a subset of examples, e.g. for debugging. Defaults to 0, which indicates no limit. 
- DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus + DOCS: https://spacy.io/api/corpus#jsonlcorpus """ file_type = "jsonl" @@ -230,7 +238,7 @@ class JsonlCorpus: nlp (Language): The current nlp object. YIELDS (Example): The example objects. - DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call + DOCS: https://spacy.io/api/corpus#jsonlcorpus-call """ for loc in walk_corpus(self.path, ".jsonl"): records = srsly.read_jsonl(loc) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index bbe59e9f4..732203e7b 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj from ..tokens.token cimport MISSING_DEP -from ..util import logger +from ..util import logger, to_ternary_int cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): @@ -22,6 +22,8 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if "entities" in doc_annot: _add_entities_to_doc(output, doc_annot["entities"]) + if "spans" in doc_annot: + _add_spans_to_doc(output, doc_annot["spans"]) if array.size: output = output.from_array(attrs, array) # links are currently added with ENT_KB_ID on the token level @@ -211,18 +213,19 @@ cdef class Example: else: return [None] * len(self.x) - def get_aligned_spans_x2y(self, x_spans): - return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y) + def get_aligned_spans_x2y(self, x_spans, allow_overlap=False): + return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y, allow_overlap) - def get_aligned_spans_y2x(self, y_spans): - return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x) + def get_aligned_spans_y2x(self, y_spans, allow_overlap=False): + return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x, allow_overlap) - def _get_aligned_spans(self, doc, spans, align): + def _get_aligned_spans(self, doc, spans, align, allow_overlap): seen = set() output = [] for span in spans: indices = align[span.start : span.end].data.ravel() - indices = [idx for idx in indices if idx not in seen] + if not allow_overlap: + indices = [idx for idx in indices if idx not in seen] if len(indices) >= 1: aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label) target_text = span.text.lower().strip().replace(" ", "") @@ -232,10 +235,10 @@ cdef class Example: seen.update(indices) return output - def get_aligned_ner(self): + def get_aligned_ents_and_ner(self): if not self.y.has_annotation("ENT_IOB"): - return [None] * len(self.x) # should this be 'missing' instead of 'None' ? 
- x_ents = self.get_aligned_spans_y2x(self.y.ents) + return [], [None] * len(self.x) + x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False) # Default to 'None' for missing values x_tags = offsets_to_biluo_tags( self.x, @@ -250,6 +253,10 @@ cdef class Example: x_tags[i] = "O" elif self.x[i].is_space: x_tags[i] = "O" + return x_ents, x_tags + + def get_aligned_ner(self): + x_ents, x_tags = self.get_aligned_ents_and_ner() return x_tags def to_dict(self): @@ -314,13 +321,11 @@ def _annot2array(vocab, tok_annot, doc_annot): for key, value in doc_annot.items(): if value: - if key == "entities": + if key in ["entities", "cats", "spans"]: pass elif key == "links": ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value) tok_annot["ENT_KB_ID"] = ent_kb_ids - elif key == "cats": - pass else: raise ValueError(Errors.E974.format(obj="doc", key=key)) @@ -337,7 +342,7 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) elif key == "SENT_START": attrs.append(key) - values.append(value) + values.append([to_ternary_int(v) for v in value]) elif key == "MORPH": attrs.append(key) values.append([vocab.morphology.add(v) for v in value]) @@ -351,12 +356,37 @@ def _annot2array(vocab, tok_annot, doc_annot): return attrs, array.T +def _add_spans_to_doc(doc, spans_data): + if not isinstance(spans_data, dict): + raise ValueError(Errors.E879) + for key, span_list in spans_data.items(): + spans = [] + if not isinstance(span_list, list): + raise ValueError(Errors.E879) + for span_tuple in span_list: + if not isinstance(span_tuple, (list, tuple)) or len(span_tuple) < 2: + raise ValueError(Errors.E879) + start_char = span_tuple[0] + end_char = span_tuple[1] + label = 0 + kb_id = 0 + if len(span_tuple) > 2: + label = span_tuple[2] + if len(span_tuple) > 3: + kb_id = span_tuple[3] + span = doc.char_span(start_char, end_char, label=label, kb_id=kb_id) + spans.append(span) + doc.spans[key] = spans + + def _add_entities_to_doc(doc, ner_data): if ner_data is None: return elif ner_data == []: doc.ents = [] - elif isinstance(ner_data[0], tuple): + elif not isinstance(ner_data, (list, tuple)): + raise ValueError(Errors.E973) + elif isinstance(ner_data[0], (list, tuple)): return _add_entities_to_doc( doc, offsets_to_biluo_tags(doc, ner_data) @@ -390,12 +420,12 @@ def _fix_legacy_dict_data(example_dict): token_dict = example_dict.get("token_annotation", {}) doc_dict = example_dict.get("doc_annotation", {}) for key, value in example_dict.items(): - if value: + if value is not None: if key in ("token_annotation", "doc_annotation"): pass elif key == "ids": pass - elif key in ("cats", "links"): + elif key in ("cats", "links", "spans"): doc_dict[key] = value elif key in ("ner", "entities"): doc_dict["entities"] = value diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 327748d01..69654e2c7 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -121,7 +121,7 @@ def json_to_annotations(doc): if i == 0: sent_starts.append(1) else: - sent_starts.append(0) + sent_starts.append(-1) if "brackets" in sent: brackets.extend((b["first"] + sent_start_i, b["last"] + sent_start_i, b["label"]) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index af3979e46..96abcc7cd 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,4 +1,4 @@ -from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING +from typing import Union, Dict, 
Optional, Any, IO, TYPE_CHECKING from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from pathlib import Path @@ -8,13 +8,17 @@ import tarfile import gzip import zipfile import tqdm +from itertools import islice +import warnings +from .pretrain import get_tok2vec_ref from ..lookups import Lookups from ..vectors import Vectors -from ..errors import Errors +from ..errors import Errors, Warnings from ..schemas import ConfigSchemaTraining from ..util import registry, load_model_from_config, resolve_dot_names, logger -from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB +from ..util import load_model, ensure_path, get_sourced_components +from ..util import OOV_RANK, DEFAULT_OOV_PROB if TYPE_CHECKING: from ..language import Language # noqa: F401 @@ -33,7 +37,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) # Use original config here before it's resolved to functions - sourced_components = get_sourced_components(config) + sourced = get_sourced_components(config) nlp = load_model_from_config(raw_config, auto_fill=True) logger.info("Set up nlp object from config") config = nlp.config.interpolate() @@ -57,29 +61,42 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": # Components that shouldn't be updated during training frozen_components = T["frozen_components"] # Sourced components that require resume_training - resume_components = [p for p in sourced_components if p not in frozen_components] + resume_components = [p for p in sourced if p not in frozen_components] logger.info(f"Pipeline: {nlp.pipe_names}") if resume_components: with nlp.select_pipes(enable=resume_components): logger.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) + # Make sure that listeners are defined before initializing further + nlp._link_components() with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) + if T["max_epochs"] == -1: + sample_size = 100 + logger.debug( + f"Due to streamed train corpus, using only first {sample_size} " + f"examples for initialization. If necessary, provide all labels " + f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + ) + nlp.initialize( + lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer + ) + else: + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) logger.info(f"Initialized pipeline components: {nlp.pipe_names}") # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: - if getattr(proc, "listening_components", None): - for listener in proc.listening_components: - if listener in frozen_components and name not in frozen_components: - logger.warn(f"Component '{name}' will be (re)trained, but the " - f"'{listener}' depends on it and is frozen. This means " - f"that the performance of the '{listener}' will be degraded. " - f"You should either freeze both, or neither of the two.") - - if listener not in frozen_components and name in frozen_components: - logger.warn(f"Component '{listener}' will be (re)trained, but it needs the " - f"'{name}' which is frozen. " - f"You should either freeze both, or neither of the two.") + for listener in getattr( + proc, "listening_components", [] + ): # e.g. 
tok2vec/transformer + # Don't warn about components not in the pipeline + if listener not in nlp.pipe_names: + continue + if listener in frozen_components and name not in frozen_components: + logger.warning(Warnings.W087.format(name=name, listener=listener)) + # We always check this regardless, in case user freezes tok2vec + if listener not in frozen_components and name in frozen_components: + if name not in T["annotating_components"]: + logger.warning(Warnings.W086.format(name=name, listener=listener)) return nlp @@ -89,7 +106,7 @@ def init_vocab( data: Optional[Path] = None, lookups: Optional[Lookups] = None, vectors: Optional[str] = None, -) -> "Language": +) -> None: if lookups: nlp.vocab.lookups = lookups logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") @@ -113,6 +130,12 @@ def init_vocab( if vectors is not None: load_vectors_into_model(nlp, vectors) logger.info(f"Added vectors: {vectors}") + # warn if source model vectors are not identical + sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) + vectors_hash = hash(nlp.vocab.vectors.to_bytes()) + for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): + if vectors_hash != sourced_vectors_hash: + warnings.warn(Warnings.W113.format(name=sourced_component)) logger.info("Finished initializing nlp object") @@ -121,7 +144,12 @@ def load_vectors_into_model( ) -> None: """Load word vectors from an installed model or path into a model instance.""" try: - vectors_nlp = load_model(name) + # Load with the same vocab, which automatically adds the vectors to + # the current nlp object. Exclude lookups so they are not modified. + exclude = ["lookups"] + if not add_strings: + exclude.append("strings") + vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude) except ConfigValidationError as e: title = f"Config validation error for vectors {name}" desc = ( @@ -131,13 +159,12 @@ def load_vectors_into_model( ) err = ConfigValidationError.from_error(e, title=title, desc=desc) raise err from None - nlp.vocab.vectors = vectors_nlp.vocab.vectors - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. - for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) + + if len(vectors_nlp.vocab.vectors.keys()) == 0: + logger.warning(Warnings.W112.format(name=name)) + + for lex in nlp.vocab: + lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) # type: ignore[attr-defined] def init_tok2vec( @@ -149,10 +176,6 @@ def init_tok2vec( weights_data = None init_tok2vec = ensure_path(I["init_tok2vec"]) if init_tok2vec is not None: - if P["objective"].get("type") == "vectors" and not I["vectors"]: - err = 'need initialize.vectors if pretraining.objective.type is "vectors"' - errors = [{"loc": ["initialize"], "msg": err}] - raise ConfigValidationError(config=nlp.config, errors=errors) if not init_tok2vec.exists(): err = f"can't find pretrained tok2vec: {init_tok2vec}" errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}] @@ -160,37 +183,13 @@ def init_tok2vec( with init_tok2vec.open("rb") as file_: weights_data = file_.read() if weights_data is not None: - tok2vec_component = P["component"] - if tok2vec_component is None: - desc = ( - f"To use pretrained tok2vec weights, [pretraining.component] " - f"needs to specify the component that should load them." 
- ) - err = "component can't be null" - errors = [{"loc": ["pretraining", "component"], "msg": err}] - raise ConfigValidationError( - config=nlp.config["pretraining"], errors=errors, desc=desc - ) - layer = nlp.get_pipe(tok2vec_component).model - if P["layer"]: - layer = layer.get_ref(P["layer"]) + layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) + logger.info(f"Loaded pretrained weights from {init_tok2vec}") return True return False -def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: - """RETURNS (List[str]): All sourced components in the original config, - e.g. {"source": "en_core_web_sm"}. If the config contains a key - "factory", we assume it refers to a component factory. - """ - return [ - name - for name, cfg in config.get("components", {}).items() - if "factory" not in cfg and "source" in cfg - ] - - def convert_vectors( nlp: "Language", vectors_loc: Optional[Path], @@ -204,7 +203,7 @@ def convert_vectors( nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) for lex in nlp.vocab: if lex.rank and lex.rank != OOV_RANK: - nlp.vocab.vectors.add(lex.orth, row=lex.rank) + nlp.vocab.vectors.add(lex.orth, row=lex.rank) # type: ignore[attr-defined] else: if vectors_loc: logger.info(f"Reading vectors from {vectors_loc}") @@ -229,8 +228,7 @@ def convert_vectors( def read_vectors(vectors_loc: Path, truncate_vectors: int): - f = open_file(vectors_loc) - f = ensure_shape(f) + f = ensure_shape(vectors_loc) shape = tuple(int(size) for size in next(f).split()) if truncate_vectors >= 1: shape = (truncate_vectors, shape[1]) @@ -253,23 +251,24 @@ def open_file(loc: Union[str, Path]) -> IO: """Handle .gz, .tar.gz or unzipped files""" loc = ensure_path(loc) if tarfile.is_tarfile(str(loc)): - return tarfile.open(str(loc), "r:gz") + return tarfile.open(str(loc), "r:gz") # type: ignore[return-value] elif loc.parts[-1].endswith("gz"): - return (line.decode("utf8") for line in gzip.open(str(loc), "r")) + return (line.decode("utf8") for line in gzip.open(str(loc), "r")) # type: ignore[return-value] elif loc.parts[-1].endswith("zip"): zip_file = zipfile.ZipFile(str(loc)) names = zip_file.namelist() file_ = zip_file.open(names[0]) - return (line.decode("utf8") for line in file_) + return (line.decode("utf8") for line in file_) # type: ignore[return-value] else: return loc.open("r", encoding="utf8") -def ensure_shape(lines): +def ensure_shape(vectors_loc): """Ensure that the first line of the data is the vectors shape. If it's not, we read in the data and output the shape as the first result, so that the reader doesn't have to deal with the problem. """ + lines = open_file(vectors_loc) first_line = next(lines) try: shape = tuple(int(size) for size in first_line.split()) @@ -283,7 +282,11 @@ def ensure_shape(lines): # Figure out the shape, make it the first value, and then give the # rest of the data. width = len(first_line.split()) - 1 - captured = [first_line] + list(lines) - length = len(captured) + length = 1 + for _ in lines: + length += 1 yield f"{length} {width}" - yield from captured + # Reading the lines in again from file. 
This to avoid having to + # store all the results in a list in memory + lines2 = open_file(vectors_loc) + yield from lines2 diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 0e8e7eed0..64492c2bc 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Iterable, Union, Iterator +from typing import List, Dict, Tuple, Iterable, Union, Iterator import warnings from ..errors import Errors, Warnings @@ -6,7 +6,7 @@ from ..tokens import Span, Doc def iob_to_biluo(tags: Iterable[str]) -> List[str]: - out = [] + out: List[str] = [] tags = list(tags) while tags: out.extend(_consume_os(tags)) @@ -71,6 +71,8 @@ def offsets_to_biluo_tags( entities (iterable): A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. + missing (str): The label used for missing values, e.g. if tokenization + doesn’t align with the entity offsets. Defaults to "O". RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". The missing label is used where the @@ -88,7 +90,7 @@ def offsets_to_biluo_tags( >>> assert tags == ["O", "O", 'U-LOC', "O"] """ # Ensure no overlapping entity labels exist - tokens_in_ents = {} + tokens_in_ents: Dict[int, Tuple[int, int, Union[str, int]]] = {} starts = {token.idx: token.i for token in doc} ends = {token.idx + len(token): token.i for token in doc} biluo = ["-" for _ in doc] @@ -150,7 +152,7 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: to overwrite the doc.ents. doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one + tags (iterable): A sequence of BILUO tags with each tag describing one token. Each tag string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". RETURNS (list): A sequence of Span objects. Each token with a missing IOB @@ -170,7 +172,7 @@ def biluo_tags_to_offsets( """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one + tags (iterable): A sequence of BILUO tags with each tag describing one token. Each tags string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". RETURNS (list): A sequence of `(start, end, label)` triples. 
`start` and @@ -197,14 +199,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]: pass elif tag.startswith("I"): if start is None: - raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) + raise ValueError( + Errors.E067.format(start="I", tags=list(tags)[: i + 1]) + ) elif tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i elif tag.startswith("L"): if start is None: - raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1])) + raise ValueError( + Errors.E067.format(start="L", tags=list(tags)[: i + 1]) + ) entities.append((tag[2:], start, i)) start = None else: diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 79459a89b..602e0ff3e 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -29,7 +29,7 @@ def console_logger(progress_bar: bool = False): def setup_printer( nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: - write = lambda text: stdout.write(f"{text}\n") + write = lambda text: print(text, file=stdout, flush=True) msg = Printer(no_print=True) # ensure that only trainable components are logged logged_pipes = [ @@ -101,9 +101,20 @@ def console_logger(progress_bar: bool = False): return setup_printer -@registry.loggers("spacy.WandbLogger.v1") -def wandb_logger(project_name: str, remove_config_values: List[str] = []): - import wandb +@registry.loggers("spacy.WandbLogger.v2") +def wandb_logger_v2( + project_name: str, + remove_config_values: List[str] = [], + model_log_interval: Optional[int] = None, + log_dataset_dir: Optional[str] = None, +): + try: + import wandb + + # test that these are available + from wandb import init, log, join # noqa: F401 + except ImportError: + raise ImportError(Errors.E880) console = console_logger(progress_bar=False) @@ -115,9 +126,23 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []): for field in remove_config_values: del config_dot[field] config = util.dot_to_dict(config_dot) - wandb.init(project=project_name, config=config, reinit=True) + run = wandb.init(project=project_name, config=config, reinit=True) console_log_step, console_finalize = console(nlp, stdout, stderr) + def log_dir_artifact( + path: str, + name: str, + type: str, + metadata: Optional[Dict[str, Any]] = {}, + aliases: Optional[List[str]] = [], + ): + dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata) + dataset_artifact.add_dir(path, name=name) + wandb.log_artifact(dataset_artifact, aliases=aliases) + + if log_dataset_dir: + log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") + def log_step(info: Optional[Dict[str, Any]]): console_log_step(info) if info is not None: @@ -129,6 +154,107 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []): wandb.log({f"loss_{k}": v for k, v in losses.items()}) if isinstance(other_scores, dict): wandb.log(other_scores) + if model_log_interval and info.get("output_path"): + if info["step"] % model_log_interval == 0 and info["step"] != 0: + log_dir_artifact( + path=info["output_path"], + name="pipeline_" + run.id, + type="checkpoint", + metadata=info, + aliases=[ + f"epoch {info['epoch']} step {info['step']}", + "latest", + "best" + if info["score"] == max(info["checkpoints"])[0] + else "", + ], + ) + + def finalize() -> None: + console_finalize() + wandb.join() + + return log_step, finalize + + return setup_logger + + 
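For readers skimming this hunk: both WandbLogger versions follow the same logger contract — a function registered via `@registry.loggers` returns a `setup_logger(nlp, stdout, stderr)` callable, which in turn returns a `(log_step, finalize)` pair that the training loop drives. The sketch below only illustrates that contract and is not part of this changeset; the registry name `my.MinimalLogger.v1` and the printed fields are hypothetical, though the `info` keys mirror the ones the WandB loggers read above.

```python
# Minimal sketch of the logger contract used by WandbLogger.v2/v3 above.
# Not part of this diff; "my.MinimalLogger.v1" is a hypothetical name.
import sys
from typing import IO, Any, Callable, Dict, Optional, Tuple

from spacy import registry


@registry.loggers("my.MinimalLogger.v1")
def minimal_logger():
    def setup_logger(
        nlp, stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
        def log_step(info: Optional[Dict[str, Any]]) -> None:
            # info is None on non-evaluation steps; otherwise it carries
            # "step", "score", "losses" and "other_scores" as used above.
            if info is not None:
                print(f"step={info['step']} score={info['score']}", file=stdout)

        def finalize() -> None:
            # Flush or close any resources; the WandB loggers call wandb.join() here.
            pass

        return log_step, finalize

    return setup_logger
```

A logger like this would then be selected in the training config under `[training.logger]` with `@loggers = "my.MinimalLogger.v1"`, the same way `spacy.WandbLogger.v3` is referenced.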
+@registry.loggers("spacy.WandbLogger.v3") +def wandb_logger_v3( + project_name: str, + remove_config_values: List[str] = [], + model_log_interval: Optional[int] = None, + log_dataset_dir: Optional[str] = None, + entity: Optional[str] = None, + run_name: Optional[str] = None, +): + try: + import wandb + + # test that these are available + from wandb import init, log, join # noqa: F401 + except ImportError: + raise ImportError(Errors.E880) + + console = console_logger(progress_bar=False) + + def setup_logger( + nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr + ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: + config = nlp.config.interpolate() + config_dot = util.dict_to_dot(config) + for field in remove_config_values: + del config_dot[field] + config = util.dot_to_dict(config_dot) + run = wandb.init( + project=project_name, config=config, entity=entity, reinit=True + ) + + if run_name: + wandb.run.name = run_name + + console_log_step, console_finalize = console(nlp, stdout, stderr) + + def log_dir_artifact( + path: str, + name: str, + type: str, + metadata: Optional[Dict[str, Any]] = {}, + aliases: Optional[List[str]] = [], + ): + dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata) + dataset_artifact.add_dir(path, name=name) + wandb.log_artifact(dataset_artifact, aliases=aliases) + + if log_dataset_dir: + log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") + + def log_step(info: Optional[Dict[str, Any]]): + console_log_step(info) + if info is not None: + score = info["score"] + other_scores = info["other_scores"] + losses = info["losses"] + wandb.log({"score": score}) + if losses: + wandb.log({f"loss_{k}": v for k, v in losses.items()}) + if isinstance(other_scores, dict): + wandb.log(other_scores) + if model_log_interval and info.get("output_path"): + if info["step"] % model_log_interval == 0 and info["step"] != 0: + log_dir_artifact( + path=info["output_path"], + name="pipeline_" + run.id, + type="checkpoint", + metadata=info, + aliases=[ + f"epoch {info['epoch']} step {info['step']}", + "latest", + "best" + if info["score"] == max(info["checkpoints"])[0] + else "", + ], + ) def finalize() -> None: console_finalize() diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fe2d4b18f..06372cbb0 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -1,4 +1,4 @@ -from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO +from typing import List, Callable, Tuple, Dict, Iterable, Union, Any, IO from typing import Optional, TYPE_CHECKING from pathlib import Path from timeit import default_timer as timer @@ -32,7 +32,7 @@ def train( """Train a pipeline. nlp (Language): The initialized nlp object with the full config. - output_path (Path): Optional output path to save trained model to. + output_path (Optional[Path]): Optional output path to save trained model to. use_gpu (int): Whether to train on GPU. Make sure to call require_gpu before calling this function. stdout (file): A file-like object to write output messages. To disable @@ -74,11 +74,13 @@ def train( # Components that shouldn't be updated during training frozen_components = T["frozen_components"] + # Components that should set annotations on update + annotating_components = T["annotating_components"] # Create iterator, which yields out info after each optimization step. 
training_step_iterator = train_while_improving( nlp, optimizer, - create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]), + create_train_batches(nlp, train_corpus, batcher, T["max_epochs"]), create_evaluation_callback(nlp, dev_corpus, score_weights), dropout=T["dropout"], accumulate_gradient=T["accumulate_gradient"], @@ -86,21 +88,28 @@ max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], exclude=frozen_components, + annotating_components=annotating_components, ) clean_output_dir(output_path) stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n") if frozen_components: stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") + if annotating_components: + stdout.write( + msg.info(f"Set annotations on update for: {annotating_components}") + "\n" + ) stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): log_step, finalize_logger = train_logger(nlp, stdout, stderr) try: for batch, info, is_best_checkpoint in training_step_iterator: - log_step(info if is_best_checkpoint is not None else None) - if is_best_checkpoint is not None and output_path is not None: + if is_best_checkpoint is not None: with nlp.select_pipes(disable=frozen_components): update_meta(T, nlp, info) - save_checkpoint(is_best_checkpoint) + if output_path is not None: + save_checkpoint(is_best_checkpoint) + info["output_path"] = str(output_path / DIR_MODEL_LAST) + log_step(info if is_best_checkpoint is not None else None) except Exception as e: if output_path is not None: stdout.write( @@ -113,7 +122,8 @@ raise e finally: finalize_logger() - save_checkpoint(False) + if output_path is not None: + save_checkpoint(False) # This will only run if we didn't hit an error if optimizer.averages: nlp.use_params(optimizer.averages) @@ -139,6 +149,7 @@ def train_while_improving( patience: int, max_steps: int, exclude: List[str], + annotating_components: List[str], ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -183,14 +194,19 @@ else: dropouts = dropout results = [] - losses = {} + losses: Dict[str, float] = {} words_seen = 0 start_time = timer() for step, (epoch, batch) in enumerate(train_data): - dropout = next(dropouts) + dropout = next(dropouts) # type: ignore for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + subbatch, + drop=dropout, + losses=losses, + sgd=False, # type: ignore[arg-type] + exclude=exclude, + annotates=annotating_components, ) # TODO: refactor this so we don't have to run it separately in here for name, proc in nlp.pipeline: if ( name not in exclude and hasattr(proc, "is_trainable") and proc.is_trainable - and proc.model not in (True, False, None) + and proc.model not in (True, False, None) # type: ignore[attr-defined] ): - proc.finish_update(optimizer) + proc.finish_update(optimizer) # type: ignore[attr-defined] optimizer.step_schedules() if not (step % eval_frequency): if optimizer.averages: @@ -228,7 +244,10 @@ if is_best_checkpoint is not None: losses = {} # Stop if no improvement in `patience` updates (if specified) - best_score, best_step = max(results) + # Negate step value so that the earliest best step is chosen for the + # same score, i.e. 
(1.0, 100) is chosen over (1.0, 200) + best_result = max((r_score, -r_step) for r_score, r_step in results) + best_step = -best_result[1] if patience and (step - best_step) >= patience: break # Stop if we've exhausted our max steps (if specified) @@ -257,6 +276,7 @@ def create_evaluation_callback( weights = {key: value for key, value in weights.items() if value is not None} def evaluate() -> Tuple[float, Dict[str, float]]: + nonlocal weights try: scores = nlp.evaluate(dev_corpus(nlp)) except KeyError as e: @@ -264,6 +284,8 @@ def create_evaluation_callback( # Calculate a weighted sum based on score_weights for the main score. # We can only consider scores that are ints/floats, not dicts like # entity scores per type etc. + scores = {key: value for key, value in scores.items() if value is not None} + weights = {key: value for key, value in weights.items() if key in scores} for key, value in scores.items(): if key in weights and not isinstance(value, (int, float)): raise ValueError(Errors.E915.format(name=key, score_type=type(value))) @@ -281,17 +303,22 @@ def create_evaluation_callback( def create_train_batches( - iterator: Iterator[Example], + nlp: "Language", + corpus: Callable[["Language"], Iterable[Example]], batcher: Callable[[Iterable[Example]], Iterable[Example]], max_epochs: int, ): epoch = 0 - examples = list(iterator) - if not examples: - # Raise error if no data - raise ValueError(Errors.E986) + if max_epochs >= 0: + examples = list(corpus(nlp)) # type: Iterable[Example] + if not examples: + # Raise error if no data + raise ValueError(Errors.E986) while max_epochs < 1 or epoch != max_epochs: - random.shuffle(examples) + if max_epochs >= 0: + random.shuffle(examples) # type: ignore + else: + examples = corpus(nlp) for batch in batcher(examples): yield epoch, batch epoch += 1 @@ -326,7 +353,7 @@ def create_before_to_disk_callback( return before_to_disk -def clean_output_dir(path: Union[str, Path]) -> None: +def clean_output_dir(path: Optional[Path]) -> None: """Remove an existing output directory. Typically used to ensure that that a directory like model-best and its contents aren't just being overwritten by nlp.to_disk, which could preserve existing subdirectories (e.g. diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 152d849e9..2328ebbc7 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -6,9 +6,12 @@ from collections import Counter import srsly import time import re + +from thinc.config import ConfigValidationError from wasabi import Printer from .example import Example +from ..errors import Errors from ..tokens import Doc from ..schemas import ConfigSchemaPretrain from ..util import registry, load_model_from_config, dot_to_object @@ -38,10 +41,11 @@ def pretrain( optimizer = P["optimizer"] # Load in pretrained weights to resume from if resume_path is not None: - _resume_model(model, resume_path, epoch_resume, silent=silent) + epoch_resume = _resume_model(model, resume_path, epoch_resume, silent=silent) else: # Without '--resume-path' the '--epoch-resume' argument is ignored epoch_resume = 0 + objective = model.attrs["loss"] # TODO: move this to logger function? 
tracker = ProgressTracker(frequency=10000) @@ -89,21 +93,26 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]: def _resume_model( - model: Model, resume_path: Path, epoch_resume: int, silent: bool = True -) -> None: + model: Model, resume_path: Path, epoch_resume: Optional[int], silent: bool = True +) -> int: msg = Printer(no_print=silent) msg.info(f"Resume training tok2vec from: {resume_path}") with resume_path.open("rb") as file_: weights_data = file_.read() model.get_ref("tok2vec").from_bytes(weights_data) - # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(resume_path)) - if model_name: - # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 - msg.info(f"Resuming from epoch: {epoch_resume}") - else: - msg.info(f"Resuming from epoch: {epoch_resume}") + + if epoch_resume is None: + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(resume_path)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + else: + # No epoch given and couldn't infer it + raise ValueError(Errors.E1020) + + msg.info(f"Resuming from epoch: {epoch_resume}") + return epoch_resume def make_update( @@ -133,12 +142,21 @@ def create_pretraining_model(nlp, pretrain_config): The actual tok2vec layer is stored as a reference, and only this bit will be serialized to file and read back in when calling the 'train' command. """ - nlp.initialize() - component = nlp.get_pipe(pretrain_config["component"]) - if pretrain_config.get("layer"): - tok2vec = component.model.get_ref(pretrain_config["layer"]) - else: - tok2vec = component.model + with nlp.select_pipes(enable=[]): + nlp.initialize() + tok2vec = get_tok2vec_ref(nlp, pretrain_config) + # If the config referred to a Tok2VecListener, grab the original model instead + if type(tok2vec).__name__ == "Tok2VecListener": + original_tok2vec = ( + tok2vec.upstream_name if tok2vec.upstream_name != "*" else "tok2vec" + ) + tok2vec = nlp.get_pipe(original_tok2vec).model + try: + tok2vec.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + except ValueError: + component = pretrain_config["component"] + layer = pretrain_config["layer"] + raise ValueError(Errors.E874.format(component=component, layer=layer)) create_function = pretrain_config["objective"] model = create_function(nlp.vocab, tok2vec) @@ -147,6 +165,24 @@ def create_pretraining_model(nlp, pretrain_config): return model +def get_tok2vec_ref(nlp, pretrain_config): + tok2vec_component = pretrain_config["component"] + if tok2vec_component is None: + desc = ( + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them." 
+ ) + err = "component can't be null" + errors = [{"loc": ["pretraining", "component"], "msg": err}] + raise ConfigValidationError( + config=nlp.config["pretraining"], errors=errors, desc=desc + ) + layer = nlp.get_pipe(tok2vec_component).model + if pretrain_config["layer"]: + layer = layer.get_ref(pretrain_config["layer"]) + return layer + + class ProgressTracker: def __init__(self, frequency=1000000): self.loss = 0.0 diff --git a/spacy/ty.py b/spacy/ty.py new file mode 100644 index 000000000..8f2903d78 --- /dev/null +++ b/spacy/ty.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING +from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List +from .compat import Protocol, runtime_checkable + +from thinc.api import Optimizer, Model + +if TYPE_CHECKING: + from .training import Example + + +@runtime_checkable +class TrainableComponent(Protocol): + model: Any + is_trainable: bool + + def update( + self, + examples: Iterable["Example"], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None + ) -> Dict[str, float]: + ... + + def finish_update(self, sgd: Optimizer) -> None: + ... + + +@runtime_checkable +class InitializableComponent(Protocol): + def initialize( + self, + get_examples: Callable[[], Iterable["Example"]], + nlp: Iterable["Example"], + **kwargs: Any + ): + ... + + +@runtime_checkable +class ListenedToComponent(Protocol): + model: Any + listeners: Sequence[Model] + listener_map: Dict[str, Sequence[Model]] + listening_components: List[str] + + def add_listener(self, listener: Model, component_name: str) -> None: + ... + + def remove_listener(self, listener: Model, component_name: str) -> bool: + ... + + def find_listeners(self, component) -> None: + ... diff --git a/spacy/util.py b/spacy/util.py index 77aa712d1..cf62a4ecd 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,4 +1,5 @@ -from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple +from typing import List, Mapping, NoReturn, Union, Dict, Any, Set +from typing import Optional, Iterable, Callable, Tuple, Type from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING from types import ModuleType import os @@ -8,7 +9,7 @@ import re from pathlib import Path import thinc from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer -from thinc.api import ConfigValidationError +from thinc.api import ConfigValidationError, Model import functools import itertools import numpy.random @@ -20,8 +21,10 @@ import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion +from packaging.requirements import Requirement import subprocess from contextlib import contextmanager +from collections import defaultdict import tempfile import shutil import shlex @@ -33,11 +36,6 @@ try: except ImportError: cupy = None -try: # Python 3.8 - import importlib.metadata as importlib_metadata -except ImportError: - import importlib_metadata - # These are functions that were previously (v2.x) available from spacy.util # and have since moved to Thinc. We're importing them here so people's code # doesn't break, but they should always be imported from Thinc from now on, @@ -46,31 +44,34 @@ from thinc.api import fix_random_seed, compounding, decaying # noqa: F401 from .symbols import ORTH -from .compat import cupy, CudaStream, is_windows +from .compat import cupy, CudaStream, is_windows, importlib_metadata from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS from . 
import about if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .language import Language # noqa: F401 + from .pipeline import Pipe # noqa: F401 from .tokens import Doc, Span # noqa: F401 from .vocab import Vocab # noqa: F401 +# fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. -# fmt: off CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] # fmt: on logger = logging.getLogger("spacy") logger_stream_handler = logging.StreamHandler() -logger_stream_handler.setFormatter(logging.Formatter("%(message)s")) +logger_stream_handler.setFormatter( + logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") +) logger.addHandler(logger_stream_handler) @@ -88,7 +89,7 @@ class registry(thinc.registry): displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) misc = catalogue.create("spacy", "misc", entry_points=True) # Callback functions used to manipulate nlp object etc. - callbacks = catalogue.create("spacy", "callbacks") + callbacks = catalogue.create("spacy", "callbacks", entry_points=True) batchers = catalogue.create("spacy", "batchers", entry_points=True) readers = catalogue.create("spacy", "readers", entry_points=True) augmenters = catalogue.create("spacy", "augmenters", entry_points=True) @@ -141,6 +142,32 @@ class registry(thinc.registry): ) from None return func + @classmethod + def find(cls, registry_name: str, func_name: str) -> Callable: + """Get info about a registered function from the registry.""" + # We're overwriting this classmethod so we're able to provide more + # specific error messages and implement a fallback to spacy-legacy. + if not hasattr(cls, registry_name): + names = ", ".join(cls.get_registry_names()) or "none" + raise RegistryError(Errors.E892.format(name=registry_name, available=names)) + reg = getattr(cls, registry_name) + try: + func_info = reg.find(func_name) + except RegistryError: + if func_name.startswith("spacy."): + legacy_name = func_name.replace("spacy.", "spacy-legacy.") + try: + return reg.find(legacy_name) + except catalogue.RegistryError: + pass + available = ", ".join(sorted(reg.get_all().keys())) or "none" + raise RegistryError( + Errors.E893.format( + name=func_name, reg_name=registry_name, available=available + ) + ) from None + return func_info + @classmethod def has(cls, registry_name: str, func_name: str) -> bool: """Check whether a function is available in a registry.""" @@ -230,7 +257,7 @@ def lang_class_is_loaded(lang: str) -> bool: return lang in registry.languages -def get_lang_class(lang: str) -> "Language": +def get_lang_class(lang: str) -> Type["Language"]: """Import and load a Language class. lang (str): Two-letter language code, e.g. 'en'. 
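The `registry.find` override added here retries a missing `spacy.`-prefixed name under the `spacy-legacy.` prefix before raising. A minimal sketch of that fallback, using a plain dict in place of the real `catalogue` registry; the helper name and the registered entries below are made up for illustration:

```python
def find_with_legacy_fallback(available: dict, func_name: str):
    """Look up func_name, retrying under the spacy-legacy. prefix if missing."""
    if func_name in available:
        return available[func_name]
    if func_name.startswith("spacy."):
        legacy_name = func_name.replace("spacy.", "spacy-legacy.")
        if legacy_name in available:
            return available[legacy_name]
    known = ", ".join(sorted(available)) or "none"
    raise KeyError(f"{func_name} not registered; available: {known}")


# A config that still refers to "spacy.Tok2Vec.v1" resolves to the legacy entry.
funcs = {"spacy-legacy.Tok2Vec.v1": lambda: "legacy tok2vec"}
assert find_with_legacy_fallback(funcs, "spacy.Tok2Vec.v1")() == "legacy tok2vec"
```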
@@ -244,7 +271,7 @@ def get_lang_class(lang: str) -> "Language": module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: raise ImportError(Errors.E048.format(lang=lang, err=err)) from err - set_lang_class(lang, getattr(module, module.__all__[0])) + set_lang_class(lang, getattr(module, module.__all__[0])) # type: ignore[attr-defined] return registry.languages.get(lang) @@ -319,13 +346,13 @@ def load_model( if name.startswith("blank:"): # shortcut for blank model return get_lang_class(name.replace("blank:", ""))() if is_package(name): # installed as package - return load_model_from_package(name, **kwargs) + return load_model_from_package(name, **kwargs) # type: ignore[arg-type] if Path(name).exists(): # path to model data directory - return load_model_from_path(Path(name), **kwargs) + return load_model_from_path(Path(name), **kwargs) # type: ignore[arg-type] elif hasattr(name, "exists"): # Path or Path-like to model data - return load_model_from_path(name, **kwargs) + return load_model_from_path(name, **kwargs) # type: ignore[arg-type] if name in OLD_MODEL_SHORTCUTS: - raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) + raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) # type: ignore[index] raise IOError(Errors.E050.format(name=name)) @@ -352,11 +379,11 @@ def load_model_from_package( RETURNS (Language): The loaded nlp object. """ cls = importlib.import_module(name) - return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) + return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) # type: ignore[attr-defined] def load_model_from_path( - model_path: Union[str, Path], + model_path: Path, *, meta: Optional[Dict[str, Any]] = None, vocab: Union["Vocab", bool] = True, @@ -367,7 +394,7 @@ def load_model_from_path( """Load a model from a data directory path. Creates Language class with pipeline from config.cfg and then calls from_disk() with path. - name (str): Package name or model path. + model_path (Path): Mmodel path. meta (Dict[str, Any]): Optional model meta. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. @@ -385,9 +412,10 @@ def load_model_from_path( if not meta: meta = get_model_meta(model_path) config_path = model_path / "config.cfg" - config = load_config(config_path, overrides=dict_to_dot(config)) + overrides = dict_to_dot(config) + config = load_config(config_path, overrides=overrides) nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude) - return nlp.from_disk(model_path, exclude=exclude) + return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) def load_model_from_config( @@ -434,7 +462,23 @@ def load_model_from_config( return nlp -def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[Any]: +def get_sourced_components( + config: Union[Dict[str, Any], Config] +) -> Dict[str, Dict[str, Any]]: + """RETURNS (List[str]): All sourced components in the original config, + e.g. {"source": "en_core_web_sm"}. If the config contains a key + "factory", we assume it refers to a component factory. + """ + return { + name: cfg + for name, cfg in config.get("components", {}).items() + if "factory" not in cfg and "source" in cfg + } + + +def resolve_dot_names( + config: Config, dot_names: List[Optional[str]] +) -> Tuple[Any, ...]: """Resolve one or more "dot notation" names, e.g. corpora.train. 
The paths could point anywhere into the config, so we don't know which top-level section we'll be looking within. @@ -444,7 +488,7 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A """ # TODO: include schema? resolved = {} - output = [] + output: List[Any] = [] errors = [] for name in dot_names: if name is None: @@ -460,7 +504,7 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A result = registry.resolve(config[section]) resolved[section] = result try: - output.append(dot_to_object(resolved, name)) + output.append(dot_to_object(resolved, name)) # type: ignore[arg-type] except KeyError: msg = f"not a valid section reference: {name}" errors.append({"loc": name.split("."), "msg": msg}) @@ -564,8 +608,8 @@ def get_package_version(name: str) -> Optional[str]: RETURNS (str / None): The version or None if package not installed. """ try: - return importlib_metadata.version(name) - except importlib_metadata.PackageNotFoundError: + return importlib_metadata.version(name) # type: ignore[attr-defined] + except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined] return None @@ -588,7 +632,7 @@ def is_compatible_version( constraint = f"=={constraint}" try: spec = SpecifierSet(constraint) - version = Version(version) + version = Version(version) # type: ignore[assignment] except (InvalidSpecifier, InvalidVersion): return None spec.prereleases = prereleases @@ -622,13 +666,30 @@ def is_unconstrained_version( return True -def get_model_version_range(spacy_version: str) -> str: - """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy - version. Models are always compatible across patch versions but not - across minor or major versions. +def split_requirement(requirement: str) -> Tuple[str, str]: + """Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3").""" + req = Requirement(requirement) + return (req.name, str(req.specifier)) + + +def get_minor_version_range(version: str) -> str: + """Generate a version range like >=1.2.3,<1.3.0 based on a given version + (e.g. of spaCy). 
""" - release = Version(spacy_version).release - return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" + release = Version(version).release + return f">={version},<{release[0]}.{release[1] + 1}.0" + + +def get_model_lower_version(constraint: str) -> Optional[str]: + """From a version range like >=1.2.3,<1.3.0 return the lower pin.""" + try: + specset = SpecifierSet(constraint) + for spec in specset: + if spec.operator in (">=", "==", "~="): + return spec.version + except Exception: + pass + return None def get_base_version(version: str) -> str: @@ -684,10 +745,18 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: raise ValueError(Errors.E054.format(setting=setting)) if "spacy_version" in meta: if not is_compatible_version(about.__version__, meta["spacy_version"]): + lower_version = get_model_lower_version(meta["spacy_version"]) + lower_version = get_minor_version(lower_version) # type: ignore[arg-type] + if lower_version is not None: + lower_version = "v" + lower_version + elif "spacy_git_version" in meta: + lower_version = "git commit " + meta["spacy_git_version"] + else: + lower_version = "version unknown" warn_msg = Warnings.W095.format( model=f"{meta['lang']}_{meta['name']}", model_version=meta["version"], - version=meta["spacy_version"], + version=lower_version, current=about.__version__, ) warnings.warn(warn_msg) @@ -696,7 +765,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: model=f"{meta['lang']}_{meta['name']}", model_version=meta["version"], version=meta["spacy_version"], - example=get_model_version_range(about.__version__), + example=get_minor_version_range(about.__version__), ) warnings.warn(warn_msg) return meta @@ -719,7 +788,7 @@ def is_package(name: str) -> bool: RETURNS (bool): True if installed package, False if not. """ try: - importlib_metadata.distribution(name) + importlib_metadata.distribution(name) # type: ignore[attr-defined] return True except: # noqa: E722 return False @@ -738,6 +807,24 @@ def get_package_path(name: str) -> Path: return Path(pkg.__file__).parent +def replace_model_node(model: Model, target: Model, replacement: Model) -> None: + """Replace a node within a model with a new one, updating refs. + + model (Model): The parent model. + target (Model): The target node. + replacement (Model): The node to replace the target with. + """ + # Place the node into the sublayers + for node in model.walk(): + if target in node.layers: + node.layers[node.layers.index(target)] = replacement + # Now fix any node references + for node in model.walk(): + for ref_name in node.ref_names: + if node.maybe_get_ref(ref_name) is target: + node.set_ref(ref_name, replacement) + + def split_command(command: str) -> List[str]: """Split a string command using shlex. Handles platform compatibility. @@ -762,7 +849,7 @@ def run_command( *, stdin: Optional[Any] = None, capture: bool = False, -) -> Optional[subprocess.CompletedProcess]: +) -> subprocess.CompletedProcess: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -771,7 +858,7 @@ def run_command( stdin (Optional[Any]): stdin to read from or None. capture (bool): Whether to capture the output and errors. If False, the stdout and stderr will not be redirected, and if there's an error, - sys.exit will be called with the returncode. You should use capture=False + sys.exit will be called with the return code. 
You should use capture=False when you want to turn over execution to the command, and capture=True when you want to run the command more like a function. RETURNS (Optional[CompletedProcess]): The process object. @@ -805,8 +892,8 @@ def run_command( message += f"\n\nProcess log (stdout and stderr):\n\n" message += ret.stdout error = subprocess.SubprocessError(message) - error.ret = ret - error.command = cmd_str + error.ret = ret # type: ignore[attr-defined] + error.command = cmd_str # type: ignore[attr-defined] raise error elif ret.returncode != 0: sys.exit(ret.returncode) @@ -814,7 +901,7 @@ def run_command( @contextmanager -def working_dir(path: Union[str, Path]) -> None: +def working_dir(path: Union[str, Path]) -> Iterator[Path]: """Change current working directory and returns to previous on exit. path (str / Path): The directory to navigate to. @@ -862,7 +949,7 @@ def is_in_jupyter() -> bool: """ # https://stackoverflow.com/a/39662359/6400719 try: - shell = get_ipython().__class__.__name__ + shell = get_ipython().__class__.__name__ # type: ignore[name-defined] if shell == "ZMQInteractiveShell": return True # Jupyter notebook or qtconsole except NameError: @@ -898,6 +985,8 @@ def is_same_func(func1: Callable, func2: Callable) -> bool: """ if not callable(func1) or not callable(func2): return False + if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"): + return False same_name = func1.__qualname__ == func2.__qualname__ same_file = inspect.getfile(func1) == inspect.getfile(func2) same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2) @@ -942,7 +1031,7 @@ def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern: spacy.lang.punctuation.TOKENIZER_PREFIXES. RETURNS (Pattern): The regex object. to be used for Tokenizer.prefix_search. """ - expression = "|".join(["^" + piece for piece in entries if piece.strip()]) + expression = "|".join(["^" + piece for piece in entries if piece.strip()]) # type: ignore[operator, union-attr] return re.compile(expression) @@ -953,7 +1042,7 @@ def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern: spacy.lang.punctuation.TOKENIZER_SUFFIXES. RETURNS (Pattern): The regex object. to be used for Tokenizer.suffix_search. """ - expression = "|".join([piece + "$" for piece in entries if piece.strip()]) + expression = "|".join([piece + "$" for piece in entries if piece.strip()]) # type: ignore[operator, union-attr] return re.compile(expression) @@ -964,7 +1053,7 @@ def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern: spacy.lang.punctuation.TOKENIZER_INFIXES. RETURNS (regex object): The regex object. to be used for Tokenizer.infix_finditer. 
""" - expression = "|".join([piece for piece in entries if piece.strip()]) + expression = "|".join([piece for piece in entries if piece.strip()]) # type: ignore[misc, union-attr] return re.compile(expression) @@ -986,7 +1075,7 @@ def _get_attr_unless_lookup( ) -> Any: for lookup in lookups: if string in lookup: - return lookup[string] + return lookup[string] # type: ignore[index] return default_func(string) @@ -1068,7 +1157,7 @@ def filter_spans(spans: Iterable["Span"]) -> List["Span"]: get_sort_key = lambda span: (span.end - span.start, -span.start) sorted_spans = sorted(spans, key=get_sort_key, reverse=True) result = [] - seen_tokens = set() + seen_tokens: Set[int] = set() for span in sorted_spans: # Check for end - 1 here because boundaries are inclusive if span.start not in seen_tokens and span.end - 1 not in seen_tokens: @@ -1087,7 +1176,7 @@ def from_bytes( setters: Dict[str, Callable[[bytes], Any]], exclude: Iterable[str], ) -> None: - return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) + return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) # type: ignore[return-value] def to_dict( @@ -1149,8 +1238,8 @@ def import_file(name: str, loc: Union[str, Path]) -> ModuleType: RETURNS: The loaded module. """ spec = importlib.util.spec_from_file_location(name, str(loc)) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) + module = importlib.util.module_from_spec(spec) # type: ignore[arg-type] + spec.loader.exec_module(module) # type: ignore[union-attr] return module @@ -1240,7 +1329,7 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]: values (Dict[str, Any]): The key/value pairs to convert. RETURNS (Dict[str, dict]): The converted values. """ - result = {} + result: Dict[str, dict] = {} for key, value in values.items(): path = result parts = key.lower().split(".") @@ -1279,6 +1368,25 @@ def dot_to_object(config: Config, section: str): return component +def set_dot_to_object(config: Config, section: str, value: Any) -> None: + """Update a config at a given position from a dot notation. + + config (Config): The config. + section (str): The dot notation of the section in the config. + value (Any): The value to set in the config. + """ + component = config + parts = section.split(".") + for i, item in enumerate(parts): + try: + if i == len(parts) - 1: + component[item] = value + else: + component = component[item] + except (KeyError, TypeError): + raise KeyError(Errors.E952.format(name=section)) from None + + def walk_dict( node: Dict[str, Any], parent: List[str] = [] ) -> Iterator[Tuple[List[str], Any]]: @@ -1299,13 +1407,13 @@ def get_arg_names(func: Callable) -> List[str]: RETURNS (List[str]): The argument names. """ argspec = inspect.getfullargspec(func) - return list(set([*argspec.args, *argspec.kwonlyargs])) + return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs])) def combine_score_weights( - weights: List[Dict[str, float]], - overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(), -) -> Dict[str, float]: + weights: List[Dict[str, Optional[float]]], + overrides: Dict[str, Optional[float]] = SimpleFrozenDict(), +) -> Dict[str, Optional[float]]: """Combine and normalize score weights defined by components, e.g. {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}. @@ -1314,32 +1422,17 @@ def combine_score_weights( should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. """ + # We divide each weight by the total weight sum. 
# We first need to extract all None/null values for score weights that # shouldn't be shown in the table *or* be weighted - result = {} - all_weights = [] - for w_dict in weights: - filtered_weights = {} - for key, value in w_dict.items(): - value = overrides.get(key, value) - if value is None: - result[key] = None - else: - filtered_weights[key] = value - all_weights.append(filtered_weights) - for w_dict in all_weights: - # We need to account for weights that don't sum to 1.0 and normalize - # the score weights accordingly, then divide score by the number of - # components. - total = sum(w_dict.values()) - for key, value in w_dict.items(): - if total == 0: - weight = 0.0 - else: - weight = round(value / total / len(all_weights), 2) - prev_weight = result.get(key, 0.0) - prev_weight = 0.0 if prev_weight is None else prev_weight - result[key] = prev_weight + weight + result: Dict[str, Optional[float]] = { + key: value for w_dict in weights for (key, value) in w_dict.items() + } + result.update(overrides) + weight_sum = sum([v if v else 0.0 for v in result.values()]) + for key, value in result.items(): + if value and weight_sum > 0: + result[key] = round(value / weight_sum, 2) return result @@ -1356,13 +1449,13 @@ class DummyTokenizer: def to_bytes(self, **kwargs): return b"" - def from_bytes(self, _bytes_data, **kwargs): + def from_bytes(self, data: bytes, **kwargs) -> "DummyTokenizer": return self - def to_disk(self, _path, **kwargs): + def to_disk(self, path: Union[str, Path], **kwargs) -> None: return None - def from_disk(self, _path, **kwargs): + def from_disk(self, path: Union[str, Path], **kwargs) -> "DummyTokenizer": return self @@ -1401,7 +1494,11 @@ def is_cython_func(func: Callable) -> bool: if hasattr(func, attr): # function or class instance return True # https://stackoverflow.com/a/55767059 - if hasattr(func, "__qualname__") and hasattr(func, "__module__"): # method + if ( + hasattr(func, "__qualname__") + and hasattr(func, "__module__") + and func.__module__ in sys.modules + ): # method cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] return hasattr(cls_func, attr) return False @@ -1420,15 +1517,91 @@ def check_bool_env_var(env_var: str) -> bool: return bool(value) -def _pipe(docs, proc, kwargs): +def _pipe( + docs: Iterable["Doc"], + proc: "Pipe", + name: str, + default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn], + kwargs: Mapping[str, Any], +) -> Iterator["Doc"]: if hasattr(proc, "pipe"): yield from proc.pipe(docs, **kwargs) else: # We added some args for pipe that __call__ doesn't expect. kwargs = dict(kwargs) + error_handler = default_error_handler + if hasattr(proc, "get_error_handler"): + error_handler = proc.get_error_handler() for arg in ["batch_size"]: if arg in kwargs: kwargs.pop(arg) for doc in docs: - doc = proc(doc, **kwargs) - yield doc + try: + doc = proc(doc, **kwargs) # type: ignore[call-arg] + yield doc + except Exception as e: + error_handler(name, proc, [doc], e) + + +def raise_error(proc_name, proc, docs, e): + raise e + + +def ignore_error(proc_name, proc, docs, e): + pass + + +def warn_if_jupyter_cupy(): + """Warn about require_gpu if a jupyter notebook + cupy + mismatched + contextvars vs. 
thread ops are detected + """ + if is_in_jupyter(): + from thinc.backends.cupy_ops import CupyOps + + if CupyOps.xp is not None: + from thinc.backends import contextvars_eq_thread_ops + + if not contextvars_eq_thread_ops(): + warnings.warn(Warnings.W111) + + +def check_lexeme_norms(vocab, component_name): + lexeme_norms = vocab.lookups.get_table("lexeme_norm", {}) + if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS: + langs = ", ".join(LEXEME_NORM_LANGS) + logger.debug(Warnings.W033.format(model=component_name, langs=langs)) + + +def to_ternary_int(val) -> int: + """Convert a value to the ternary 1/0/-1 int used for True/None/False in + attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0 + (None), any other values are -1 (False). + """ + if val is True: + return 1 + elif val is None: + return 0 + elif val is False: + return -1 + elif val == 1: + return 1 + elif val == 0: + return 0 + else: + return -1 + + +# The following implementation of packages_distributions() is adapted from +# importlib_metadata, which is distributed under the Apache 2.0 License. +# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw +# See licenses/3rd_party_licenses.txt +def packages_distributions() -> Dict[str, List[str]]: + """Return a mapping of top-level packages to their distributions. We're + inlining this helper from the importlib_metadata "backport" here, since + it's not available in the builtin importlib.metadata. + """ + pkg_to_dist = defaultdict(list) + for dist in importlib_metadata.distributions(): # type: ignore[attr-defined] + for pkg in (dist.read_text("top_level.txt") or "").split(): + pkg_to_dist[pkg].append(dist.metadata["Name"]) + return dict(pkg_to_dist) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index ae2508c87..7cb3322c2 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -44,7 +44,7 @@ cdef class Vectors: the table need to be assigned - so len(list(vectors.keys())) may be greater or smaller than vectors.shape[0]. - DOCS: https://nightly.spacy.io/api/vectors + DOCS: https://spacy.io/api/vectors """ cdef public object name cdef public object data @@ -55,17 +55,18 @@ cdef class Vectors: """Create a new vector store. shape (tuple): Size of the table, as (# entries, # columns) - data (numpy.ndarray): The vector data. + data (numpy.ndarray or cupy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. name (str): A name to identify the vectors table. - DOCS: https://nightly.spacy.io/api/vectors#init + DOCS: https://spacy.io/api/vectors#init """ self.name = name if data is None: if shape is None: shape = (0,0) - data = numpy.zeros(shape, dtype="f") + ops = get_current_ops() + data = ops.xp.zeros(shape, dtype="f") self.data = data self.key2row = {} if self.data is not None: @@ -83,7 +84,7 @@ cdef class Vectors: RETURNS (tuple): A `(rows, dims)` pair. - DOCS: https://nightly.spacy.io/api/vectors#shape + DOCS: https://spacy.io/api/vectors#shape """ return self.data.shape @@ -93,7 +94,7 @@ cdef class Vectors: RETURNS (int): The vector size. - DOCS: https://nightly.spacy.io/api/vectors#size + DOCS: https://spacy.io/api/vectors#size """ return self.data.shape[0] * self.data.shape[1] @@ -103,7 +104,7 @@ cdef class Vectors: RETURNS (bool): `True` if no slots are available for new keys. - DOCS: https://nightly.spacy.io/api/vectors#is_full + DOCS: https://spacy.io/api/vectors#is_full """ return self._unset.size() == 0 @@ -114,7 +115,7 @@ cdef class Vectors: RETURNS (int): The number of keys in the table. 
- DOCS: https://nightly.spacy.io/api/vectors#n_keys + DOCS: https://spacy.io/api/vectors#n_keys """ return len(self.key2row) @@ -127,7 +128,7 @@ cdef class Vectors: key (int): The key to get the vector for. RETURNS (ndarray): The vector for the key. - DOCS: https://nightly.spacy.io/api/vectors#getitem + DOCS: https://spacy.io/api/vectors#getitem """ i = self.key2row[key] if i is None: @@ -141,7 +142,7 @@ cdef class Vectors: key (int): The key to set the vector for. vector (ndarray): The vector to set. - DOCS: https://nightly.spacy.io/api/vectors#setitem + DOCS: https://spacy.io/api/vectors#setitem """ i = self.key2row[key] self.data[i] = vector @@ -153,7 +154,7 @@ cdef class Vectors: YIELDS (int): A key in the table. - DOCS: https://nightly.spacy.io/api/vectors#iter + DOCS: https://spacy.io/api/vectors#iter """ yield from self.key2row @@ -162,7 +163,7 @@ cdef class Vectors: RETURNS (int): The number of vectors in the data. - DOCS: https://nightly.spacy.io/api/vectors#len + DOCS: https://spacy.io/api/vectors#len """ return self.data.shape[0] @@ -172,7 +173,7 @@ cdef class Vectors: key (int): The key to check. RETURNS (bool): Whether the key has a vector entry. - DOCS: https://nightly.spacy.io/api/vectors#contains + DOCS: https://spacy.io/api/vectors#contains """ return key in self.key2row @@ -189,7 +190,7 @@ cdef class Vectors: inplace (bool): Reallocate the memory. RETURNS (list): The removed items as a list of `(key, row)` tuples. - DOCS: https://nightly.spacy.io/api/vectors#resize + DOCS: https://spacy.io/api/vectors#resize """ xp = get_array_module(self.data) if inplace: @@ -224,7 +225,7 @@ cdef class Vectors: YIELDS (ndarray): A vector in the table. - DOCS: https://nightly.spacy.io/api/vectors#values + DOCS: https://spacy.io/api/vectors#values """ for row, vector in enumerate(range(self.data.shape[0])): if not self._unset.count(row): @@ -235,7 +236,7 @@ cdef class Vectors: YIELDS (tuple): A key/vector pair. - DOCS: https://nightly.spacy.io/api/vectors#items + DOCS: https://spacy.io/api/vectors#items """ for key, row in self.key2row.items(): yield key, self.data[row] @@ -281,7 +282,7 @@ cdef class Vectors: row (int / None): The row number of a vector to map the key to. RETURNS (int): The row the vector was added to. - DOCS: https://nightly.spacy.io/api/vectors#add + DOCS: https://spacy.io/api/vectors#add """ # use int for all keys and rows in key2row for more efficient access # and serialization @@ -300,6 +301,8 @@ cdef class Vectors: else: raise ValueError(Errors.E197.format(row=row, key=key)) if vector is not None: + xp = get_array_module(self.data) + vector = xp.asarray(vector) self.data[row] = vector if self._unset.count(row): self._unset.erase(self._unset.find(row)) @@ -321,10 +324,11 @@ cdef class Vectors: RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)` tuple. 
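The changes to `most_similar` in this hunk keep the computation on whichever array library backs the table and map the best rows back through `ops.to_numpy` before building the key array. A small CPU usage sketch; the three words and their 2-d vectors are made up:

```python
import numpy

from spacy.strings import StringStore
from spacy.vectors import Vectors

strings = StringStore(["cat", "dog", "fish"])
data = numpy.asarray([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]], dtype="f")
keys = [strings["cat"], strings["dog"], strings["fish"]]
vectors = Vectors(data=data, keys=keys)

queries = numpy.asarray([[1.0, 0.05]], dtype="f")
best_keys, best_rows, scores = vectors.most_similar(queries, n=2)
print([strings[int(key)] for key in best_keys[0]])  # e.g. ['cat', 'dog']
```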
""" + xp = get_array_module(self.data) filled = sorted(list({row for row in self.key2row.values()})) if len(filled) < n: raise ValueError(Errors.E198.format(n=n, n_rows=len(filled))) - xp = get_array_module(self.data) + filled = xp.asarray(filled) norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True) norms[norms == 0] = 1 @@ -357,8 +361,10 @@ cdef class Vectors: # Account for numerical error we want to return in range -1, 1 scores = xp.clip(scores, a_min=-1, a_max=1, out=scores) row2key = {row: key for key, row in self.key2row.items()} + + numpy_rows = get_current_ops().to_numpy(best_rows) keys = xp.asarray( - [[row2key[row] for row in best_rows[i] if row in row2key] + [[row2key[row] for row in numpy_rows[i] if row in row2key] for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) @@ -368,7 +374,7 @@ cdef class Vectors: path (str / Path): A path to a directory, which will be created if it doesn't exists. - DOCS: https://nightly.spacy.io/api/vectors#to_disk + DOCS: https://spacy.io/api/vectors#to_disk """ xp = get_array_module(self.data) if xp is numpy: @@ -396,7 +402,7 @@ cdef class Vectors: path (str / Path): Directory path, string or Path-like object. RETURNS (Vectors): The modified object. - DOCS: https://nightly.spacy.io/api/vectors#from_disk + DOCS: https://spacy.io/api/vectors#from_disk """ def load_key2row(path): if path.exists(): @@ -432,7 +438,7 @@ cdef class Vectors: exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized form of the `Vectors` object. - DOCS: https://nightly.spacy.io/api/vectors#to_bytes + DOCS: https://spacy.io/api/vectors#to_bytes """ def serialize_weights(): if hasattr(self.data, "to_bytes"): @@ -453,13 +459,14 @@ cdef class Vectors: exclude (list): String names of serialization fields to exclude. RETURNS (Vectors): The `Vectors` object. - DOCS: https://nightly.spacy.io/api/vectors#from_bytes + DOCS: https://spacy.io/api/vectors#from_bytes """ def deserialize_weights(b): if hasattr(self.data, "from_bytes"): self.data.from_bytes() else: - self.data = srsly.msgpack_loads(b) + xp = get_array_module(self.data) + self.data = xp.asarray(srsly.msgpack_loads(b)) deserializers = { "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)), diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index b5bcf7658..9067476f7 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -25,12 +25,12 @@ cdef struct _Cached: cdef class Vocab: cdef Pool mem - cpdef readonly StringStore strings - cpdef public Morphology morphology - cpdef public object vectors - cpdef public object _lookups - cpdef public object writing_system - cpdef public object get_noun_chunks + cdef readonly StringStore strings + cdef public Morphology morphology + cdef public object vectors + cdef public object _lookups + cdef public object writing_system + cdef public object get_noun_chunks cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi new file mode 100644 index 000000000..603ef1ae7 --- /dev/null +++ b/spacy/vocab.pyi @@ -0,0 +1,78 @@ +from typing import Callable, Iterator, Optional, Union, List, Dict +from typing import Any, Iterable +from thinc.types import Floats1d, FloatsXd +from . 
import Language +from .strings import StringStore +from .lexeme import Lexeme +from .lookups import Lookups +from .morphology import Morphology +from .tokens import Doc, Span +from .vectors import Vectors +from pathlib import Path + +def create_vocab( + lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... +) -> Vocab: ... + +class Vocab: + cfg: Dict[str, Any] + get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] + lookups: Lookups + morphology: Morphology + strings: StringStore + vectors: Vectors + writing_system: Dict[str, Any] + def __init__( + self, + lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ..., + strings: Optional[Union[List[str], StringStore]] = ..., + lookups: Optional[Lookups] = ..., + oov_prob: float = ..., + vectors_name: Optional[str] = ..., + writing_system: Dict[str, Any] = ..., + get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., + ) -> None: ... + @property + def lang(self) -> str: ... + def __len__(self) -> int: ... + def add_flag( + self, flag_getter: Callable[[str], bool], flag_id: int = ... + ) -> int: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Lexeme]: ... + def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ... + @property + def vectors_length(self) -> int: ... + def reset_vectors( + self, *, width: Optional[int] = ..., shape: Optional[int] = ... + ) -> None: ... + def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ... + def get_vector( + self, + orth: Union[int, str], + minn: Optional[int] = ..., + maxn: Optional[int] = ..., + ) -> FloatsXd: ... + def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ... + def has_vector(self, orth: Union[int, str]) -> bool: ... + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = ... + ) -> None: ... + def from_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = ... + ) -> Vocab: ... + def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ... + def from_bytes( + self, bytes_data: bytes, *, exclude: Iterable[str] = ... + ) -> Vocab: ... + +def pickle_vocab(vocab: Vocab) -> Any: ... +def unpickle_vocab( + sstore: StringStore, + vectors: Any, + morphology: Any, + data_dir: Any, + lex_attr_getters: Any, + lookups: Any, + get_noun_chunks: Any, +) -> Vocab: ... diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e8ed1b61c..5bbbac8ac 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -2,7 +2,7 @@ from libc.string cimport memcpy import srsly -from thinc.api import get_array_module +from thinc.api import get_array_module, get_current_ops import functools from .lexeme cimport EMPTY_LEXEME, OOV_RANK @@ -47,7 +47,7 @@ cdef class Vocab: instance also provides access to the `StringStore`, and owns underlying C-data that is shared between `Doc` objects. - DOCS: https://nightly.spacy.io/api/vocab + DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, writing_system={}, @@ -61,6 +61,8 @@ cdef class Vocab: lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. vectors_name (unicode): Optional name to identify the vectors table. + get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): + A function that yields base noun phrases used for Doc.noun_chunks. 
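The docstring added above describes `get_noun_chunks` as a callable that yields `(start, end, label)` offsets consumed by `Doc.noun_chunks`. A sketch of plugging in a custom iterator; `toy_noun_chunks` is made up, yields unlabelled single-token "chunks" so it runs without a parser, and assumes the tuple protocol described here, which is worth double-checking against `Doc.noun_chunks` in your spaCy version:

```python
import spacy

def toy_noun_chunks(doclike):
    # Yield (start, end, label) token offsets; label 0 means "no label".
    for token in doclike:
        if token.is_alpha:
            yield token.i, token.i + 1, 0

nlp = spacy.blank("en")
nlp.vocab.get_noun_chunks = toy_noun_chunks  # used by Doc.noun_chunks
doc = nlp("flat white please")
print([chunk.text for chunk in doc.noun_chunks])
```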
""" lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): @@ -110,7 +112,7 @@ cdef class Vocab: available bit will be chosen. RETURNS (int): The integer ID by which the flag value can be checked. - DOCS: https://nightly.spacy.io/api/vocab#add_flag + DOCS: https://spacy.io/api/vocab#add_flag """ if flag_id == -1: for bit in range(1, 64): @@ -197,7 +199,7 @@ cdef class Vocab: string (unicode): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. - DOCS: https://nightly.spacy.io/api/vocab#contains + DOCS: https://spacy.io/api/vocab#contains """ cdef hash_t int_key if isinstance(key, bytes): @@ -214,7 +216,7 @@ cdef class Vocab: YIELDS (Lexeme): An entry in the vocabulary. - DOCS: https://nightly.spacy.io/api/vocab#iter + DOCS: https://spacy.io/api/vocab#iter """ cdef attr_t key cdef size_t addr @@ -237,7 +239,7 @@ cdef class Vocab: >>> apple = nlp.vocab.strings["apple"] >>> assert nlp.vocab[apple] == nlp.vocab[u"apple"] - DOCS: https://nightly.spacy.io/api/vocab#getitem + DOCS: https://spacy.io/api/vocab#getitem """ cdef attr_t orth if isinstance(id_or_string, unicode): @@ -286,7 +288,7 @@ cdef class Vocab: among those remaining. For example, suppose the original table had vectors for the words: - ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to, + ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to two rows, we would discard the vectors for 'feline' and 'reclined'. These words would then be remapped to the closest remaining vector -- so "feline" would have the same vector as "cat", and "reclined" @@ -305,8 +307,9 @@ cdef class Vocab: word was mapped to, and `score` the similarity score between the two words. - DOCS: https://nightly.spacy.io/api/vocab#prune_vectors + DOCS: https://spacy.io/api/vocab#prune_vectors """ + ops = get_current_ops() xp = get_array_module(self.vectors.data) # Make sure all vectors are in the vocab for orth in self.vectors: @@ -322,8 +325,9 @@ cdef class Vocab: toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) + syn_keys = ops.to_numpy(syn_keys) remap = {} - for i, key in enumerate(keys[nr_row:]): + for i, key in enumerate(ops.to_numpy(keys[nr_row:])): self.vectors.add(key, row=syn_rows[i][0]) word = self.strings[key] synonym = self.strings[syn_keys[i][0]] @@ -344,26 +348,26 @@ cdef class Vocab: Defaults to the length of `orth`. maxn (int): Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. - RETURNS (numpy.ndarray): A word vector. Size + RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. Usually, a numpy ndarray of shape (300,) and dtype float32. - DOCS: https://nightly.spacy.io/api/vocab#get_vector + DOCS: https://spacy.io/api/vocab#get_vector """ if isinstance(orth, str): orth = self.strings.add(orth) word = self[orth].orth_ if orth in self.vectors.key2row: return self.vectors[orth] - # Assign default ngram limits to minn and maxn which is the length of the word. 
- if minn is None: - minn = len(word) - if maxn is None: - maxn = len(word) xp = get_array_module(self.vectors.data) vectors = xp.zeros((self.vectors_length,), dtype="f") + if minn is None: + return vectors # Fasttext's ngram computation taken from # https://github.com/facebookresearch/fastText + # Assign default ngram limit to maxn which is the length of the word. + if maxn is None: + maxn = len(word) ngrams_size = 0; for i in range(len(word)): ngram = "" @@ -393,9 +397,9 @@ cdef class Vocab: by string or int ID. orth (int / unicode): The word. - vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set. + vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set. - DOCS: https://nightly.spacy.io/api/vocab#set_vector + DOCS: https://spacy.io/api/vocab#set_vector """ if isinstance(orth, str): orth = self.strings.add(orth) @@ -417,7 +421,7 @@ cdef class Vocab: orth (int / unicode): The word. RETURNS (bool): Whether the word has a vector. - DOCS: https://nightly.spacy.io/api/vocab#has_vector + DOCS: https://spacy.io/api/vocab#has_vector """ if isinstance(orth, str): orth = self.strings.add(orth) @@ -441,9 +445,9 @@ cdef class Vocab: path (unicode or Path): A path to a directory, which will be created if it doesn't exist. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. - DOCS: https://nightly.spacy.io/api/vocab#to_disk + DOCS: https://spacy.io/api/vocab#to_disk """ path = util.ensure_path(path) if not path.exists(): @@ -461,10 +465,10 @@ cdef class Vocab: returns it. path (unicode or Path): A path to a directory. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (Vocab): The modified `Vocab` object. - DOCS: https://nightly.spacy.io/api/vocab#to_disk + DOCS: https://spacy.io/api/vocab#to_disk """ path = util.ensure_path(path) getters = ["strings", "vectors"] @@ -486,10 +490,10 @@ cdef class Vocab: def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (bytes): The serialized form of the `Vocab` object. - DOCS: https://nightly.spacy.io/api/vocab#to_bytes + DOCS: https://spacy.io/api/vocab#to_bytes """ def deserialize_vectors(): if self.vectors is None: @@ -508,10 +512,10 @@ cdef class Vocab: """Load state from a binary string. bytes_data (bytes): The data to load from. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (Vocab): The `Vocab` object. 
- DOCS: https://nightly.spacy.io/api/vocab#from_bytes + DOCS: https://spacy.io/api/vocab#from_bytes """ def serialize_vectors(b): if self.vectors is None: @@ -521,7 +525,6 @@ cdef class Vocab: setters = { "strings": lambda b: self.strings.from_bytes(b), - "lexemes": lambda b: self.lexemes_from_bytes(b), "vectors": lambda b: serialize_vectors(b), "lookups": lambda b: self.lookups.from_bytes(b), } @@ -546,12 +549,13 @@ def pickle_vocab(vocab): data_dir = vocab.data_dir lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lookups = vocab.lookups + get_noun_chunks = vocab.get_noun_chunks return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lookups)) + (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, get_noun_chunks)) def unpickle_vocab(sstore, vectors, morphology, data_dir, - lex_attr_getters, lookups): + lex_attr_getters, lookups, get_noun_chunks): cdef Vocab vocab = Vocab() vocab.vectors = vectors vocab.strings = sstore @@ -559,6 +563,7 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, vocab.data_dir = data_dir vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lookups = lookups + vocab.get_noun_chunks = get_noun_chunks return vocab diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index b1f274252..72a75bb31 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -9,6 +9,7 @@ menu: - ['Parser & NER', 'parser'] - ['Tagging', 'tagger'] - ['Text Classification', 'textcat'] + - ['Span Classification', 'spancat'] - ['Entity Linking', 'entitylinker'] --- @@ -19,7 +20,7 @@ spaCy's built-in architectures that are used for different NLP tasks. All trainable [built-in components](/api#architecture-pipeline) expect a `model` argument defined in the config and document their the default architecture. Custom architectures can be registered using the -[`@spacy.registry.architectures`](/api/top-level#regsitry) decorator and used as +[`@spacy.registry.architectures`](/api/top-level#registry) decorator and used as part of the [training config](/usage/training#custom-functions). Also see the usage documentation on [layers and model architectures](/usage/layers-architectures). @@ -35,7 +36,7 @@ usage documentation on > @architectures = "spacy.Tok2Vec.v2" > > [model.embed] -> @architectures = "spacy.CharacterEmbed.v1" +> @architectures = "spacy.CharacterEmbed.v2" > # ... > > [model.encode] @@ -54,13 +55,13 @@ blog post for background. | `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | -### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} +### spacy.HashEmbedCNN.v2 {#HashEmbedCNN} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.HashEmbedCNN.v1" +> @architectures = "spacy.HashEmbedCNN.v2" > pretrained_vectors = null > width = 96 > depth = 4 @@ -96,7 +97,7 @@ consisting of a CNN and a layer-normalized maxout activation function. > factory = "tok2vec" > > [components.tok2vec.model] -> @architectures = "spacy.HashEmbedCNN.v1" +> @architectures = "spacy.HashEmbedCNN.v2" > width = 342 > > [components.tagger] @@ -129,13 +130,13 @@ argument that connects to the shared `tok2vec` component in the pipeline. | `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. 
By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | -### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} +### spacy.MultiHashEmbed.v2 {#MultiHashEmbed} > #### Example config > > ```ini > [model] -> @architectures = "spacy.MultiHashEmbed.v1" +> @architectures = "spacy.MultiHashEmbed.v2" > width = 64 > attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] > rows = [2000, 1000, 1000, 1000] @@ -160,13 +161,13 @@ not updated). | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | -### spacy.CharacterEmbed.v1 {#CharacterEmbed} +### spacy.CharacterEmbed.v2 {#CharacterEmbed} > #### Example config > > ```ini > [model] -> @architectures = "spacy.CharacterEmbed.v1" +> @architectures = "spacy.CharacterEmbed.v2" > width = 128 > rows = 7000 > nM = 64 @@ -266,13 +267,13 @@ Encode context using bidirectional LSTM layers. Requires | `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ | | **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | -### spacy.StaticVectors.v1 {#StaticVectors} +### spacy.StaticVectors.v2 {#StaticVectors} > #### Example config > > ```ini > [model] -> @architectures = "spacy.StaticVectors.v1" +> @architectures = "spacy.StaticVectors.v2" > nO = null > nM = null > dropout = 0.2 @@ -283,8 +284,9 @@ Encode context using bidirectional LSTM layers. Requires > ``` Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a -learned linear projection to control the dimensionality. See the documentation -on [static vectors](/usage/embeddings-transformers#static-vectors) for details. +learned linear projection to control the dimensionality. Unknown tokens are +mapped to a zero vector. See the documentation on +[static vectors](/usage/embeddings-transformers#static-vectors) for details. | Name |  Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -330,15 +332,18 @@ for details and system requirements. -### spacy-transformers.TransformerModel.v1 {#TransformerModel} +### spacy-transformers.TransformerModel.v3 {#TransformerModel} > #### Example Config > > ```ini > [model] -> @architectures = "spacy-transformers.TransformerModel.v1" +> @architectures = "spacy-transformers.TransformerModel.v3" > name = "roberta-base" > tokenizer_config = {"use_fast": true} +> transformer_config = {} +> mixed_precision = true +> grad_scaler_config = {"init_scale": 32768} > > [model.get_spans] > @span_getters = "spacy-transformers.strided_spans.v1" @@ -364,12 +369,31 @@ transformer weights across your pipeline. For a layer that's configured for use in other components, see [Tok2VecTransformer](/api/architectures#Tok2VecTransformer). 
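As the introduction to this page notes, custom architectures can be registered with the `@spacy.registry.architectures` decorator and then referenced from the config. A minimal sketch; the name `my_org.CustomTok2Vec.v1` is made up, and the wrapper simply delegates to the built-in `spacy.HashEmbedCNN.v2` architecture using the parameters from its documented config:

```python
import spacy
from thinc.api import Model

@spacy.registry.architectures("my_org.CustomTok2Vec.v1")
def build_custom_tok2vec(width: int, depth: int) -> Model:
    # Look up a built-in architecture by its registered name and delegate to it.
    build_cnn = spacy.registry.architectures.get("spacy.HashEmbedCNN.v2")
    return build_cnn(
        width=width,
        depth=depth,
        embed_size=2000,
        window_size=1,
        maxout_pieces=3,
        subword_features=True,
        pretrained_vectors=None,
    )
```

A training config would then refer to it as `@architectures = "my_org.CustomTok2Vec.v1"`, for example under `[components.tok2vec.model]`.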
-| Name | Description | -| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ | -| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | -| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ | +| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | +| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | +| `transformer_config` | Transformer settings passed to [`transformers.AutoConfig`](https://huggingface.co/transformers/model_doc/auto.html?highlight=autoconfig#transformers.AutoConfig) ~~Dict[str, Any]~~ | +| `mixed_precision` | Replace whitelisted ops by half-precision counterparts. Speeds up training and prediction on GPUs with [Tensor Cores](https://developer.nvidia.com/tensor-cores) and reduces GPU memory use. ~~bool~~ | +| `grad_scaler_config` | Configuration to pass to `thinc.api.PyTorchGradScaler` during training when `mixed_precision` is enabled. ~~Dict[str, Any]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ | +| | | + + +Mixed-precision support is currently an experimental feature. + + + + +- The `transformer_config` argument was added in + `spacy-transformers.TransformerModel.v2`. +- The `mixed_precision` and `grad_scaler_config` arguments were added in + `spacy-transformers.TransformerModel.v3`. + +The other arguments are shared between all versions. + + ### spacy-transformers.TransformerListener.v1 {#TransformerListener} @@ -401,16 +425,19 @@ a single token vector given zero or more wordpiece vectors. | `upstream` | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ | | **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | -### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer} +### spacy-transformers.Tok2VecTransformer.v3 {#Tok2VecTransformer} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.Tok2VecTransformer.v1" +> @architectures = "spacy-transformers.Tok2VecTransformer.v3" > name = "albert-base-v2" > tokenizer_config = {"use_fast": false} +> transformer_config = {} > grad_factor = 1.0 +> mixed_precision = true +> grad_scaler_config = {"init_scale": 32768} > ``` Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does @@ -419,13 +446,31 @@ Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does object, but it's a **simpler solution** if you only need the transformer within one component. -| Name | Description | -| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | -| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | -| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | -| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | +| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | +| `transformer_config` | Settings to pass to the transformers forward pass. ~~Dict[str, Any]~~ | +| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | +| `grad_factor` | Reweight gradients from the component before passing them upstream. 
You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | +| `mixed_precision` | Replace whitelisted ops by half-precision counterparts. Speeds up training and prediction on GPUs with [Tensor Cores](https://developer.nvidia.com/tensor-cores) and reduces GPU memory use. ~~bool~~ | +| `grad_scaler_config` | Configuration to pass to `thinc.api.PyTorchGradScaler` during training when `mixed_precision` is enabled. ~~Dict[str, Any]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | + + +Mixed-precision support is currently an experimental feature. + + + + +- The `transformer_config` argument was added in + `spacy-transformers.Tok2VecTransformer.v2`. +- The `mixed_precision` and `grad_scaler_config` arguments were added in + `spacy-transformers.Tok2VecTransformer.v3`. + +The other arguments are shared between all versions. + + ## Pretraining architectures {#pretrain source="spacy/ml/models/multi_task.py"} @@ -447,6 +492,9 @@ For more information, see the section on > ```ini > [pretraining] > component = "tok2vec" +> +> [initialize] +> vectors = "en_core_web_lg" > ... > > [pretraining.objective] @@ -457,7 +505,9 @@ For more information, see the section on > ``` Predict the word's vector from a static embeddings table as pretraining -objective for a Tok2Vec layer. +objective for a Tok2Vec layer. To use this objective, make sure that the +`initialize.vectors` section in the config refers to a model with static +vectors. | Name | Description | | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -507,7 +557,7 @@ for a Tok2Vec layer. > maxout_pieces = 2 > > [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" +> @architectures = "spacy.HashEmbedCNN.v2" > pretrained_vectors = null > width = 96 > depth = 4 @@ -544,6 +594,13 @@ consists of either two or three subnetworks: | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | + + +[TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact +same signature, but the `use_upper` argument was `True` by default. + + + ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} ### spacy.Tagger.v1 {#Tagger} @@ -582,6 +639,17 @@ several different built-in architectures. It is recommended to experiment with different architectures and settings to determine what works best on your specific data and challenge. + + +When the architecture for a text classification challenge contains a setting for +`exclusive_classes`, it is important to use the correct value for the correct +pipeline component. The `textcat` component should always be used for +single-label use-cases where `exclusive_classes = true`, while the +`textcat_multilabel` should be used for multi-label settings with +`exclusive_classes = false`. + + + ### spacy.TextCatEnsemble.v2 {#TextCatEnsemble} > #### Example Config @@ -592,7 +660,7 @@ specific data and challenge. 
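The infobox above comes down to picking the right component factory for the task. A quick sketch of the two setups; the label names are made up:

```python
import spacy

# Single-label: exactly one of the classes applies per document ("textcat").
nlp_single = spacy.blank("en")
textcat = nlp_single.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Multi-label: zero or more classes may apply per document ("textcat_multilabel").
nlp_multi = spacy.blank("en")
textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
textcat_multi.add_label("SPORTS")
textcat_multi.add_label("POLITICS")
```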
> nO = null > > [model.linear_model] -> @architectures = "spacy.TextCatBOW.v1" +> @architectures = "spacy.TextCatBOW.v2" > exclusive_classes = true > ngram_size = 1 > no_output_layer = false @@ -601,7 +669,7 @@ specific data and challenge. > @architectures = "spacy.Tok2Vec.v2" > > [model.tok2vec.embed] -> @architectures = "spacy.MultiHashEmbed.v1" +> @architectures = "spacy.MultiHashEmbed.v2" > width = 64 > rows = [2000, 2000, 1000, 1000, 1000, 1000] > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] @@ -629,8 +697,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`. -The v1 was functionally similar, but used an internal `tok2vec` instead of -taking it as argument. +[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar, +but used an internal `tok2vec` instead of taking it as argument: | Name | Description | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -647,18 +715,18 @@ taking it as argument. -### spacy.TextCatCNN.v1 {#TextCatCNN} +### spacy.TextCatCNN.v2 {#TextCatCNN} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatCNN.v1" +> @architectures = "spacy.TextCatCNN.v2" > exclusive_classes = false > nO = null > > [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" +> @architectures = "spacy.HashEmbedCNN.v2" > pretrained_vectors = null > width = 96 > depth = 4 @@ -679,13 +747,21 @@ architecture is usually less accurate than the ensemble, but runs faster. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.TextCatBOW.v1 {#TextCatBOW} + + +[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was +not yet resizable. Since v2, new labels can be added to this component, even +after training. + + + +### spacy.TextCatBOW.v2 {#TextCatBOW} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatBOW.v1" +> @architectures = "spacy.TextCatBOW.v2" > exclusive_classes = false > ngram_size = 1 > no_output_layer = false @@ -703,6 +779,62 @@ the others, but may not be as accurate, especially if texts are short. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + +[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was +not yet resizable. Since v2, new labels can be added to this component, even +after training. + + + +## Span classification architectures {#spancat source="spacy/ml/models/spancat.py"} + +### spacy.SpanCategorizer.v1 {#SpanCategorizer} + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.SpanCategorizer.v1" +> scorer = {"@layers": "spacy.LinearLogistic.v1"} +> +> [model.reducer] +> @layers = spacy.mean_max_reducer.v1" +> hidden_size = 128 +> +> [model.tok2vec] +> @architectures = "spacy.Tok2Vec.v1" +> +> [model.tok2vec.embed] +> @architectures = "spacy.MultiHashEmbed.v1" +> # ... 
+> +> [model.tok2vec.encode] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> # ... +> ``` + +Build a span categorizer model to power a +[`SpanCategorizer`](/api/spancategorizer) component, given a token-to-vector +model, a reducer model to map the sequence of vectors for each span down to a +single vector, and a scorer model to map the vectors to probabilities. + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------- | +| `tok2vec` | The token-to-vector model. ~~Model[List[Doc], List[Floats2d]]~~ | +| `reducer` | The reducer model. ~~Model[Ragged, Floats2d]~~ | +| `scorer` | The scorer model. ~~Model[Floats2d, Floats2d]~~ | +| **CREATES** | The model using the architecture. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | + +### spacy.mean_max_reducer.v1 {#mean_max_reducer} + +Reduce sequences by concatenating their mean and max pooled vectors, and then +combine the concatenated vectors with a hidden layer. + +| Name | Description | +| ------------- | ------------------------------------- | +| `hidden_size` | The size of the hidden layer. ~~int~~ | + ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions @@ -726,7 +858,7 @@ into the "real world". This requires 3 main components: > nO = null > > [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" +> @architectures = "spacy.HashEmbedCNN.v2" > pretrained_vectors = null > width = 96 > depth = 2 diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 6dc8de900..a4462af56 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -12,9 +12,11 @@ menu: - ['train', 'train'] - ['pretrain', 'pretrain'] - ['evaluate', 'evaluate'] + - ['assemble', 'assemble'] - ['package', 'package'] - ['project', 'project'] - ['ray', 'ray'] + - ['huggingface-hub', 'huggingface-hub'] --- spaCy's CLI provides a range of helpful commands for downloading and training @@ -28,7 +30,7 @@ available arguments and usage. Download [trained pipelines](/usage/models) for spaCy. The downloader finds the best-matching compatible version and uses `pip install` to download the Python package. Direct downloads don't perform any compatibility checks and require the -pipeline name to be specified with its version (e.g. `en_core_web_sm-2.2.0`). +pipeline name to be specified with its version (e.g. `en_core_web_sm-3.0.0`). > #### Downloading best practices > @@ -42,16 +44,17 @@ pipeline name to be specified with its version (e.g. `en_core_web_sm-2.2.0`). > project. ```cli -$ python -m spacy download [model] [--direct] [pip_args] +$ python -m spacy download [model] [--direct] [--sdist] [pip_args] ``` -| Name | Description | -| ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | Pipeline package name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ | -| `--direct`, `-d` | Force direct download of exact package version. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| pip args 2.1 | Additional installation options to be passed to `pip install` when installing the pipeline package. 
For example, `--user` to install to the user home directory or `--no-deps` to not install package dependencies. ~~Any (option/flag)~~ | -| **CREATES** | The installed pipeline package in your `site-packages` directory. | +| Name | Description | +| ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | Pipeline package name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ | +| `--direct`, `-D` | Force direct download of exact package version. ~~bool (flag)~~ | +| `--sdist`, `-S` 3 | Download the source package (`.tar.gz` archive) instead of the default pre-built binary wheel. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| pip args 2.1 | Additional installation options to be passed to `pip install` when installing the pipeline package. For example, `--user` to install to the user home directory or `--no-deps` to not install package dependencies. ~~Any (option/flag)~~ | +| **CREATES** | The installed pipeline package in your `site-packages` directory. | ## info {#info tag="command"} @@ -76,7 +79,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude] | Name | Description | | ------------------------------------------------ | --------------------------------------------------------------------------------------------- | -| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ | +| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ | | `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ | | `--silent`, `-s` 2.0.12 | Don't print anything, just return the values. ~~bool (flag)~~ | | `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ | @@ -169,14 +172,15 @@ validation error with more details. $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file. 
If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. | ### init vectors {#init-vectors new="3" tag="command"} @@ -236,11 +240,11 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | +| `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | -| **CREATES** | The best trained pipeline and the final checkpoint (if training is terminated). | +| **CREATES** | The label files. | ## convert {#convert tag="command"} @@ -256,28 +260,30 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] | Name | Description | | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `input_file` | Input file. ~~Path (positional)~~ | -| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ | +| `input_path` | Input file or directory. ~~Path (positional)~~ | +| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ | | `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ | | `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | -| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ | -| `--seg-sents`, `-s` 2.2 | Segment sentences (for `--converter ner`). ~~bool (flag)~~ | -| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | -| `--morphology`, `-m` | Enable appending morphology to tags. 
~~bool (flag)~~ | -| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path](option)~~ | +| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | +| `--seg-sents`, `-s` 2.2 | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ | +| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | +| `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ | +| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ | +| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ | | `--lang`, `-l` 2.1 | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | +| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | ### Converters {#converters} -| ID | Description | -| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `json` | JSON-formatted training data used in spaCy v2.x. | -| `conll` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| ID | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x. | +| `conllu` | Universal Dependencies `.conllu` format. | +| `ner` / `conll` | NER with IOB/IOB2/BILUO tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the NER tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). 
| +| `iob` | NER with IOB/IOB2/BILUO tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | ## debug {#debug new="3"} @@ -640,7 +646,7 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts] | Name | Description | | ----------------- | ---------------------------------------------------------------------------------- | | `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ | -| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ | +| `inputs` | Path to input file, or `-` for standard input. ~~Path (positional)~~ | | `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **PRINTS** | Profiling information for the pipeline. | @@ -765,6 +771,7 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P | `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **PRINTS** | Debugging information. | ## train {#train tag="command"} @@ -804,7 +811,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | Name | Description | | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | +| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | @@ -812,6 +819,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The final trained pipeline and the best trained pipeline. | +### Calling the training function from Python {#train-function new="3.2"} + +The training CLI exposes a `train` helper function that lets you run the +training just like `spacy train`. Usually it's easier to use the command line +directly, but if you need to kick off training from code this is how to do it. 
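+
+In addition to the minimal example below, the helper accepts the keyword
+arguments documented in the table at the end of this section. A sketch that
+also sets an output directory and a GPU (the config path, output directory and
+GPU ID here are placeholder values):
+
+```python
+from spacy.cli.train import train
+
+# Roughly equivalent to:
+#   python -m spacy train config.cfg --output ./output --gpu-id 0
+# The paths and the GPU ID are placeholders for this sketch.
+train(
+    "./config.cfg",
+    output_path="./output",
+    use_gpu=0,
+    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
+)
+```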
+ +> #### Example +> +> ```python +> from spacy.cli.train import train +> +> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"}) +> +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | Path to the config to use for training. ~~Union[str, Path]~~ | +| `output_path` | Optional name of directory to save output model in. If not provided a model will not be saved. ~~Optional[Union[str, Path]]~~ | +| _keyword-only_ | | +| `use_gpu` | Which GPU to use. Defaults to -1 for no GPU. ~~int~~ | +| `overrides` | Values to override config settings. ~~Dict[str, Any]~~ | + ## pretrain {#pretrain new="2.1" tag="command,experimental"} Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline @@ -874,21 +904,49 @@ skew. To render a sample of dependency parses in a HTML file using the `--displacy-path` argument. ```cli -$ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] +$ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Name | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | -| `--code-path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | -| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Training results and optional metrics and visualizations. | +| Name | Description | +| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. 
~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | + +## assemble {#assemble tag="command"} + +Assemble a pipeline from a config file without additional training. Expects a +[config file](/api/data-formats#config) with all settings and hyperparameters. +The `--code` argument can be used to import a Python file that lets you register +[custom functions](/usage/training#custom-functions) and refer to them in your +config. + +> #### Example +> +> ```cli +> $ python -m spacy assemble config.cfg ./output +> ``` + +```cli +$ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [overrides] +``` + +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | +| `output_dir` | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~ | +| `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~ | +| **CREATES** | The final assembled pipeline. | ## package {#package tag="command"} @@ -900,19 +958,24 @@ registered functions like copied into the package and imported in the `__init__.py`. If the path to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in the input directory, this file is used. Otherwise, the data can be entered -directly from the command line. spaCy will then create a `.tar.gz` archive file -that you can distribute and install with `pip install`. +directly from the command line. spaCy will then create a build artifact that you +can distribute and install with `pip install`. As of v3.1, the `package` command +will also create a formatted `README.md` based on the pipeline information +defined in the `meta.json`. If a `README.md` is already present in the source +directory, it will be used instead. The `spacy package` command now also builds the `.tar.gz` archive automatically, so you don't have to run `python setup.py sdist` separately anymore. To disable -this, you can set the `--no-sdist` flag. +this, you can set `--build none`. 
You can also choose to build a binary wheel +(which installs more efficiently) by setting `--build wheel`, or to build both +the sdist and wheel by setting `--build sdist,wheel`. ```cli -$ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force] +$ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--create-meta] [--build] [--name] [--version] [--force] ``` > #### Example @@ -923,19 +986,19 @@ $ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--cre > $ pip install dist/en_pipeline-0.0.0.tar.gz > ``` -| Name | Description | -| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ | -| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ | -| `--code`, `-c` 3 | Comma-separated paths to Python files to be included in the package and imported in its `__init__.py`. This allows including [registering functions](/usage/training#custom-functions) and [custom components](/usage/processing-pipelines#custom-components). ~~Optional[str] \(option)~~ | -| `--meta-path`, `-m` 2 | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | -| `--create-meta`, `-C` 2 | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | -| `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ | -| `--name`, `-n` 3 | Package name to override in meta. ~~Optional[str] \(option)~~ | -| `--version`, `-v` 3 | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | -| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A Python package containing the spaCy pipeline. | +| Name | Description | +| ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ | +| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ | +| `--code`, `-c` 3 | Comma-separated paths to Python files to be included in the package and imported in its `__init__.py`. This allows including [registering functions](/usage/training#custom-functions) and [custom components](/usage/processing-pipelines#custom-components). ~~str (option)~~ | +| `--meta-path`, `-m` 2 | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | +| `--create-meta`, `-C` 2 | Create a `meta.json` file on the command line, even if one already exists in the directory. 
If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | +| `--build`, `-b` 3 | Comma-separated artifact formats to build. Can be `sdist` (for a `.tar.gz` archive) and/or `wheel` (for a binary `.whl` file), or `none` if you want to run this step manually. The generated artifacts can be installed by `pip install`. Defaults to `sdist`. ~~str (option)~~ | +| `--name`, `-n` 3 | Package name to override in meta. ~~Optional[str] \(option)~~ | +| `--version`, `-v` 3 | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | +| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A Python package containing the spaCy pipeline. | ## project {#project new="3"} @@ -1187,14 +1250,14 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] > $ python -m spacy project dvc all > ``` -| Name | Description | -| ----------------- | ----------------------------------------------------------------------------------------------------------------- | -| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | -| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ | -| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | -| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ | +| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | +| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | ## ray {#ray new="3"} @@ -1232,10 +1295,56 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | +| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. 
The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | | `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ | | `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | + +## huggingface-hub {#huggingface-hub new="3.1"} + +The `spacy huggingface-cli` CLI includes commands for uploading your trained +spaCy pipelines to the [Hugging Face Hub](https://huggingface.co/). + +> #### Installation +> +> ```cli +> $ pip install spacy-huggingface-hub +> $ huggingface-cli login +> ``` + + + +To use this command, you need the +[`spacy-huggingface-hub`](https://github.com/explosion/spacy-huggingface-hub) +package installed. Installing the package will automatically add the +`huggingface-hub` command to the spaCy CLI. + + + +### huggingface-hub push {#huggingface-hub-push tag="command"} + +Push a spaCy pipeline to the Hugging Face Hub. Expects a `.whl` file packaged +with [`spacy package`](/api/cli#package) and `--build wheel`. For more details, +see the spaCy project [integration](/usage/projects#huggingface_hub). + +```cli +$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose] +``` + +> #### Example +> +> ```cli +> $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl +> ``` + +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | +| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | +| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | +| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ | +| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~  | +| **UPLOADS** | The pipeline to the hub. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 4f134c808..001455f33 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -29,8 +29,8 @@ recommended settings for your use case, check out the > > The `@` syntax lets you refer to function names registered in the > [function registry](/api/top-level#registry). For example, -> `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of -> the name [spacy.HashEmbedCNN.v1](/api/architectures#HashEmbedCNN) and all +> `@architectures = "spacy.HashEmbedCNN.v2"` refers to a registered function of +> the name [spacy.HashEmbedCNN.v2](/api/architectures#HashEmbedCNN) and all > other values defined in its block will be passed into that function as > arguments. 
Those arguments depend on the registered function. See the usage > guide on [registered functions](/usage/training#config-functions) for details. @@ -90,10 +90,9 @@ Defines the `nlp` object, its tokenizer and > ```ini > [components.textcat] > factory = "textcat" -> labels = ["POSITIVE", "NEGATIVE"] > > [components.textcat.model] -> @architectures = "spacy.TextCatBOW.v1" +> @architectures = "spacy.TextCatBOW.v2" > exclusive_classes = true > ngram_size = 1 > no_output_layer = false @@ -182,24 +181,25 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. 
Defaults to `corpora.train`. ~~str~~ | +| Name | Description | +| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. 
~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -245,6 +245,8 @@ Also see the usage guides on the | Name | Description | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `after_init` | Optional callback to modify the `nlp` object after initialization. ~~Optional[Callable[[Language], Language]]~~ | +| `before_init` | Optional callback to modify the `nlp` object before initialization. ~~Optional[Callable[[Language], Language]]~~ | | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | | `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | @@ -281,6 +283,10 @@ CLI [`train`](/api/cli#train) command. The built-in of the `.conllu` format used by the [Universal Dependencies corpora](https://github.com/UniversalDependencies). +Note that while this is the format used to save training data, you do not have +to understand the internal details to use it or create training data. See the +section on [preparing training data](/usage/training#training-data). + ### JSON training format {#json-input tag="deprecated"} @@ -294,7 +300,7 @@ objects to JSON, you can now serialize them directly using the format: ```cli -$ python -m spacy convert ./data.json ./output.spacy +$ python -m spacy convert ./data.json . ``` @@ -388,7 +394,7 @@ file to keep track of your settings and hyperparameters and your own > "tags": List[str], > "pos": List[str], > "morphs": List[str], -> "sent_starts": List[bool], +> "sent_starts": List[Optional[bool]], > "deps": List[string], > "heads": List[int], > "entities": List[str], @@ -449,9 +455,11 @@ doc = nlp("I'm pretty happy about that!") gold_dict = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} example = Example.from_dict(doc, gold_dict) -# Training data for an Entity Linking component +# Training data for an Entity Linking component (also requires entities & sentences) doc = nlp("Russ Cochran his reprints include EC Comics.") -gold_dict = {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}} +gold_dict = {"entities": [(0, 12, "PERSON")], + "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "sent_starts": [1, -1, -1, -1, -1, -1, -1, -1]} example = Example.from_dict(doc, gold_dict) ``` @@ -584,7 +592,7 @@ source of truth** used for loading a pipeline. | `vectors` | Information about the word vectors included with the pipeline. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ | | `pipeline` | Names of pipeline component names, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). 
Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ | | `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ | -| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| `performance` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | `speed` | Inference speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ | | `spacy_git_version` 3 | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create pipeline. ~~str~~ | | other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ | diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 8974d9ea7..c48172a22 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -25,6 +25,20 @@ current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note that more than one action may be optimal for a given state. +## Assigned Attributes {#assigned-attributes} + +Dependency predictions are assigned to the `Token.dep` and `Token.head` fields. +Beside the dependencies themselves, the parser decides sentence boundaries, +which are saved in `Token.is_sent_start` and accessible via `Doc.sents`. + +| Location | Value | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| `Token.dep` | The type of dependency relation (hash). ~~int~~ | +| `Token.dep_` | The type of dependency relation. ~~str~~ | +| `Token.head` | The syntactic parent, or "governor", of this token. ~~Token~~ | +| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. After the parser runs this will be `True` or `False` for all tokens. ~~bool~~ | +| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. ~~Iterator[Span]~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -50,7 +64,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. 
~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ | @@ -88,8 +102,8 @@ shortcut for this and instantiate the component using its string name and | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | | _keyword-only_ | | -| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ | -| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -220,9 +234,8 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. ## DependencyParser.update {#update tag="method"} Learn from a batch of [`Example`](/api/example) objects, updating the pipe's -model. Delegates to [`predict`](/api/dependencyparser#predict), -[`get_loss`](/api/dependencyparser#get_loss) and -[`set_annotations`](/api/dependencyparser#set_annotations). +model. Delegates to [`predict`](/api/dependencyparser#predict) and +[`get_loss`](/api/dependencyparser#get_loss). > #### Example > @@ -232,14 +245,14 @@ model. Delegates to [`predict`](/api/dependencyparser#predict), > losses = parser.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. 
Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## DependencyParser.get_loss {#get_loss tag="method"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index a0b4c29bb..9836b8c21 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | Name | Description | | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | _keyword-only_ | | | `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | @@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | | `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | +| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ | | `ents` 3 | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -212,14 +212,14 @@ alignment mode `"strict". 
| Name | Description | | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | -## Doc.set_ents {#ents tag="method" new="3"} +## Doc.set_ents {#set_ents tag="method" new="3"} Set the named entities in the document. @@ -234,14 +234,14 @@ Set the named entities in the document. > assert ents[0].text == "Mr. Best" > ``` -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| entities | Spans with labels to set as entities. ~~List[Span]~~ | -| _keyword-only_ | | -| blocked | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ | -| missing | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ | -| outside | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ | -| default | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ | +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `entities` | Spans with labels to set as entities. ~~List[Span]~~ | +| _keyword-only_ | | +| `blocked` | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ | +| `missing` | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ | +| `outside` | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ | +| `default` | How to set entity annotation for tokens outside of any provided spans. Options: `"blocked"`, `"missing"`, `"outside"` and `"unmodified"` (preserve current state). Defaults to `"outside"`. ~~str~~ | ## Doc.similarity {#similarity tag="method" model="vectors"} @@ -571,9 +571,9 @@ objects, if the entity recognizer has been applied. > assert ents[0].text == "Mr. Best" > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------- | -| **RETURNS** | Entities in the document, one `Span` per entity. 
~~Tuple[Span, ...]~~ | +| Name | Description | +| ----------- | ---------------------------------------------------------------- | +| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span]~~ | ## Doc.spans {#spans tag="property"} @@ -616,8 +616,10 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. -If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has -not been implemeted for the given language, a `NotImplementedError` is raised. +To customize the noun chunk iterator in a loaded pipeline, modify +[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` +[syntax iterator](/usage/adding-languages#language-data) has not been +implemented for the given language, a `NotImplementedError` is raised. > #### Example > @@ -633,12 +635,14 @@ not been implemeted for the given language, a `NotImplementedError` is raised. | ---------- | ------------------------------------- | | **YIELDS** | Noun chunks in the document. ~~Span~~ | -## Doc.sents {#sents tag="property" model="parser"} +## Doc.sents {#sents tag="property" model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. To -improve accuracy on informal texts, spaCy calculates sentence boundaries from -the syntactic dependency parse. If the parser is disabled, the `sents` iterator -will be unavailable. +Iterate over the sentences in the document. Sentence spans have no label. + +This property is only available when +[sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. > #### Example > diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 3625ed790..b1d1798ba 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -16,7 +16,7 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where the msgpack object has the following structure: ```python -### msgpack object structrue +### msgpack object structure { "version": str, # DocBin version number "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b90c52710..bbc8f3942 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -16,6 +16,16 @@ plausible candidates from that `KnowledgeBase` given a certain textual mention, and a machine learning model to pick the right candidate, given the local context of the mention. +## Assigned Attributes {#assigned-attributes} + +Predictions, in the form of knowledge base IDs, will be assigned to +`Token.ent_kb_id_`. + +| Location | Value | +| ------------------ | --------------------------------- | +| `Token.ent_kb_id` | Knowledge base ID (hash). ~~int~~ | +| `Token.ent_kb_id_` | Knowledge base ID. ~~str~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -31,6 +41,7 @@ architectures and their arguments and hyperparameters. > from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL > config = { > "labels_discard": [], +> "n_sents": 0, > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, @@ -43,6 +54,7 @@ architectures and their arguments and hyperparameters. 
| Setting | Description | | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | @@ -89,6 +101,7 @@ custom knowledge base, you should either call | `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | @@ -139,7 +152,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityLinker.set_kb {#initialize tag="method" new="3"} +## EntityLinker.set_kb {#set_kb tag="method" new="3"} The `kb_loader` should be a function that takes a `Vocab` instance and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced @@ -154,7 +167,7 @@ with the current vocab. > kb.add_alias(...) > return kb > entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.set_kb(lambda: [], nlp=nlp, kb_loader=create_kb) +> entity_linker.set_kb(create_kb) > ``` | Name | Description | @@ -210,10 +223,10 @@ if there is no prediction. > kb_ids = entity_linker.predict([doc1, doc2]) > ``` -| Name | Description | -| ----------- | ------------------------------------------- | -| `docs` | The documents to predict. ~~Iterable[Doc]~~ | -| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | ## EntityLinker.set_annotations {#set_annotations tag="method"} @@ -237,8 +250,7 @@ entities. Learn from a batch of [`Example`](/api/example) objects, updating both the pipe's entity linking model and context encoder. Delegates to -[`predict`](/api/entitylinker#predict) and -[`set_annotations`](/api/entitylinker#set_annotations). +[`predict`](/api/entitylinker#predict). > #### Example > @@ -248,14 +260,14 @@ pipe's entity linking model and context encoder. 
Delegates to > losses = entity_linker.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## EntityLinker.score {#score tag="method" new="3"} @@ -339,6 +351,42 @@ Load the pipe from disk. Modifies the object in place and returns it. | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | | **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ | +## EntityLinker.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker_bytes = entity_linker.to_bytes() +> ``` + +Serialize the pipe to a bytestring, including the `KnowledgeBase`. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `EntityLinker` object. ~~bytes~~ | + +## EntityLinker.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> entity_linker_bytes = entity_linker.to_bytes() +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker.from_bytes(entity_linker_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `EntityLinker` object. ~~EntityLinker~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index dd969d14b..ba7022c14 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -20,6 +20,24 @@ your entities will be close to their initial tokens. 
If your entities are long and characterized by tokens in their middle, the component will likely not be a good fit for your task.
+## Assigned Attributes {#assigned-attributes}
+
+Predictions will be saved to `Doc.ents` as a tuple. Each label will also be
+reflected in each underlying token, where it is saved in the `Token.ent_type`
+and `Token.ent_iob` fields. Note that by definition each token can only have one
+label.
+
+When setting `Doc.ents` to create training data, all the spans must be valid and
+non-overlapping, or an error will be thrown.
+
+| Location          | Value                                                              |
+| ----------------- | ------------------------------------------------------------------ |
+| `Doc.ents`        | The annotated spans. ~~Tuple[Span]~~                               |
+| `Token.ent_iob`   | An enum encoding of the IOB part of the named entity tag. ~~int~~  |
+| `Token.ent_iob_`  | The IOB part of the named entity tag. ~~str~~                      |
+| `Token.ent_type`  | The label part of the named entity tag (hash). ~~int~~             |
+| `Token.ent_type_` | The label part of the named entity tag. ~~str~~                    |
+
## Config and implementation {#config}

The default config is defined by the pipeline component factory and describes
@@ -37,6 +55,7 @@ architectures and their arguments and hyperparameters.
> "moves": None,
> "update_with_oracle_cut_size": 100,
> "model": DEFAULT_NER_MODEL,
+> "incorrect_spans_key": "incorrect_spans",
> }
> nlp.add_pipe("ner", config=config)
> ```
@@ -46,6 +65,7 @@ architectures and their arguments and hyperparameters.
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/ner.pyx
```

@@ -72,14 +92,15 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).

-| Name                          | Description                                                                                                                                                                                                                                                  |
-| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `vocab`                       | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                             |
-| `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                         |
-| `name`                        | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                                                                                                                         |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~                                                                                                                                                                  |
-| _keyword-only_                |                                                                                                                                                                                                                                                              |
-| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it.
`100` is a good default. ~~int~~ | +| Name | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ | +| _keyword-only_ | | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group in [`Doc.spans`](/api/doc#spans), under this key. Defaults to `None`. ~~Optional[str]~~ | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} @@ -209,9 +230,8 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. ## EntityRecognizer.update {#update tag="method"} Learn from a batch of [`Example`](/api/example) objects, updating the pipe's -model. Delegates to [`predict`](/api/entityrecognizer#predict), -[`get_loss`](/api/entityrecognizer#get_loss) and -[`set_annotations`](/api/entityrecognizer#set_annotations). +model. Delegates to [`predict`](/api/entityrecognizer#predict) and +[`get_loss`](/api/entityrecognizer#get_loss). > #### Example > @@ -221,14 +241,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict), > losses = ner.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. 
~~Dict[str, float]~~ | ## EntityRecognizer.get_loss {#get_loss tag="method"} diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 76a4b3604..c9c3ec365 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -15,6 +15,27 @@ used on its own to implement a purely rule-based entity recognition system. For usage examples, see the docs on [rule-based entity recognition](/usage/rule-based-matching#entityruler). +## Assigned Attributes {#assigned-attributes} + +This component assigns predictions basically the same way as the +[`EntityRecognizer`](/api/entityrecognizer). + +Predictions can be accessed under `Doc.ents` as a tuple. Each label will also be +reflected in each underlying token, where it is saved in the `Token.ent_type` +and `Token.ent_iob` fields. Note that by definition each token can only have one +label. + +When setting `Doc.ents` to create training data, all the spans must be valid and +non-overlapping, or an error will be thrown. + +| Location | Value | +| ----------------- | ----------------------------------------------------------------- | +| `Doc.ents` | The annotated spans. ~~Tuple[Span]~~ | +| `Token.ent_iob` | An enum encoding of the IOB part of the named entity tag. ~~int~~ | +| `Token.ent_iob_` | The IOB part of the named entity tag. ~~str~~ | +| `Token.ent_type` | The label part of the named entity tag (hash). ~~int~~ | +| `Token.ent_type_` | The label part of the named entity tag. ~~str~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -39,7 +60,7 @@ how the component should be configured. You can override its settings via the | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entityruler.py @@ -71,7 +92,7 @@ be a token pattern (list) or a phrase pattern (string). For example: | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | ## EntityRuler.initialize {#initialize tag="method" new="3"} @@ -267,7 +288,7 @@ All labels present in the match patterns. 
| ----------- | -------------------------------------- | | **RETURNS** | The string labels. ~~Tuple[str, ...]~~ | -## EntityRuler.ent_ids {#labels tag="property" new="2.2.2"} +## EntityRuler.ent_ids {#ent_ids tag="property" new="2.2.2"} All entity IDs present in the `id` properties of the match patterns. diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 2811f4d91..ca9d3c056 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -33,8 +33,8 @@ both documents. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------ | -| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | -| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ | +| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | +| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ | | _keyword-only_ | | | `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ | @@ -56,11 +56,11 @@ see the [training format documentation](/api/data-formats#dict-input). > example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref}) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------- | -| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | -| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ | -| **RETURNS** | The newly constructed object. ~~Example~~ | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------- | +| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | +| `example_dict` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ | +| **RETURNS** | The newly constructed object. ~~Example~~ | ## Example.text {#text tag="property"} @@ -211,10 +211,11 @@ align to the tokenization in [`Example.predicted`](/api/example#predicted). > assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)] > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------------------- | -| `y_spans` | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~ | -| **RETURNS** | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~ | +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------- | +| `y_spans` | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~ | +| `allow_overlap` | Whether the resulting `Span` objects may overlap or not. Set to `False` by default. ~~bool~~ | +| **RETURNS** | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~ | ## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"} @@ -238,10 +239,11 @@ against the original gold-standard annotation. > assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)] > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------------------- | -| `x_spans` | `Span` objects aligned to the tokenization of `predicted`. 
~~Iterable[Span]~~ | -| **RETURNS** | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~ | +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------- | +| `x_spans` | `Span` objects aligned to the tokenization of `predicted`. ~~Iterable[Span]~~ | +| `allow_overlap` | Whether the resulting `Span` objects may overlap or not. Set to `False` by default. ~~bool~~ | +| **RETURNS** | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~ | ## Example.to_dict {#to_dict tag="method"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index 855dead27..e7a8fcd6f 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -82,7 +82,7 @@ Add an alias or mention to the knowledge base, specifying its potential KB identifiers and their prior probabilities. The entity identifiers should refer to entities previously added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities). The sum of the prior probabilities -should not exceed 1. +should not exceed 1. Note that an empty string can not be used as alias. > #### Example > @@ -92,7 +92,7 @@ should not exceed 1. | Name | Description | | --------------- | --------------------------------------------------------------------------------- | -| `alias` | The textual mention or alias. ~~str~~ | +| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ | | `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ | | `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ | @@ -152,7 +152,7 @@ Get a list of all aliases in the knowledge base. | ----------- | -------------------------------------------------------- | | **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ | -## KnowledgeBase.get_candidates {#get_candidates tag="method"} +## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"} Given a certain textual mention as input, retrieve a list of candidate entities of type [`Candidate`](/api/kb/#candidate). @@ -160,13 +160,13 @@ of type [`Candidate`](/api/kb/#candidate). > #### Example > > ```python -> candidates = kb.get_candidates("Douglas") +> candidates = kb.get_alias_candidates("Douglas") > ``` -| Name | Description | -| ----------- | ------------------------------------- | -| `alias` | The textual mention or alias. ~~str~~ | -| **RETURNS** | iterable | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| `alias` | The textual mention or alias. ~~str~~ | +| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | ## KnowledgeBase.get_vector {#get_vector tag="method"} @@ -245,8 +245,8 @@ certain prior probability. ### Candidate.\_\_init\_\_ {#candidate-init tag="method"} Construct a `Candidate` object. Usually this constructor is not called directly, -but instead these objects are returned by the -[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`. +but instead these objects are returned by the `get_candidates` method of the +[`entity_linker`](/api/entitylinker) pipe. 
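For reference, here is a minimal sketch of how `Candidate` objects are typically obtained in practice; the entity ID, frequency, vector and prior probability below are made up purely for illustration:

```python
import spacy
from spacy.kb import KnowledgeBase

# Build a small knowledge base with one entity and one alias. The IDs,
# frequency, vector and prior probability are illustrative values only.
nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 0.0, 0.5])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

# Each Candidate pairs the alias with one plausible entity and its prior probability
for candidate in kb.get_alias_candidates("Douglas"):
    print(candidate.entity_, candidate.alias_, candidate.prior_prob)
```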
> #### Example > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 382415416..d0d6b9514 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -198,11 +198,32 @@ more efficient than processing texts one-by-one. | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ | | `batch_size` | The number of texts to buffer. ~~Optional[int]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | -| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | | `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ | +## Language.set_error_handler {#set_error_handler tag="method" new="3"} + +Define a callback that will be invoked when an error is thrown during processing +of one or more documents. Specifically, this function will call +[`set_error_handler`](/api/pipe#set_error_handler) on all the pipeline +components that define that function. The error handler will be invoked with the +original component's name, the component itself, the list of documents that was +being processed, and the original error. + +> #### Example +> +> ```python +> def warn_error(proc_name, proc, docs, e): +> print(f"An error occurred when applying component {proc_name}.") +> +> nlp.set_error_handler(warn_error) +> ``` + +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------- | +| `error_handler` | A function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception]~~ | + ## Language.initialize {#initialize tag="method" new="3"} Initialize the pipeline for training and return an @@ -342,7 +363,7 @@ Evaluate a pipeline's components. -The `Language.update` method now takes a batch of [`Example`](/api/example) +The `Language.evaluate` method now takes a batch of [`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse` objects. @@ -405,7 +426,8 @@ component, adds it to the pipeline and returns it. > ```python > @Language.component("component") > def component_func(doc): -> # modify Doc and return it return doc +> # modify Doc and return it +> return doc > > nlp.add_pipe("component", before="ner") > component = nlp.add_pipe("component", name="custom_name", last=True) @@ -424,7 +446,7 @@ component, adds it to the pipeline and returns it. | `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ | | `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ | | `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ | -| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ | +| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~ | | `source` 3 | Optional source pipeline to copy component from. 
If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |

@@ -454,7 +476,7 @@ To create a component and add it to the pipeline, you should always use
| `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | |
-| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
+| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~ |
| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |

@@ -811,6 +833,51 @@ token.ent_iob, token.ent_type
| `pretty` | Pretty-print the results as a table. Defaults to `False`. ~~bool~~ |
| **RETURNS** | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). ~~Optional[Dict[str, Any]]~~ |

+## Language.replace_listeners {#replace_listeners tag="method" new="3"}
+
+Find [listener layers](/usage/embeddings-transformers#embedding-layers)
+(connecting to a shared token-to-vector embedding component) of a given pipeline
+component model and replace them with a standalone copy of the token-to-vector
+layer. The listener layer allows other components to connect to a shared
+token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
+[`Transformer`](/api/transformer). Replacing listeners can be useful when
+training a pipeline with components sourced from an existing pipeline: if
+multiple components (e.g. tagger, parser, NER) listen to the same
+token-to-vector component, but some of them are frozen and not updated, their
+performance may degrade significantly as the token-to-vector component is updated
+with new data. To prevent this, listeners can be replaced with a standalone
+token-to-vector layer that is owned by the component and doesn't change if the
+component isn't updated.
+
+This method is typically not called directly and only executed under the hood
+when loading a config with
+[sourced components](/usage/training#config-components) that define
+`replace_listeners`.
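It can, however, also be called directly, for instance when sourcing a single component into a new pipeline. The following is a rough sketch of that workflow and assumes the `en_core_web_sm` pipeline (whose tagger listens to a shared `tok2vec` component) is installed:

```python
import spacy

# Give the source pipeline's tagger its own copy of the token-to-vector layer ...
source_nlp = spacy.load("en_core_web_sm")
source_nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])

# ... so it can be copied into another pipeline without the shared tok2vec component.
nlp = spacy.blank("en")
nlp.add_pipe("tagger", source=source_nlp)
doc = nlp("The sourced tagger now owns a standalone tok2vec layer.")
print([(token.text, token.tag_) for token in doc])
```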
+ +> ```python +> ### Example +> nlp = spacy.load("en_core_web_sm") +> nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"]) +> ``` +> +> ```ini +> ### config.cfg (excerpt) +> [training] +> frozen_components = ["tagger"] +> +> [components] +> +> [components.tagger] +> source = "en_core_web_sm" +> replace_listeners = ["model.tok2vec"] +> ``` + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~ | +| `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ | +| `listeners` | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced.~~Iterable[str]~~ | + ## Language.meta {#meta tag="property"} Meta data for the `Language` class, including name, version, data sources, @@ -1010,9 +1077,9 @@ customize the default language data: | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) ~~Set[str]~~ | | `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) ~~Dict[str, List[dict]]~~ | -| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`puncutation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) ~~Optional[List[Union[str, Pattern]]]~~ | -| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/fr/tokenizer_exceptions.py) ~~Optional[Pattern]~~ | -| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/tokenizer_exceptions.py) ~~Optional[Pattern]~~ | +| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) ~~Optional[Sequence[Union[str, Pattern]]]~~ | +| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.<br />
**Example:** [`fr/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/fr/tokenizer_exceptions.py) ~~Optional[Callable]~~ | +| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/tokenizer_exceptions.py) ~~Optional[Callable]~~ | | `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) ~~Dict[int, Callable[[str], Any]]~~ | | `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py). ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ | | `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}.`.
**Example:** [`zh/__init__.py`](%%GITHUB_SPACY/spacy/lang/zh/__init__.py) ~~Dict[str, Any]~~ | diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md new file mode 100644 index 000000000..916a5bf7f --- /dev/null +++ b/website/docs/api/legacy.md @@ -0,0 +1,270 @@ +--- +title: Legacy functions and architectures +teaser: Archived implementations available through spacy-legacy +source: spacy/legacy +--- + +The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes +outdated registered functions and architectures. It is installed automatically +as a dependency of spaCy, and provides backwards compatibility for archived +functions that may still be used in projects. + +You can find the detailed documentation of each such legacy function on this +page. + +## Architectures {#architectures} + +These functions are available from `@spacy.registry.architectures`. + +### spacy.Tok2Vec.v1 {#Tok2Vec_v1} + +The `spacy.Tok2Vec.v1` architecture was expecting an `encode` model of type +`Model[Floats2D, Floats2D]` such as `spacy.MaxoutWindowEncoder.v1` or +`spacy.MishWindowEncoder.v1`. + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.Tok2Vec.v1" +> +> [model.embed] +> @architectures = "spacy.CharacterEmbed.v1" +> # ... +> +> [model.encode] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> # ... +> ``` + +Construct a tok2vec model out of two subnetworks: one for embedding and one for +encoding. See the +["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp) +blog post for background. + +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ | +| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | + +### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder_v1} + +The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type +`Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been +changed to output type `Model[List[Floats2d], List[Floats2d]]`. + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> width = 128 +> window_size = 1 +> maxout_pieces = 3 +> depth = 4 +> ``` + +Encode context using convolutions with maxout activation, layer normalization +and residual connections. + +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. 
~~int~~ | +| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | +| `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~ | +| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | + +### spacy.MishWindowEncoder.v1 {#MishWindowEncoder_v1} + +The `spacy.MishWindowEncoder.v1` architecture was producing a model of type +`Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been +changed to output type `Model[List[Floats2d], List[Floats2d]]`. + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.MishWindowEncoder.v1" +> width = 64 +> window_size = 1 +> depth = 4 +> ``` + +Encode context using convolutions with +[`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization +and residual connections. + +| Name | Description | +| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | +| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | +| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | + +### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} + +Identical to +[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) +except the `use_upper` was set to `True` by default. + +### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1} + +The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and +`linear_model`. Since `spacy.TextCatEnsemble.v2`, this has been refactored so +that the `TextCatEnsemble` takes these two sublayers as input. + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatEnsemble.v1" +> exclusive_classes = false +> pretrained_vectors = null +> width = 64 +> embed_size = 2000 +> conv_depth = 2 +> window_size = 1 +> ngram_size = 1 +> dropout = null +> nO = null +> ``` + +Stacked ensemble of a bag-of-words model and a neural network model. The neural +network has an internal CNN Tok2Vec layer and uses attention. + +| Name | Description | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | +| `width` | Output dimension of the feature encoding step. ~~int~~ | +| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | +| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | +| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. 
~~int~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | +| `dropout` | The dropout rate. ~~float~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + +### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1} + +Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except +using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included. + +### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1} + +Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed) +except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are +included. + +### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1} + +Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed) +except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are +included. + +## Layers {#layers} + +These functions are available from `@spacy.registry.layers`. + +### spacy.StaticVectors.v1 {#StaticVectors_v1} + +Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except +for the handling of tokens without vectors. + + + +`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the +vectors table, which causes the model predictions to change if new vectors are +added to an existing vectors table. See more details in +[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655). + + + +### spacy.TextCatCNN.v1 {#TextCatCNN_v1} + +Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means +that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not +yet support that. + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatCNN.v1" +> exclusive_classes = false +> nO = null +> +> [model.tok2vec] +> @architectures = "spacy.HashEmbedCNN.v1" +> pretrained_vectors = null +> width = 96 +> depth = 4 +> embed_size = 2000 +> window_size = 1 +> maxout_pieces = 3 +> subword_features = true +> ``` + +A neural network model where token vectors are calculated using a CNN. The +vectors are mean pooled and used as features in a feed-forward network. This +architecture is usually less accurate than the ensemble, but runs faster. + +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + +### spacy.TextCatBOW.v1 {#TextCatBOW_v1} + +Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means +that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not +yet support that. 
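If you want to build one of these legacy layers directly in Python rather than via the config, a rough sketch is to resolve it from the registry by its versioned name; the argument values below are illustrative only:

```python
import spacy

# spacy-legacy registers the archived architectures on import, so the
# versioned name can be looked up in the architectures registry.
make_textcat_bow_v1 = spacy.registry.architectures.get("spacy.TextCatBOW.v1")
model = make_textcat_bow_v1(
    exclusive_classes=False,
    ngram_size=1,
    no_output_layer=False,
    nO=None,
)
print(model.name)
```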
+ +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatBOW.v1" +> exclusive_classes = false +> ngram_size = 1 +> no_output_layer = false +> nO = null +> ``` + +An n-gram "bag-of-words" model. This architecture should run much faster than +the others, but may not be as accurate, especially if texts are short. + +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + +## Loggers {#loggers} + +These functions are available from `@spacy.registry.loggers`. + +### spacy.WandbLogger.v1 {#WandbLogger_v1} + +The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet +support the `log_dataset_dir` and `model_log_interval` arguments. + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.WandbLogger.v1" +> project_name = "monitor_spacy_training" +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] +> ``` +> +> | Name | Description | +> | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +> | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | +> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index e838c75b2..8cb869f64 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -4,7 +4,6 @@ tag: class source: spacy/pipeline/lemmatizer.py new: 3 teaser: 'Pipeline component for lemmatization' -api_base_class: /api/pipe api_string_name: lemmatizer api_trainable: false --- @@ -32,6 +31,15 @@ available in the pipeline and runs _before_ the lemmatizer. +## Assigned Attributes {#assigned-attributes} + +Lemmas generated by rules or predicted will be saved to `Token.lemma`. + +| Location | Value | +| -------------- | ------------------------- | +| `Token.lemma` | The lemma (hash). ~~int~~ | +| `Token.lemma_` | The lemma. ~~str~~ | + ## Config and implementation The default config is defined by the pipeline component factory and describes @@ -48,11 +56,36 @@ data format used by the lookup and rule-based lemmatizers, see > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Description | -| ----------- | --------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. 
~~str~~ | -| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | -| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| Setting | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | + +Many languages specify a default lemmatizer mode other than `lookup` if a better +lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require +[`token.pos`](/api/token) from a previous pipeline component (see example +pipeline configurations in the +[pretrained pipeline design details](/models#design-cnn)) or rely on third-party +libraries (`pymorphy2`). + +| Language | Default Mode | +| -------- | ------------ | +| `bn` | `rule` | +| `ca` | `pos_lookup` | +| `el` | `rule` | +| `en` | `rule` | +| `es` | `rule` | +| `fa` | `rule` | +| `fr` | `rule` | +| `it` | `pos_lookup` | +| `mk` | `rule` | +| `nb` | `rule` | +| `nl` | `rule` | +| `pl` | `pos_lookup` | +| `ru` | `pymorphy2` | +| `sv` | `rule` | +| `uk` | `pymorphy2` | ```python %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py @@ -67,7 +100,7 @@ data format used by the lookup and rule-based lemmatizers, see > lemmatizer = nlp.add_pipe("lemmatizer") > > # Construction via add_pipe with custom settings -> config = {"mode": "rule", overwrite=True} +> config = {"mode": "rule", "overwrite": True} > lemmatizer = nlp.add_pipe("lemmatizer", config=config) > ``` diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index a7e1d1ca0..c5d4b7544 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -127,14 +127,14 @@ The L2 norm of the lexeme's vector representation. | `text` | Verbatim text content. ~~str~~ | | `orth` | ID of the verbatim text content. ~~int~~ | | `orth_` | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~ | -| `rank` | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `rank` | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `flags` | Container of the lexeme's binary flags. ~~int~~ | -| `norm` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~int~~ | -| `norm_` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~str~~ | +| `norm` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~ | +| `norm_` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~ | | `lower` | Lowercase form of the word. ~~int~~ | | `lower_` | Lowercase form of the word. ~~str~~ | -| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | -| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. 
For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `shape` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ | | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ | | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ | @@ -155,7 +155,7 @@ The L2 norm of the lexeme's vector representation. | `like_url` | Does the lexeme resemble a URL? ~~bool~~ | | `like_num` | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | | `like_email` | Does the lexeme resemble an email address? ~~bool~~ | -| `is_oov` | Does the lexeme have a word vector? ~~bool~~ | +| `is_oov` | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | | `is_stop` | Is the lexeme part of a "stop list"? ~~bool~~ | | `lang` | Language of the parent vocabulary. ~~int~~ | | `lang_` | Language of the parent vocabulary. ~~str~~ | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 7c39d9caf..c34560dec 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -77,13 +77,14 @@ it compares to another value. > ] > ``` -| Attribute | Description | -| -------------------------- | ------------------------------------------------------------------------------------------------------- | -| `IN` | Attribute value is member of a list. ~~Any~~ | -| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | -| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ | -| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ | -| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | +| Attribute | Description | +| -------------------------- | -------------------------------------------------------------------------------------------------------- | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ | +| `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | ## Matcher.\_\_init\_\_ {#init tag="method"} @@ -120,12 +121,14 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. 
> matches = matcher(doc) > ``` -| Name | Description | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | -| _keyword-only_ | | -| `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | -| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | +| Name | Description | +| ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | +| _keyword-only_ | | +| `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | +| `allow_missing` 3 | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ | +| `with_alignments` 3.0.6 | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | ## Matcher.\_\_len\_\_ {#len tag="method" new="2"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 9cda478c8..00af83e6f 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -15,6 +15,16 @@ coarse-grained POS tags following the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) annotation guidelines. +## Assigned Attributes {#assigned-attributes} + +Predictions are saved to `Token.morph` and `Token.pos`. + +| Location | Value | +| ------------- | ----------------------------------------- | +| `Token.pos` | The UPOS part of speech (hash). ~~int~~ | +| `Token.pos_` | The UPOS part of speech. ~~str~~ | +| `Token.morph` | Morphological features. 
~~MorphAnalysis~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -61,11 +71,11 @@ shortcut for this and instantiate the component using its string name and > morphologizer = Morphologizer(nlp.vocab, model) > ``` -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ------- | -------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} @@ -189,9 +199,8 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. -Delegates to [`predict`](/api/morphologizer#predict), -[`get_loss`](/api/morphologizer#get_loss) and -[`set_annotations`](/api/morphologizer#set_annotations). +Delegates to [`predict`](/api/morphologizer#predict) and +[`get_loss`](/api/morphologizer#get_loss). > #### Example > @@ -201,14 +210,14 @@ Delegates to [`predict`](/api/morphologizer#predict), > losses = morphologizer.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Morphologizer.get_loss {#get_loss tag="method"} diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index e64f26bdd..20fcd1a40 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -98,9 +98,9 @@ representation. 
> assert f == "Feat1=Val1|Feat2=Val2" > ``` -| Name | Description | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | +| Name | Description | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | | **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | ## Attributes {#attributes} diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 47bbdcf6a..2cef9ac2a 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -44,7 +44,7 @@ be shown. ## PhraseMatcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. +Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > #### Example > @@ -59,7 +59,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. | Name | Description | | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | The document to match over. ~~Doc~~ | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | | _keyword-only_ | | | `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | @@ -150,7 +150,7 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")] | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `match_id` | An ID for the thing you're matching. ~~str~~ | | +| `key` | An ID for the thing you're matching. ~~str~~ | | `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ | | _keyword-only_ | | | `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple], Any]]~~ | diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 027d1e1c0..2f856c667 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -100,6 +100,47 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. 
~~Doc~~ | +## TrainablePipe.set_error_handler {#set_error_handler tag="method" new="3"} + +Define a callback that will be invoked when an error is thrown during processing +of one or more documents with either [`__call__`](/api/pipe#call) or +[`pipe`](/api/pipe#pipe). The error handler will be invoked with the original +component's name, the component itself, the list of documents that was being +processed, and the original error. + +> #### Example +> +> ```python +> def warn_error(proc_name, proc, docs, e): +> print(f"An error occurred when applying component {proc_name}.") +> +> pipe = nlp.add_pipe("ner") +> pipe.set_error_handler(warn_error) +> ``` + +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------- | +| `error_handler` | A function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception]~~ | + +## TrainablePipe.get_error_handler {#get_error_handler tag="method" new="3"} + +Retrieve the callback that performs error handling for this component's +[`__call__`](/api/pipe#call) and [`pipe`](/api/pipe#pipe) methods. If no custom +function was previously defined with +[`set_error_handler`](/api/pipe#set_error_handler), a default function is +returned that simply reraises the exception. + +> #### Example +> +> ```python +> pipe = nlp.add_pipe("ner") +> error_handler = pipe.get_error_handler() +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------- | +| **RETURNS** | The function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception]~~ | + ## TrainablePipe.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that @@ -190,14 +231,14 @@ predictions and gold-standard annotations, and update the component's model. > losses = pipe.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. 
~~Dict[str, float]~~ | ## TrainablePipe.rehearse {#rehearse tag="method,experimental" new="3"} diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 628d36000..a776eca9b 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -113,8 +113,7 @@ end of the pipeline and after all other components. Split tokens longer than a minimum length into shorter tokens. Intended for use with transformer pipelines where long spaCy tokens lead to input text that -exceed the transformer model max length. See -[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length). +exceed the transformer model max length. > #### Example > diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index fb48d68cc..c8163091f 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -46,7 +46,10 @@ attribute being scored: - `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` - `dep_uas`, `dep_las`, `dep_las_per_type` - `ents_p`, `ents_r` `ents_f`, `ents_per_type` -- `textcat_macro_auc`, `textcat_macro_f` +- `cats_score` (depends on config, description provided in `cats_score_desc`), + `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`, + `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`, + `cats_auc_per_type` > #### Example > @@ -144,6 +147,8 @@ Returns PRF scores for labeled or unlabeled spans. | _keyword-only_ | | | `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | | `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~str~~ | +| `labeled` | Defaults to `True`. If set to `False`, two spans will be considered equal if their start and end match, irrespective of their label. ~~bool~~ | +| `allow_overlap` | Defaults to `False`. Whether or not to allow overlapping spans. If set to `False`, the alignment will automatically resolve conflicts. ~~bool~~ | | **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 23c8e87d9..8d8e57319 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -12,6 +12,16 @@ api_trainable: true A trainable pipeline component for sentence segmentation. For a simpler, rule-based strategy, see the [`Sentencizer`](/api/sentencizer). +## Assigned Attributes {#assigned-attributes} + +Predicted values will be assigned to `Token.is_sent_start`. The resulting +sentences can be accessed using `Doc.sents`. + +| Location | Value | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. This will be either `True` or `False` for all tokens. ~~bool~~ | +| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. 
~~Iterator[Span]~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -176,9 +186,8 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. -Delegates to [`predict`](/api/sentencerecognizer#predict), -[`get_loss`](/api/sentencerecognizer#get_loss) and -[`set_annotations`](/api/sentencerecognizer#set_annotations). +Delegates to [`predict`](/api/sentencerecognizer#predict) and +[`get_loss`](/api/sentencerecognizer#get_loss). > #### Example > @@ -188,14 +197,14 @@ Delegates to [`predict`](/api/sentencerecognizer#predict), > losses = senter.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"} diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 2cd49127d..ef2465c27 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -13,6 +13,16 @@ performed by the [`DependencyParser`](/api/dependencyparser), so the `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't require a statistical model to be loaded. +## Assigned Attributes {#assigned-attributes} + +Calculated values will be assigned to `Token.is_sent_start`. The resulting +sentences can be accessed using `Doc.sents`. + +| Location | Value | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. This will be either `True` or `False` for all tokens. ~~bool~~ | +| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. ~~Iterator[Span]~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -24,7 +34,7 @@ how the component should be configured. 
You can override its settings via the > > ```python > config = {"punct_chars": None} -> nlp.add_pipe("entity_ruler", config=config) +> nlp.add_pipe("sentencizer", config=config) > ``` | Setting | Description | diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 37d18c62e..2938b4253 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,14 +18,15 @@ Create a `Span` object from the slice `doc[start : end]`. > assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Description | -| -------- | --------------------------------------------------------------------------------------- | -| `doc` | The parent document. ~~Doc~~ | -| `start` | The index of the first token of the span. ~~int~~ | -| `end` | The index of the first token after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ | -| `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| Name | Description | +| ------------- | --------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `start` | The index of the first token of the span. ~~int~~ | +| `end` | The index of the first token after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `vector_norm` | The L2 norm of the document's vector representation. ~~float~~ | +| `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ | ## Span.\_\_getitem\_\_ {#getitem tag="method"} @@ -303,6 +304,10 @@ not been implemeted for the given language, a `NotImplementedError` is raised. Create a new `Doc` object corresponding to the `Span`, with a copy of the data. +When calling this on many spans from the same doc, passing in a precomputed +array representation of the doc using the `array_head` and `array` args can save +time. + > #### Example > > ```python @@ -312,10 +317,12 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------- | -| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ | -| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------- | +| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ | +| `array_head` | Precomputed array attributes (headers) of the original doc, as generated by `Doc._get_array_attrs()`. ~~Tuple~~ | +| `array` | Precomputed array version of the original doc as generated by [`Doc.to_array`](/api/doc#to_array). ~~numpy.ndarray~~ | +| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ | ## Span.root {#root tag="property" model="parser"} @@ -483,13 +490,40 @@ The L2 norm of the span's vector representation. | ----------- | --------------------------------------------------- | | **RETURNS** | The L2 norm of the vector representation. 
~~float~~ | +## Span.sent {#sent tag="property" model="sentences"} + +The sentence span that this span is a part of. This property is only available +when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. + +If the span happens to cross sentence boundaries, only the first sentence will +be returned. If it is required that the sentence always includes the full span, +the result can be adjusted as such: + +```python +sent = span.sent +sent = doc[sent.start : max(sent.end, span.end)] +``` + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> span = doc[1:3] +> assert span.sent.text == "Give it back!" +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------- | +| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ | + ## Attributes {#attributes} | Name | Description | | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | `doc` | The parent document. ~~Doc~~ | | `tensor` 2.1.7 | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | -| `sent` | The sentence span that this span is a part of. ~~Span~~ | | `start` | The token offset for the start of the span. ~~int~~ | | `end` | The token offset for the end of the span. ~~int~~ | | `start_char` | The character offset for the start of the span. ~~int~~ | diff --git a/website/docs/api/multilabel_textcategorizer.md b/website/docs/api/spancategorizer.md similarity index 51% rename from website/docs/api/multilabel_textcategorizer.md rename to website/docs/api/spancategorizer.md index d74f7ad9d..4edc6fb5b 100644 --- a/website/docs/api/multilabel_textcategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -1,17 +1,33 @@ --- -title: Multi-label TextCategorizer -tag: class -source: spacy/pipeline/textcat_multilabel.py -new: 3 -teaser: 'Pipeline component for multi-label text classification' +title: SpanCategorizer +tag: class,experimental +source: spacy/pipeline/spancat.py +new: 3.1 +teaser: 'Pipeline component for labeling potentially overlapping spans of text' api_base_class: /api/pipe -api_string_name: textcat_multilabel +api_string_name: spancat api_trainable: true --- -The text categorizer predicts **categories over a whole document**. It -learns non-mutually exclusive labels, which means that zero or more labels -may be true per document. +A span categorizer consists of two parts: a [suggester function](#suggesters) +that proposes candidate spans, which may or may not overlap, and a labeler model +that predicts zero or more labels for each candidate. + +Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc. +Individual span scores can be found in `spangroup.attrs["scores"]`. + +## Assigned Attributes {#assigned-attributes} + +Predictions will be saved to `Doc.spans[spans_key]` as a +[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will +be saved in `SpanGroup.attrs["scores"]`. + +`spans_key` defaults to `"sc"`, but can be passed as a parameter. + +| Location | Value | +| -------------------------------------- | -------------------------------------------------------- | +| `Doc.spans[spans_key]` | The annotated spans. ~~SpanGroup~~ | +| `Doc.spans[spans_key].attrs["scores"]` | The score for each span in the `SpanGroup`. 
~~Floats1d~~ | ## Config and implementation {#config} @@ -25,68 +41,77 @@ architectures and their arguments and hyperparameters. > #### Example > > ```python -> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL +> from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL > config = { -> "threshold": 0.5, -> "model": DEFAULT_MULTI_TEXTCAT_MODEL, +> "threshold": 0.5, +> "spans_key": "labeled_spans", +> "max_positive": None, +> "model": DEFAULT_SPANCAT_MODEL, +> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, > } -> nlp.add_pipe("textcat_multilabel", config=config) +> nlp.add_pipe("spancat", config=config) > ``` -| Setting | Description | -| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | -| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. 
~~Optional[int]~~ | ```python -%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py +%%GITHUB_SPACY/spacy/pipeline/spancat.py ``` -## MultiLabel_TextCategorizer.\_\_init\_\_ {#init tag="method"} +## SpanCategorizer.\_\_init\_\_ {#init tag="method"} > #### Example > > ```python > # Construction via add_pipe with default model -> textcat = nlp.add_pipe("textcat_multilabel") +> spancat = nlp.add_pipe("spancat") > > # Construction via add_pipe with custom model -> config = {"model": {"@architectures": "my_textcat"}} -> parser = nlp.add_pipe("textcat_multilabel", config=config) +> config = {"model": {"@architectures": "my_spancat"}} +> parser = nlp.add_pipe("spancat", config=config) > > # Construction from class -> from spacy.pipeline import MultiLabel_TextCategorizer -> textcat = MultiLabel_TextCategorizer(nlp.vocab, model, threshold=0.5) +> from spacy.pipeline import SpanCategorizer +> spancat = SpanCategorizer(nlp.vocab, model, suggester) > ``` Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | -## MultiLabel_TextCategorizer.\_\_call\_\_ {#call tag="method"} +## SpanCategorizer.\_\_call\_\_ {#call tag="method"} Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. 
Both -[`__call__`](/api/multilabel_textcategorizer#call) and [`pipe`](/api/multilabel_textcategorizer#pipe) -delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and -[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods. +[`__call__`](/api/spancategorizer#call) and [`pipe`](/api/spancategorizer#pipe) +delegate to the [`predict`](/api/spancategorizer#predict) and +[`set_annotations`](/api/spancategorizer#set_annotations) methods. > #### Example > > ```python > doc = nlp("This is a sentence.") -> textcat = nlp.add_pipe("textcat_multilabel") +> spancat = nlp.add_pipe("spancat") > # This usually happens under the hood -> processed = textcat(doc) +> processed = spancat(doc) > ``` | Name | Description | @@ -94,20 +119,20 @@ delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | -## MultiLabel_TextCategorizer.pipe {#pipe tag="method"} +## SpanCategorizer.pipe {#pipe tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are -applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer#call) and -[`pipe`](/api/multilabel_textcategorizer#pipe) delegate to the -[`predict`](/api/multilabel_textcategorizer#predict) and -[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods. +applied to the `Doc` in order. Both [`__call__`](/api/spancategorizer#call) and +[`pipe`](/api/spancategorizer#pipe) delegate to the +[`predict`](/api/spancategorizer#predict) and +[`set_annotations`](/api/spancategorizer#set_annotations) methods. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat_multilabel") -> for doc in textcat.pipe(docs, batch_size=50): +> spancat = nlp.add_pipe("spancat") +> for doc in spancat.pipe(docs, batch_size=50): > pass > ``` @@ -118,7 +143,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## MultiLabel_TextCategorizer.initialize {#initialize tag="method" new="3"} +## SpanCategorizer.initialize {#initialize tag="method"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are @@ -132,36 +157,30 @@ arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. - - -This method was previously called `begin_training`. 
- - - > #### Example > > ```python -> textcat = nlp.add_pipe("textcat_multilabel") -> textcat.initialize(lambda: [], nlp=nlp) +> spancat = nlp.add_pipe("spancat") +> spancat.initialize(lambda: [], nlp=nlp) > ``` > > ```ini > ### config.cfg -> [initialize.components.textcat_multilabel] +> [initialize.components.spancat] > -> [initialize.components.textcat_multilabel.labels] +> [initialize.components.spancat.labels] > @readers = "spacy.read_labels.v1" -> path = "corpus/labels/textcat.json +> path = "corpus/labels/spancat.json > ``` -| Name | Description | -| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | -## MultiLabel_TextCategorizer.predict {#predict tag="method"} +## SpanCategorizer.predict {#predict tag="method"} Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. @@ -169,8 +188,8 @@ modifying them. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat_multilabel") -> scores = textcat.predict([doc1, doc2]) +> spancat = nlp.add_pipe("spancat") +> scores = spancat.predict([doc1, doc2]) > ``` | Name | Description | @@ -178,60 +197,36 @@ modifying them. | `docs` | The documents to predict. ~~Iterable[Doc]~~ | | **RETURNS** | The model's prediction for each document. 
| -## MultiLabel_TextCategorizer.set_annotations {#set_annotations tag="method"} +## SpanCategorizer.set_annotations {#set_annotations tag="method"} Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat_multilabel") -> scores = textcat.predict(docs) -> textcat.set_annotations(docs, scores) +> spancat = nlp.add_pipe("spancat") +> scores = spancat.predict(docs) +> spancat.set_annotations(docs, scores) > ``` | Name | Description | | -------- | --------------------------------------------------------- | | `docs` | The documents to modify. ~~Iterable[Doc]~~ | -| `scores` | The scores to set, produced by `MultiLabel_TextCategorizer.predict`. | +| `scores` | The scores to set, produced by `SpanCategorizer.predict`. | -## MultiLabel_TextCategorizer.update {#update tag="method"} +## SpanCategorizer.update {#update tag="method"} Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. -Delegates to [`predict`](/api/multilabel_textcategorizer#predict), -[`get_loss`](/api/multilabel_textcategorizer#get_loss) and -[`set_annotations`](/api/multilabel_textcategorizer#set_annotations). +Delegates to [`predict`](/api/spancategorizer#predict) and +[`get_loss`](/api/spancategorizer#get_loss). > #### Example > > ```python -> textcat = nlp.add_pipe("textcat_multilabel") +> spancat = nlp.add_pipe("spancat") > optimizer = nlp.initialize() -> losses = textcat.update(examples, sgd=optimizer) -> ``` - -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | - -## MultiLabel_TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} - -Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the -current model to make predictions similar to an initial model to try to address -the "catastrophic forgetting" problem. This feature is experimental. - -> #### Example -> -> ```python -> textcat = nlp.add_pipe("textcat_multilabel") -> optimizer = nlp.resume_training() -> losses = textcat.rehearse(examples, sgd=optimizer) +> losses = spancat.update(examples, sgd=optimizer) > ``` | Name | Description | @@ -243,7 +238,7 @@ the "catastrophic forgetting" problem. This feature is experimental. | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | -## MultiLabel_TextCategorizer.get_loss {#get_loss tag="method"} +## SpanCategorizer.get_loss {#get_loss tag="method"} Find the loss and gradient of loss for the batch of documents and their predicted scores. @@ -251,65 +246,65 @@ predicted scores. 
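During training, `update` calls `predict` and `get_loss` for you. The snippet
below is a minimal, hedged sketch of that cycle for a `spancat` component; the
toy sentence, the `FRUIT` label and the explicit `spans_key` value `"sc"` are
illustrative choices for this sketch, not requirements of the API.

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
# Store gold and predicted spans under an explicit key instead of relying on the default.
spancat = nlp.add_pipe("spancat", config={"spans_key": "sc"})
spancat.add_label("FRUIT")

# Build one toy Example whose reference doc carries gold spans under the same key.
pred = nlp.make_doc("I like apples and oranges")
ref = nlp.make_doc("I like apples and oranges")
ref.spans["sc"] = [Span(ref, 2, 3, label="FRUIT"), Span(ref, 4, 5, label="FRUIT")]
examples = [Example(pred, ref)]

optimizer = nlp.initialize(get_examples=lambda: examples)
for i in range(5):
    # update() delegates to predict() and get_loss() internally and applies the gradient.
    losses = spancat.update(examples, sgd=optimizer)
    print(i, losses["spancat"])
```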
> #### Example > > ```python -> textcat = nlp.add_pipe("textcat_multilabel") -> scores = textcat.predict([eg.predicted for eg in examples]) -> loss, d_loss = textcat.get_loss(examples, scores) +> spancat = nlp.add_pipe("spancat") +> scores = spancat.predict([eg.predicted for eg in examples]) +> loss, d_loss = spancat.get_loss(examples, scores) > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------- | -| `examples` | The batch of examples. ~~Iterable[Example]~~ | -| `scores` | Scores representing the model's predictions. | -| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +| Name | Description | +| -------------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~ | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## MultiLabel_TextCategorizer.score {#score tag="method" new="3"} +## SpanCategorizer.score {#score tag="method"} Score a batch of examples. > #### Example > > ```python -> scores = textcat.score(examples) +> scores = spancat.score(examples) > ``` -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | -## MultiLabel_TextCategorizer.create_optimizer {#create_optimizer tag="method"} +## SpanCategorizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat") -> optimizer = textcat.create_optimizer() +> spancat = nlp.add_pipe("spancat") +> optimizer = spancat.create_optimizer() > ``` | Name | Description | | ----------- | ---------------------------- | | **RETURNS** | The optimizer. ~~Optimizer~~ | -## MultiLabel_TextCategorizer.use_params {#use_params tag="method, contextmanager"} +## SpanCategorizer.use_params {#use_params tag="method, contextmanager"} Modify the pipe's model to use the given parameter values. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat") -> with textcat.use_params(optimizer.averages): -> textcat.to_disk("/best_model") +> spancat = nlp.add_pipe("spancat") +> with spancat.use_params(optimizer.averages): +> spancat.to_disk("/best_model") > ``` | Name | Description | | -------- | -------------------------------------------------- | | `params` | The parameter values to use in the model. ~~dict~~ | -## MultiLabel_TextCategorizer.add_label {#add_label tag="method"} +## SpanCategorizer.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already set, or if the model has already been fully [initialized](#initialize). 
Note @@ -322,8 +317,8 @@ automatically. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat") -> textcat.add_label("MY_LABEL") +> spancat = nlp.add_pipe("spancat") +> spancat.add_label("MY_LABEL") > ``` | Name | Description | @@ -331,15 +326,15 @@ automatically. | `label` | The label to add. ~~str~~ | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | -## MultiLabel_TextCategorizer.to_disk {#to_disk tag="method"} +## SpanCategorizer.to_disk {#to_disk tag="method"} Serialize the pipe to disk. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat") -> textcat.to_disk("/path/to/textcat") +> spancat = nlp.add_pipe("spancat") +> spancat.to_disk("/path/to/spancat") > ``` | Name | Description | @@ -348,15 +343,15 @@ Serialize the pipe to disk. | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -## MultiLabel_TextCategorizer.from_disk {#from_disk tag="method"} +## SpanCategorizer.from_disk {#from_disk tag="method"} Load the pipe from disk. Modifies the object in place and returns it. > #### Example > > ```python -> textcat = nlp.add_pipe("textcat") -> textcat.from_disk("/path/to/textcat") +> spancat = nlp.add_pipe("spancat") +> spancat.from_disk("/path/to/spancat") > ``` | Name | Description | @@ -364,15 +359,15 @@ Load the pipe from disk. Modifies the object in place and returns it. | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -| **RETURNS** | The modified `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~ | +| **RETURNS** | The modified `SpanCategorizer` object. ~~SpanCategorizer~~ | -## MultiLabel_TextCategorizer.to_bytes {#to_bytes tag="method"} +## SpanCategorizer.to_bytes {#to_bytes tag="method"} > #### Example > > ```python -> textcat = nlp.add_pipe("textcat") -> textcat_bytes = textcat.to_bytes() +> spancat = nlp.add_pipe("spancat") +> spancat_bytes = spancat.to_bytes() > ``` Serialize the pipe to a bytestring. @@ -381,18 +376,18 @@ Serialize the pipe to a bytestring. | -------------- | ------------------------------------------------------------------------------------------- | | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -| **RETURNS** | The serialized form of the `MultiLabel_TextCategorizer` object. ~~bytes~~ | +| **RETURNS** | The serialized form of the `SpanCategorizer` object. ~~bytes~~ | -## MultiLabel_TextCategorizer.from_bytes {#from_bytes tag="method"} +## SpanCategorizer.from_bytes {#from_bytes tag="method"} Load the pipe from a bytestring. Modifies the object in place and returns it. > #### Example > > ```python -> textcat_bytes = textcat.to_bytes() -> textcat = nlp.add_pipe("textcat") -> textcat.from_bytes(textcat_bytes) +> spancat_bytes = spancat.to_bytes() +> spancat = nlp.add_pipe("spancat") +> spancat.from_bytes(spancat_bytes) > ``` | Name | Description | @@ -400,35 +395,35 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. | `bytes_data` | The data to load from. ~~bytes~~ | | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -| **RETURNS** | The `MultiLabel_TextCategorizer` object. 
~~MultiLabel_TextCategorizer~~ | +| **RETURNS** | The `SpanCategorizer` object. ~~SpanCategorizer~~ | -## MultiLabel_TextCategorizer.labels {#labels tag="property"} +## SpanCategorizer.labels {#labels tag="property"} The labels currently added to the component. > #### Example > > ```python -> textcat.add_label("MY_LABEL") -> assert "MY_LABEL" in textcat.labels +> spancat.add_label("MY_LABEL") +> assert "MY_LABEL" in spancat.labels > ``` | Name | Description | | ----------- | ------------------------------------------------------ | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | -## MultiLabel_TextCategorizer.label_data {#label_data tag="property" new="3"} +## SpanCategorizer.label_data {#label_data tag="property"} The labels currently added to the component and their internal meta information. This is the data generated by [`init labels`](/api/cli#init-labels) and used by -[`MultiLabel_TextCategorizer.initialize`](/api/multilabel_textcategorizer#initialize) to initialize +[`SpanCategorizer.initialize`](/api/spancategorizer#initialize) to initialize the model with a pre-defined label set. > #### Example > > ```python -> labels = textcat.label_data -> textcat.initialize(lambda: [], nlp=nlp, labels=labels) +> labels = spancat.label_data +> spancat.initialize(lambda: [], nlp=nlp, labels=labels) > ``` | Name | Description | @@ -444,7 +439,7 @@ serialization by passing in the string names via the `exclude` argument. > #### Example > > ```python -> data = textcat.to_disk("/path", exclude=["vocab"]) +> data = spancat.to_disk("/path", exclude=["vocab"]) > ``` | Name | Description | @@ -452,3 +447,44 @@ serialization by passing in the string names via the `exclude` argument. | `vocab` | The shared [`Vocab`](/api/vocab). | | `cfg` | The config file. You usually don't want to exclude this. | | `model` | The binary model data. You usually don't want to exclude this. | + +## Suggesters {#suggesters tag="registered functions" source="spacy/pipeline/spancat.py"} + +### spacy.ngram_suggester.v1 {#ngram_suggester} + +> #### Example Config +> +> ```ini +> [components.spancat.suggester] +> @misc = "spacy.ngram_suggester.v1" +> sizes = [1, 2, 3] +> ``` + +Suggest all spans of the given lengths. Spans are returned as a ragged array of +integers. The array has two columns, indicating the start and end position. + +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------- | +| `sizes` | The phrase lengths to suggest. For example, `[1, 2]` will suggest phrases consisting of 1 or 2 tokens. ~~List[int]~~ | +| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | + +### spacy.ngram_range_suggester.v1 {#ngram_range_suggester} + +> #### Example Config +> +> ```ini +> [components.spancat.suggester] +> @misc = "spacy.ngram_range_suggester.v1" +> min_size = 2 +> max_size = 4 +> ``` + +Suggest all spans of at least length `min_size` and at most length `max_size` +(both inclusive). Spans are returned as a ragged array of integers. The array +has two columns, indicating the start and end position. + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------- | +| `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ | +| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ | +| **CREATES** | The suggester function. 
~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md index ba248f376..654067eb1 100644 --- a/website/docs/api/spangroup.md +++ b/website/docs/api/spangroup.md @@ -46,6 +46,16 @@ Create a `SpanGroup`. The [`Doc`](/api/doc) object the span group is referring to. + + +When a `Doc` object is garbage collected, any related `SpanGroup` object won't +be functional anymore, as these objects use a `weakref` to refer to the +document. An error will be raised as the internal `doc` object will be `None`. +To avoid this, make sure that the original `Doc` objects are still available in +the scope of your function. + + + > #### Example > > ```python diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 8e6132d40..f34456b0c 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -8,6 +8,21 @@ api_string_name: tagger api_trainable: true --- +A trainable pipeline component to predict part-of-speech tags for any +part-of-speech tag set. + +In the pre-trained pipelines, the tag schemas vary by language; see the +[individual model pages](/models) for details. + +## Assigned Attributes {#assigned-attributes} + +Predictions are assigned to `Token.tag`. + +| Location | Value | +| ------------ | ---------------------------------- | +| `Token.tag` | The part of speech (hash). ~~int~~ | +| `Token.tag_` | The part of speech. ~~str~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -25,9 +40,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx @@ -54,11 +69,11 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). 
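As a brief, hedged illustration of that shortcut and of where the predictions
end up (see the Assigned Attributes section above); `en_core_web_sm` is just one
example of a trained pipeline that includes a tagger:

```python
import spacy

# Any trained pipeline with a tagger works; the package name is only an example.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")
# The tagger writes its predictions to Token.tag / Token.tag_.
print([(token.text, token.tag_) for token in doc])

# In a blank pipeline, the component is added via its string name.
blank = spacy.blank("en")
tagger = blank.add_pipe("tagger")
```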
-| Name | Description | -| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -187,9 +202,8 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. -Delegates to [`predict`](/api/tagger#predict), -[`get_loss`](/api/tagger#get_loss) and -[`set_annotations`](/api/tagger#set_annotations). +Delegates to [`predict`](/api/tagger#predict) and +[`get_loss`](/api/tagger#get_loss). > #### Example > @@ -199,14 +213,14 @@ Delegates to [`predict`](/api/tagger#predict), > losses = tagger.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. 
~~Dict[str, float]~~ | ## Tagger.rehearse {#rehearse tag="method,experimental" new="3"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 16049c327..62a921d02 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -3,15 +3,47 @@ title: TextCategorizer tag: class source: spacy/pipeline/textcat.py new: 2 -teaser: 'Pipeline component for single-label text classification' +teaser: 'Pipeline component for text classification' api_base_class: /api/pipe api_string_name: textcat api_trainable: true --- -The text categorizer predicts **categories over a whole document**. It can learn -one or more labels, and the labels are mutually exclusive - there is exactly one -true label per document. +The text categorizer predicts **categories over a whole document** and comes in +two flavors: `textcat` and `textcat_multilabel`. When you need to predict +exactly one true label per document, use the `textcat` component, which has mutually +exclusive labels. If you want to perform multi-label classification and predict +zero, one or more true labels per document, use the `textcat_multilabel` +component instead. For a binary classification task, you can use `textcat` with +**two** labels or `textcat_multilabel` with **one** label. + +Both components are documented on this page. + + + +In spaCy v2, the `textcat` component could also perform **multi-label +classification**, and even used this setting by default. Since v3.0, the +component `textcat_multilabel` should be used for multi-label classification +instead. The `textcat` component is now used for mutually exclusive classes +only. + + + +## Assigned Attributes {#assigned-attributes} + +Predictions will be saved to `doc.cats` as a dictionary, where the key is the +name of the category and the value is a score between 0 and 1 (inclusive). For +`textcat` (exclusive categories), the scores will sum to 1, while for +`textcat_multilabel` there is no particular guarantee about their sum. + +Note that when assigning values to create training data, the score of each +category must be 0 or 1. Using other values, for example to create a document +that is a little bit in category A and a little bit in category B, is not +supported. + +| Location | Value | +| ---------- | ------------------------------------- | +| `Doc.cats` | Category scores. ~~Dict[str, float]~~ | ## Config and implementation {#config} @@ -22,7 +54,7 @@ how the component should be configured. You can override its settings via the [model architectures](/api/architectures) documentation for details on the architectures and their arguments and hyperparameters. -> #### Example +> #### Example (textcat) > > ```python > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL @@ -33,6 +65,17 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("textcat", config=config) > ``` +> #### Example (textcat_multilabel) +> +> ```python +> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL +> config = { +> "threshold": 0.5, +> "model": DEFAULT_MULTI_TEXTCAT_MODEL, +> } +> nlp.add_pipe("textcat_multilabel", config=config) +> ``` + | Setting | Description | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. 
~~float~~ | @@ -42,12 +85,17 @@ architectures and their arguments and hyperparameters. %%GITHUB_SPACY/spacy/pipeline/textcat.py ``` +```python +%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py +``` + ## TextCategorizer.\_\_init\_\_ {#init tag="method"} > #### Example > > ```python > # Construction via add_pipe with default model +> # Use 'textcat_multilabel' for multi-label classification > textcat = nlp.add_pipe("textcat") > > # Construction via add_pipe with custom model @@ -55,6 +103,7 @@ architectures and their arguments and hyperparameters. > parser = nlp.add_pipe("textcat", config=config) > > # Construction from class +> # Use 'MultiLabel_TextCategorizer' for multi-label classification > from spacy.pipeline import TextCategorizer > textcat = TextCategorizer(nlp.vocab, model, threshold=0.5) > ``` @@ -161,7 +210,7 @@ This method was previously called `begin_training`. | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | -| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | ## TextCategorizer.predict {#predict tag="method"} @@ -201,9 +250,8 @@ Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. -Delegates to [`predict`](/api/textcategorizer#predict), -[`get_loss`](/api/textcategorizer#get_loss) and -[`set_annotations`](/api/textcategorizer#set_annotations). +Delegates to [`predict`](/api/textcategorizer#predict) and +[`get_loss`](/api/textcategorizer#get_loss). > #### Example > @@ -213,14 +261,14 @@ Delegates to [`predict`](/api/textcategorizer#predict), > losses = textcat.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. 
Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} @@ -274,11 +322,11 @@ Score a batch of examples. > scores = textcat.score(examples) > ``` -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## TextCategorizer.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 409c7f25b..70c352b4d 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -186,8 +186,7 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. -Delegates to [`predict`](/api/tok2vec#predict) and -[`set_annotations`](/api/tok2vec#set_annotations). +Delegates to [`predict`](/api/tok2vec#predict). > #### Example > @@ -197,14 +196,14 @@ Delegates to [`predict`](/api/tok2vec#predict) and > losses = tok2vec.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. 
~~Dict[str, float]~~ | ## Tok2Vec.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index cb0efe7bb..44a2ea9e8 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -362,9 +362,9 @@ unknown. Defaults to `True` for the first token in the `Doc`. > assert not doc[5].is_sent_start > ``` -| Name | Description | -| ----------- | --------------------------------------------- | -| **RETURNS** | Whether the token starts a sentence. ~~bool~~ | +| Name | Description | +| ----------- | ------------------------------------------------------- | +| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ | ## Token.has_vector {#has_vector tag="property" model="vectors"} @@ -420,73 +420,73 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Description | -| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | The parent document. ~~Doc~~ | -| `lex` 3 | The underlying lexeme. ~~Lexeme~~ | -| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | -| `text` | Verbatim text content. ~~str~~ | -| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ | -| `whitespace_` | Trailing space character if present. ~~str~~ | -| `orth` | ID of the verbatim text content. ~~int~~ | -| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | -| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ | -| `tensor` 2.1.7 | The tokens's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | -| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | -| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ | -| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | -| `i` | The index of the token within the parent document. ~~int~~ | -| `ent_type` | Named entity type. ~~int~~ | -| `ent_type_` | Named entity type. ~~str~~ | -| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | -| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | -| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | -| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | -| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | -| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | -| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | -| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | -| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). 
~~int~~ | -| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ | -| `lower` | Lowercase form of the token. ~~int~~ | -| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | -| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | -| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | -| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | -| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | -| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | -| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ | -| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | -| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | -| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | -| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | -| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | -| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | -| `is_punct` | Is the token punctuation? ~~bool~~ | -| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | -| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | -| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | -| `is_bracket` | Is the token a bracket? ~~bool~~ | -| `is_quote` | Is the token a quotation mark? ~~bool~~ | -| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | -| `like_url` | Does the token resemble a URL? ~~bool~~ | -| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | -| `like_email` | Does the token resemble an email address? ~~bool~~ | -| `is_oov` | Does the token have a word vector? ~~bool~~ | -| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | -| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | -| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | -| `tag` | Fine-grained part-of-speech. ~~int~~ | -| `tag_` | Fine-grained part-of-speech. ~~str~~ | -| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | -| `dep` | Syntactic dependency relation. ~~int~~ | -| `dep_` | Syntactic dependency relation. ~~str~~ | -| `lang` | Language of the parent document's vocabulary. ~~int~~ | -| `lang_` | Language of the parent document's vocabulary. ~~str~~ | -| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). 
~~float~~ | -| `idx` | The character offset of the token within the parent document. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | -| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | -| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | -| `cluster` | Brown cluster ID. ~~int~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `lex` 3 | The underlying lexeme. ~~Lexeme~~ | +| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | +| `text` | Verbatim text content. ~~str~~ | +| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ | +| `whitespace_` | Trailing space character if present. ~~str~~ | +| `orth` | ID of the verbatim text content. ~~int~~ | +| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | +| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ | +| `tensor` 2.1.7 | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | +| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | +| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ | +| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | +| `i` | The index of the token within the parent document. ~~int~~ | +| `ent_type` | Named entity type. ~~int~~ | +| `ent_type_` | Named entity type. ~~str~~ | +| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | +| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | +| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | +| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | +| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | +| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | +| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | +| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | +| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ | +| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ | +| `lower` | Lowercase form of the token. 
~~int~~ | +| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | +| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | +| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | +| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | +| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ | +| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | +| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | +| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | +| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | +| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | +| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | +| `is_punct` | Is the token punctuation? ~~bool~~ | +| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | +| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | +| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | +| `is_bracket` | Is the token a bracket? ~~bool~~ | +| `is_quote` | Is the token a quotation mark? ~~bool~~ | +| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | +| `like_url` | Does the token resemble a URL? ~~bool~~ | +| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | +| `like_email` | Does the token resemble an email address? ~~bool~~ | +| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | +| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | +| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ | +| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ | +| `tag` | Fine-grained part-of-speech. ~~int~~ | +| `tag_` | Fine-grained part-of-speech. ~~str~~ | +| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | +| `dep` | Syntactic dependency relation. ~~int~~ | +| `dep_` | Syntactic dependency relation. ~~str~~ | +| `lang` | Language of the parent document's vocabulary. ~~int~~ | +| `lang_` | Language of the parent document's vocabulary. ~~str~~ | +| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | +| `idx` | The character offset of the token within the parent document. ~~int~~ | +| `sentiment` | A scalar value indicating the positivity or negativity of the token. 
~~float~~ | +| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `cluster` | Brown cluster ID. ~~int~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 5958f2e57..8809c10bc 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -239,6 +239,7 @@ it. | `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ | | `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ | | `rules` | A dictionary of tokenizer exceptions and special cases. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 09a64f9e3..c78a1de03 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -8,6 +8,7 @@ menu: - ['Readers', 'readers'] - ['Batchers', 'batchers'] - ['Augmenters', 'augmenters'] + - ['Callbacks', 'callbacks'] - ['Training & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -48,6 +49,7 @@ specified separately using the new `exclude` keyword argument. | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | | _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | @@ -83,9 +85,9 @@ Create a blank pipeline of a given language class. This function is the twin of | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ | | _keyword-only_ | | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| `vocab` | Optional shared vocab to pass in on initialization. 
If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| `meta` 3 | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | +| `meta` | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ | ### spacy.info {#spacy.info tag="function"} @@ -138,6 +140,14 @@ data has already been allocated on CPU, it will not be moved. Ideally, this function should be called right after importing spaCy and _before_ loading any pipelines. + + +In a Jupyter notebook, run `prefer_gpu()` in the same cell as `spacy.load()` to +ensure that the model is loaded on the correct device. See +[more details](/usage/v3#jupyter-notebook-gpu). + + + > #### Example > > ```python @@ -158,6 +168,14 @@ if no GPU is available. If data has already been allocated on CPU, it will not be moved. Ideally, this function should be called right after importing spaCy and _before_ loading any pipelines. + + +In a Jupyter notebook, run `require_gpu()` in the same cell as `spacy.load()` to +ensure that the model is loaded on the correct device. See +[more details](/usage/v3#jupyter-notebook-gpu). + + + > #### Example > > ```python @@ -173,10 +191,17 @@ and _before_ loading any pipelines. ### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"} -Allocate data and perform operations on CPU. -If data has already been allocated on GPU, it will not -be moved. Ideally, this function should be called right after importing spaCy -and _before_ loading any pipelines. +Allocate data and perform operations on CPU. If data has already been allocated +on GPU, it will not be moved. Ideally, this function should be called right +after importing spaCy and _before_ loading any pipelines. + + + +In a Jupyter notebook, run `require_cpu()` in the same cell as `spacy.load()` to +ensure that the model is loaded on the correct device. See +[more details](/usage/v3#jupyter-notebook-gpu). + + > #### Example > @@ -186,9 +211,9 @@ and _before_ loading any pipelines. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Description | -| ----------- | ------------------------------------------------ | -| **RETURNS** | `True` ~~bool~~ | +| Name | Description | +| ----------- | --------------- | +| **RETURNS** | `True` ~~bool~~ | ## displaCy {#displacy source="spacy/displacy"} @@ -437,7 +462,7 @@ start decreasing across epochs. -#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"} +#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"} > #### Installation > @@ -469,15 +494,21 @@ remain in the config file stored on your local system. > > ```ini > [training.logger] -> @loggers = "spacy.WandbLogger.v1" +> @loggers = "spacy.WandbLogger.v3" > project_name = "monitor_spacy_training" > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] +> log_dataset_dir = "corpus" +> model_log_interval = 1000 > ``` -| Name | Description | -| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. 
~~str~~ | -| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | +| Name | Description | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | +| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | +| `model_log_interval` | Steps to wait between logging model checkpoints to the W&B dashboard (default: None). ~~Optional[int]~~ | +| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ | +| `run_name` | The name of the run. If you don't specify a run_name, the name will be created by the wandb library (default: None). ~~Optional[str]~~ | +| `entity` | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ | @@ -545,7 +576,7 @@ label sets. | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ | | `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ | -| **CREATES** | The | +| **CREATES** | The list of labels. ~~List[str]~~ | ### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"} @@ -728,7 +759,7 @@ capitalization by including a mix of capitalized and lowercase examples. See the Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. It's -is especially useful for punctuation and case replacement, to help generalize +especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes etc. | Name | Description | @@ -757,6 +788,55 @@ useful for making the model less sensitive to capitalization. | `level` | The percentage of texts that will be augmented. ~~float~~ | | **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ | +## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"} + +The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at +several points in the lifecycle that can be used to modify the `nlp` object. + +### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"} + +> #### Example config +> +> ```ini +> [initialize.before_init] +> @callbacks = "spacy.copy_from_base_model.v1" +> tokenizer = "en_core_sci_md" +> vocab = "en_core_sci_md" +> ``` + +Copy the tokenizer and/or vocab from the specified models. 
It's similar to the +v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in +combination with +[sourced components](/usage/processing-pipelines#sourced-components) when +fine-tuning an existing pipeline. The vocab includes the lookups and the vectors +from the specified model. Intended for use in `[initialize.before_init]`. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------- | +| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~ | +| `vocab` | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~ | +| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~ | + +### spacy.models_with_nvtx_range.v1 {#models_with_nvtx_range tag="registered function"} + +> #### Example config +> +> ```ini +> [nlp] +> after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"} +> ``` + +Recursively wrap the models in each pipe using +[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU +profiling by attributing specific operations to a ~~Model~~'s forward or +backprop passes. + +| Name | Description | +| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- | +| `forward_color` | Color identifier for forward passes. Defaults to `-1`. ~~int~~ | +| `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~ | +| **CREATES** | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~ | + ## Training data and alignment {#gold source="spacy/training"} ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"} @@ -821,7 +901,7 @@ This method was previously available as `spacy.gold.offsets_from_biluo_tags`. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ | -| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | ### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"} @@ -850,7 +930,7 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`. 
| Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ | -| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ | ## Utility functions {#util source="spacy/util.py"} @@ -922,7 +1002,8 @@ and create a `Language` object. The model data will then be loaded in via | Name | Description | | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `name` | Package name or path. ~~str~~ | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | | `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | @@ -945,7 +1026,8 @@ A helper function to use in the `load()` method of a pipeline package's | Name | Description | | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| _keyword-only_ | | +| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). 
Excluded components won't be loaded. ~~List[str]~~ | | `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | @@ -1124,11 +1206,11 @@ vary on each step. > nlp.update(batch) > ``` -| Name | Description | -| ---------- | ---------------------------------------- | -| `items` | The items to batch up. ~~Iterable[Any]~~ | -| `size` | int / iterable | The batch size(s). ~~Union[int, Sequence[int]]~~ | -| **YIELDS** | The batches. | +| Name | Description | +| ---------- | ------------------------------------------------ | +| `items` | The items to batch up. ~~Iterable[Any]~~ | +| `size` | The batch size(s). ~~Union[int, Sequence[int]]~~ | +| **YIELDS** | The batches. | ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5aaa1d23e..b1673cdbe 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -38,12 +38,21 @@ attributes. We also calculate an alignment between the word-piece tokens and the spaCy tokenization, so that we can use the last hidden states to set the `Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy token, the spaCy token receives the sum of their values. To access the values, -you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The +you can use the custom [`Doc._.trf_data`](#assigned-attributes) attribute. The package also adds the function registries [`@span_getters`](#span_getters) and [`@annotation_setters`](#annotation_setters) with several built-in registered functions. For more details, see the [usage documentation](/usage/embeddings-transformers). +## Assigned Attributes {#assigned-attributes} + +The component sets the following +[custom extension attribute](/usage/processing-pipeline#custom-components-attributes): + +| Location | Value | +| ---------------- | ------------------------------------------------------------------------ | +| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ | + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -83,9 +92,12 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p > # Construction via add_pipe with custom config > config = { > "model": { -> "@architectures": "spacy-transformers.TransformerModel.v1", +> "@architectures": "spacy-transformers.TransformerModel.v3", > "name": "bert-base-uncased", -> "tokenizer_config": {"use_fast": True} +> "tokenizer_config": {"use_fast": True}, +> "transformer_config": {"output_attentions": True}, +> "mixed_precision": True, +> "grad_scaler_config": {"init_scale": 32768} > } > } > trf = nlp.add_pipe("transformer", config=config) @@ -98,7 +110,7 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p Construct a `Transformer` component. One or more subsequent spaCy components can use the transformer outputs as features in its model, with gradients backpropagated to the single shared weights. The activations from the -transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension +transformer are saved in the [`Doc._.trf_data`](#assigned-attributes) extension attribute. You can also provide a callback to set additional annotations. 
In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). @@ -175,7 +187,7 @@ by [`Language.initialize`](/api/language#initialize). > > ```python > trf = nlp.add_pipe("transformer") -> trf.initialize(lambda: [], nlp=nlp) +> trf.initialize(lambda: iter([]), nlp=nlp) > ``` | Name | Description | @@ -205,7 +217,7 @@ modifying them. Assign the extracted features to the `Doc` objects. By default, the [`TransformerData`](/api/transformer#transformerdata) object is written to the -[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` +[`Doc._.trf_data`](#assigned-attributes) attribute. Your `set_extra_annotations` callback is then called, if provided. > #### Example @@ -245,14 +257,14 @@ and call the optimizer, while the others simply increment the gradients. > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -383,14 +395,15 @@ are wrapped into the [FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The `FullTransformerBatch` then splits out the per-document data, which is handled by this class. Instances of this class are typically assigned to the -[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute. +[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute. -| Name | Description | -| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tokens` | A slice of the tokens data produced by the tokenizer. 
This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | -| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | -| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | -| `width` | The width of the last hidden layer. ~~int~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | +| `model_output` | The model output from the transformer model, determined by the model and transformer config. New in `spacy-transformers` v1.1.0. ~~transformers.file_utils.ModelOutput~~ | +| `tensors` | The `model_output` in the earlier `transformers` tuple format converted using [`ModelOutput.to_tuple()`](https://huggingface.co/transformers/main_classes/output.html#transformers.file_utils.ModelOutput.to_tuple). Returns `Tuple` instead of `List` as of `spacy-transformers` v1.1.0. ~~Tuple[Union[FloatsXd, List[FloatsXd]]]~~ | +| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | +| `width` | The width of the last hidden layer. ~~int~~ | ### TransformerData.empty {#transformerdata-emoty tag="classmethod"} @@ -400,19 +413,32 @@ Create an empty `TransformerData` container. | ----------- | ---------------------------------- | | **RETURNS** | The container. ~~TransformerData~~ | + + +In `spacy-transformers` v1.0, the model output is stored in +`TransformerData.tensors` as `List[Union[FloatsXd]]` and only includes the +activations for the `Doc` from the transformer. Usually the last tensor that is +3-dimensional will be the most important, as that will provide the final hidden +state. Generally activations that are 2-dimensional will be attention weights. +Details of this variable will differ depending on the underlying transformer +model. + + + ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} Holds a batch of input and output objects for a transformer model. The data can then be split to a list of [`TransformerData`](/api/transformer#transformerdata) objects to associate the outputs to each [`Doc`](/api/doc) in the batch. 
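For orientation, a minimal sketch of how the per-`Doc` slices described above end up on the custom attribute (assumes `spacy-transformers` and the `en_core_web_trf` package are installed; the exact tokenizer fields vary by transformer model and `spacy-transformers` version):

```python
import spacy

# A minimal sketch, assuming the en_core_web_trf pipeline is installed
nlp = spacy.load("en_core_web_trf")
doc = nlp("spaCy splits the transformer output per document.")

trf_data = doc._.trf_data            # TransformerData for this Doc
print(list(trf_data.tokens.keys()))  # tokenizer output fields for this slice
print(trf_data.align.lengths)        # wordpieces aligned to each spaCy token
print(trf_data.width)                # width of the last hidden layer
```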
-| Name | Description | -| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | -| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | -| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ | -| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | -| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | +| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | +| `model_output` | The model output from the transformer model, determined by the model and transformer config. New in `spacy-transformers` v1.1.0. ~~transformers.file_utils.ModelOutput~~ | +| `tensors` | The `model_output` in the earlier `transformers` tuple format converted using [`ModelOutput.to_tuple()`](https://huggingface.co/transformers/main_classes/output.html#transformers.file_utils.ModelOutput.to_tuple). Returns `Tuple` instead of `List` as of `spacy-transformers` v1.1.0. ~~Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]~~ | +| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | +| `doc_data` | The outputs, split per `Doc` object. 
~~List[TransformerData]~~ | ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} @@ -435,6 +461,13 @@ Split a `TransformerData` object that represents a batch into a list with one | ----------- | ------------------------------------------ | | **RETURNS** | The split batch. ~~List[TransformerData]~~ | + + +In `spacy-transformers` v1.0, the model output is stored in +`FullTransformerBatch.tensors` as `List[torch.Tensor]`. + + + ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} Span getters are functions that take a batch of [`Doc`](/api/doc) objects and @@ -493,6 +526,11 @@ This requires sentence boundaries to be set (e.g. by the depending on the sentence lengths. However, it does provide the transformer with more meaningful windows to attend over. +To set sentence boundaries with the `sentencizer` during training, add a +`sentencizer` to the beginning of the pipeline and include it in +[`[training.annotating_components]`](/usage/training#annotating-components) to +have it set the sentence boundaries before the `transformer` component runs. + ### strided_spans.v1 {#strided_spans tag="registered function"} > #### Example config @@ -544,12 +582,3 @@ The following built-in functions are available: | Name | Description | | ---------------------------------------------- | ------------------------------------- | | `spacy-transformers.null_annotation_setter.v1` | Don't set any additional annotations. | - -## Custom attributes {#custom-attributes} - -The component sets the following -[custom extension attributes](/usage/processing-pipeline#custom-components-attributes): - -| Name | Description | -| ---------------- | ------------------------------------------------------------------------ | -| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index ba2d5ab42..1a7f7a3f5 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -290,8 +290,8 @@ If a table is full, it can be resized using ## Vectors.n_keys {#n_keys tag="property"} Get the number of keys in the table. Note that this is the number of _all_ keys, -not just unique vectors. If several keys are mapped to the same -vectors, they will be counted individually. +not just unique vectors. If several keys are mapped to the same vectors, they +will be counted individually. > #### Example > diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index a2ca63002..c0a269d95 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -21,15 +21,15 @@ Create the vocabulary. > vocab = Vocab(strings=["hello", "world"]) > ``` -| Name | Description | -| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | -| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | -| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | -| `oov_prob` | The default OOV probability. Defaults to `-20.0`. 
~~float~~ | -| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | -| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | -| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | +| Name | Description | +| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | +| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | +| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | +| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | +| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | +| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | +| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -182,14 +182,14 @@ subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`). | Name | Description | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | -| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | -| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | +| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | +| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.set_vector {#set_vector tag="method" new="2"} -Set a vector for a word in the vocabulary. Words can be referenced by string -or hash value. +Set a vector for a word in the vocabulary. Words can be referenced by string or +hash value. > #### Example > @@ -300,13 +300,14 @@ Load state from a binary string. > assert type(PERSON) == int > ``` -| Name | Description | -| --------------------------------------------- | ------------------------------------------------------------------------------- | -| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ | -| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ | -| `vectors_length` | Number of dimensions for each word vector. ~~int~~ | -| `lookups` | The available lookup tables in this vocab. 
~~Lookups~~ | -| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ | +| Name | Description | +| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ | +| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ | +| `vectors_length` | Number of dimensions for each word vector. ~~int~~ | +| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ | +| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ | +| `get_noun_chunks` 3.0 | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | ## Serialization fields {#serialization-fields} @@ -324,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | --------- | ----------------------------------------------------- | | `strings` | The strings in the [`StringStore`](/api/stringstore). | -| `lexemes` | The lexeme data. | | `vectors` | The word vectors, if available. | | `lookups` | The lookup tables, if available. | diff --git a/website/docs/images/huggingface_hub.jpg b/website/docs/images/huggingface_hub.jpg new file mode 100644 index 000000000..5618df020 Binary files /dev/null and b/website/docs/images/huggingface_hub.jpg differ diff --git a/website/docs/images/pipeline-design.svg b/website/docs/images/pipeline-design.svg new file mode 100644 index 000000000..88ccdab99 --- /dev/null +++ b/website/docs/images/pipeline-design.svg @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/images/prodigy_spans-manual.jpg b/website/docs/images/prodigy_spans-manual.jpg new file mode 100644 index 000000000..d67f347e0 Binary files /dev/null and b/website/docs/images/prodigy_spans-manual.jpg differ diff --git a/website/docs/images/prodigy_train_curve.jpg b/website/docs/images/prodigy_train_curve.jpg new file mode 100644 index 000000000..af22cd065 Binary files /dev/null and b/website/docs/images/prodigy_train_curve.jpg differ diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 30b4f11d9..92d1b0172 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -4,6 +4,7 @@ teaser: Downloadable trained pipelines and weights for spaCy menu: - ['Quickstart', 'quickstart'] - ['Conventions', 'conventions'] + - ['Pipeline Design', 'design'] --- @@ -26,30 +27,180 @@ of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name into three components: 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with - vocabulary, syntax, entities and word vectors, or `dep` for only vocab and - syntax). + tagging, parsing, lemmatization and named entity recognition, or `dep` for + only tagging, parsing and lemmatization). 2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`. -3. **Size:** Package size indicator, `sm`, `md` or `lg`. +3. 
**Size:** Package size indicator, `sm`, `md`, `lg` or `trf` (`sm`: no word + vectors, `md`: reduced word vector table with 20k unique vectors for ~500k + words, `lg`: large word vector table with ~500k entries, `trf`: transformer + pipeline without static word vectors) For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English pipeline trained on written web text (blogs, news, comments), that includes -vocabulary, vectors, syntax and entities. +vocabulary, syntax and entities. ### Package versioning {#model-versioning} Additionally, the pipeline package versioning reflects both the compatibility -with spaCy, as well as the major and minor version. A package version `a.b.c` -translates to: +with spaCy, as well as the model version. A package version `a.b.c` translates +to: - `a`: **spaCy major version**. For example, `2` for spaCy v2.x. -- `b`: **Package major version**. Pipelines with a different major version can't - be loaded by the same code. For example, changing the width of the model, - adding hidden layers or changing the activation changes the major version. -- `c`: **Package minor version**. Same pipeline structure, but different - parameter values, e.g. from being trained on different data, for different - numbers of iterations, etc. +- `b`: **spaCy minor version**. For example, `3` for spaCy v2.3.x. +- `c`: **Model version**. Different model config: e.g. from being trained on + different data, with different parameters, for different numbers of + iterations, with different vectors, etc. For a detailed compatibility overview, see the [`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json). This is also the source of spaCy's internal compatibility check, performed when you run the [`download`](/api/cli#download) command. + +## Trained pipeline design {#design} + +The spaCy v3 trained pipelines are designed to be efficient and configurable. +For example, multiple components can share a common "token-to-vector" model and +it's easy to swap out or disable the lemmatizer. The pipelines are designed to +be efficient in terms of speed and size and work well when the pipeline is run +in full. + +When modifying a trained pipeline, it's important to understand how the +components **depend on** each other. Unlike spaCy v2, where the `tagger`, +`parser` and `ner` components were all independent, some v3 components depend on +earlier components in the pipeline. As a result, disabling or reordering +components can affect the annotation quality or lead to warnings and errors. + +Main changes from spaCy v2 models: + +- The [`Tok2Vec`](/api/tok2vec) component may be a separate, shared component. A + component like a tagger or parser can + [listen](/api/architectures#Tok2VecListener) to an earlier `tok2vec` or + `transformer` rather than having its own separate tok2vec layer. +- Rule-based exceptions move from individual components to the + `attribute_ruler`. Lemma and POS exceptions move from the tokenizer exceptions + to the attribute ruler and the tag map and morph rules move from the tagger to + the attribute ruler. +- The lemmatizer tables and processing move from the vocab and tagger to a + separate `lemmatizer` component. + +### CNN/CPU pipeline design {#design-cnn} + +![Components and their dependencies in the CNN pipelines](../images/pipeline-design.svg) + +In the `sm`/`md`/`lg` models: + +- The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec` + component. 
+- The `attribute_ruler` maps `token.tag` to `token.pos` if there is no + `morphologizer`. The `attribute_ruler` additionally makes sure whitespace is + tagged consistently and copies `token.pos` to `token.tag` if there is no + tagger. For English, the attribute ruler can improve its mapping from + `token.tag` to `token.pos` if dependency parses from a `parser` are present, + but the parser is not required. +- The `lemmatizer` component for many languages (Catalan, Dutch, English, + French, Greek, Italian, Macedonian, Norwegian, Polish and Spanish) requires + `token.pos` annotation from either `tagger`+`attribute_ruler` or + `morphologizer`. +- The `ner` component is independent with its own internal tok2vec layer. + +### Transformer pipeline design {#design-trf} + +In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) +all listen to the `transformer` component. The `attribute_ruler` and +`lemmatizer` have the same configuration as in the CNN models. + +### Modifying the default pipeline {#design-modify} + +For faster processing, you may only want to run a subset of the components in a +trained pipeline. The `disable` and `exclude` arguments to +[`spacy.load`](/api/top-level#spacy.load) let you control which components are +loaded and run. Disabled components are loaded in the background so it's +possible to re-enable them in the same pipeline in the future with +[`nlp.enable_pipe`](/api/language#enable_pipe). To skip loading a component +completely, use `exclude` instead of `disable`. + +#### Disable part-of-speech tagging and lemmatization + +To disable part-of-speech tagging and lemmatization, disable the `tagger`, +`morphologizer`, `attribute_ruler` and `lemmatizer` components. + +```python +# Note: English doesn't include a morphologizer +nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"]) +nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"]) +``` + + + +The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for +Catalan, Dutch, English, French, Greek, Italian, Macedonian, Norwegian, Polish +and Spanish. If you disable any of these components, you'll see lemmatizer +warnings unless the lemmatizer is also disabled. + + + +#### Use senter rather than parser for fast sentence segmentation + +If you need fast sentence segmentation without dependency parses, disable the +`parser` and use the `senter` component instead: + +```python +nlp = spacy.load("en_core_web_sm") +nlp.disable_pipe("parser") +nlp.enable_pipe("senter") +``` + +The `senter` component is ~10× faster than the parser and more accurate +than the rule-based `sentencizer`. + +#### Switch from rule-based to lookup lemmatization + +For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish +pipelines, you can switch from the default rule-based lemmatizer to a lookup +lemmatizer: + +```python +# Requirements: pip install spacy-lookups-data +nlp = spacy.load("en_core_web_sm") +nlp.remove_pipe("lemmatizer") +nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize() +``` + +#### Disable everything except NER + +For the non-transformer models, the `ner` component is independent, so you can +disable everything else: + +```python +nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) +``` + +In the transformer models, `ner` listens to the `transformer` component, so you +can disable all components related to tagging, parsing, and lemmatization.
+ +```python +nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]) +``` + +#### Move NER to the end of the pipeline + + + +As of v3.1, the NER component is at the end of the pipeline by default. + + + +For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the +end of the pipeline after `attribute_ruler` and `lemmatizer`: + +```python +# load without NER +nlp = spacy.load("en_core_web_sm", exclude=["ner"]) + +# source NER from the same pipeline package as the last component +nlp.add_pipe("ner", source=spacy.load("en_core_web_sm")) + +# insert the entity ruler +nlp.add_pipe("entity_ruler", before="ner") +``` diff --git a/website/docs/usage/101/_language-data.md b/website/docs/usage/101/_language-data.md index 239cec9d1..29f7bf46f 100644 --- a/website/docs/usage/101/_language-data.md +++ b/website/docs/usage/101/_language-data.md @@ -21,12 +21,12 @@ values are defined in the [`Language.Defaults`](/api/language#defaults). > nlp_de = German() # Includes German data > ``` -| Name | Description | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Stop words**
[`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | -| **Tokenizer exceptions**
[`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | -| **Punctuation rules**
[`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | -| **Character classes**
[`char_classes.py`](%%GITHUB_SPACY/spacy/lang/char_classes.py) | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | -| **Lexical attributes**
[`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | -| **Syntax iterators**
[`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py) | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | -| **Lemmatizer**
[`lemmatizer.py`](%%GITHUB_SPACY/master/spacy/lang/fr/lemmatizer.py) [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) | Custom lemmatizer implementation and lemmatization tables. | +| Name | Description | +| --------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Stop words**
[`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | +| **Tokenizer exceptions**
[`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | +| **Punctuation rules**
[`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | +| **Character classes**
[`char_classes.py`](%%GITHUB_SPACY/spacy/lang/char_classes.py) | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | +| **Lexical attributes**
[`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | +| **Syntax iterators**
[`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py) | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | +| **Lemmatizer**
[`lemmatizer.py`](%%GITHUB_SPACY/spacy/lang/fr/lemmatizer.py) [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) | Custom lemmatizer implementation and lemmatization tables. | diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index a531b245e..93ad0961a 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -25,7 +25,7 @@ for token in doc: > - **Text:** The original word text. > - **Lemma:** The base form of the word. -> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) +> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/) > part-of-speech tag. > - **Tag:** The detailed part-of-speech tag. > - **Dep:** Syntactic dependency, i.e. the relation between tokens. diff --git a/website/docs/usage/101/_training.md b/website/docs/usage/101/_training.md index b73a83d6a..4218c1b5a 100644 --- a/website/docs/usage/101/_training.md +++ b/website/docs/usage/101/_training.md @@ -10,7 +10,7 @@ any other information. Training is an iterative process in which the model's predictions are compared against the reference annotations in order to estimate the **gradient of the loss**. The gradient of the loss is then used to calculate the gradient of the -weights through [backpropagation](https://thinc.ai/backprop101). The gradients +weights through [backpropagation](https://thinc.ai/docs/backprop101). The gradients indicate how the weight values should be changed so that the model's predictions become more similar to the reference labels over time. diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 1e755e39d..5bf9e63ca 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -4,13 +4,13 @@ import { Help } from 'components/typography'; import Link from 'components/link' | Pipeline | Parser | Tagger | NER | | ---------------------------------------------------------- | -----: | -----: | ---: | -| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | 95.5 | 98.3 | 89.4 | -| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.2 | 97.4 | 85.4 | +| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | 95.1 | 97.8 | 89.8 | +| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.0 | 97.4 | 85.5 | | `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.5 |
-**Full pipeline accuracy and speed** on the +**Full pipeline accuracy** on the [OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus (reported on the development set). @@ -22,7 +22,7 @@ the development set). | Named Entity Recognition System | OntoNotes | CoNLL '03 | | -------------------------------- | --------: | --------: | -| spaCy RoBERTa (2020) | 89.7 | 91.6 | +| spaCy RoBERTa (2020) | 89.8 | 91.6 | | Stanza (StanfordNLP)1 | 88.8 | 92.1 | | Flair2 | 89.7 | 93.1 | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index b39bc3eb3..985678d15 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -132,7 +132,7 @@ factory = "tok2vec" @architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v2" @@ -164,7 +164,7 @@ factory = "ner" @architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" [components.ner.model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v2" @@ -204,14 +204,25 @@ drop-in replacements that let you achieve **higher accuracy** in exchange for > downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB > spaCy and dependencies. -Once you have CUDA installed, you'll need to install two pip packages, -[`cupy`](https://docs.cupy.dev/en/stable/install.html) and -[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy` -is just like `numpy`, but for GPU. The best way to install it is to choose a -wheel that matches the version of CUDA you're using. You may also need to set -the `CUDA_PATH` environment variable if your CUDA runtime is installed in a -non-standard location. Putting it all together, if you had installed CUDA 10.2 -in `/opt/nvidia/cuda`, you would run: +Once you have CUDA installed, we recommend installing PyTorch following the +[PyTorch installation guidelines](https://pytorch.org/get-started/locally/) for +your package manager and CUDA version. If you skip this step, pip will install +PyTorch as a dependency below, but it may not find the best version for your +setup. + +```bash +### Example: Install PyTorch 1.7.1 for CUDA 10.1 with pip +# See: https://pytorch.org/get-started/locally/ +$ pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html +``` + +Next, install spaCy with the extras for your CUDA version and transformers. The +CUDA extra (e.g., `cuda92`, `cuda102`, `cuda111`) installs the correct version +of [`cupy`](https://docs.cupy.dev/en/stable/install.html#installing-cupy), which +is just like `numpy`, but for GPU. You may also need to set the `CUDA_PATH` +environment variable if your CUDA runtime is installed in a non-standard +location. 
Putting it all together, if you had installed CUDA 10.2 in +`/opt/nvidia/cuda`, you would run: ```bash ### Installation with CUDA @@ -219,6 +230,16 @@ $ export CUDA_PATH="/opt/nvidia/cuda" $ pip install -U %%SPACY_PKG_NAME[cuda102,transformers]%%SPACY_PKG_FLAGS ``` +For [`transformers`](https://huggingface.co/transformers/) v4.0.0+ and models +that require [`SentencePiece`](https://github.com/google/sentencepiece) (e.g., +ALBERT, CamemBERT, XLNet, Marian, and T5), install the additional dependencies +with: + +```bash +### Install sentencepiece +$ pip install transformers[sentencepiece] +``` + ### Runtime usage {#transformers-runtime} Transformer models can be used as **drop-in replacements** for other types of @@ -330,7 +351,7 @@ factory = "transformer" max_batch_items = 4096 [components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" +@architectures = "spacy-transformers.TransformerModel.v3" name = "bert-base-cased" tokenizer_config = {"use_fast": true} @@ -346,7 +367,7 @@ The `[components.transformer.model]` block describes the `model` argument passed to the transformer component. It's a Thinc [`Model`](https://thinc.ai/docs/api-model) object that will be passed into the component. Here, it references the function -[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel) +[spacy-transformers.TransformerModel.v3](/api/architectures#TransformerModel) registered in the [`architectures` registry](/api/top-level#registry). If a key in a block starts with `@`, it's **resolved to a function** and all other settings are passed to the function as arguments. In this case, `name`, @@ -358,6 +379,21 @@ of potentially overlapping `Span` objects to process by the transformer. Several to process the whole document or individual sentences. When the config is resolved, the function is created and passed into the model as an argument. +The `name` value is the name of any [HuggingFace model](huggingface-models), +which will be downloaded automatically the first time it's used. You can also +use a local file path. For full details, see the +[`TransformerModel` docs](/api/architectures#TransformerModel). + +[huggingface-models]: + https://huggingface.co/models?library=pytorch&sort=downloads + +A wide variety of PyTorch models are supported, but some might not work. If a +model doesn't seem to work feel free to open an +[issue](https://github.com/explosion/spacy/issues). Additionally note that +Transformers loaded in spaCy can only be used for tensors, and pretrained +task-specific heads or text generation features cannot be used as part of +the `transformer` pipeline component. + Remember that the `config.cfg` used for training should contain **no missing @@ -481,50 +517,6 @@ custom learning rate for each component. Instead of a constant, you can also provide a schedule, allowing you to freeze the shared parameters at the start of training. -### Managing transformer model max length limitations {#transformer-max-length} - -Many transformer models have a limit on the maximum number of tokens that the -model can process, for example BERT models are limited to 512 tokens. This limit -refers to the number of transformer tokens (BPE, WordPiece, etc.), not the -number of spaCy tokens. - -To be able to process longer texts, the spaCy [`transformer`](/api/transformer) -component uses [`span_getters`](/api/transformer#span_getters) to convert a -batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. 
A -span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or -a window of spaCy tokens (`strided_spans`). If a single span corresponds to more -transformer tokens than the transformer model supports, the spaCy pipeline can't -process the text because some spaCy tokens would be left without an analysis. - -In general, it is up to the transformer pipeline user to manage the input texts -so that the model max length is not exceeded. If you're training a **new -pipeline**, you have a number of options to handle the max length limit: - -- Use `doc_spans` with short texts only -- Use `sent_spans` with short sentences only -- For `strided_spans`, lower the `window` size to be short enough for your input - texts (and don't forget to lower the `stride` correspondingly) -- Implement a [custom span getter](#transformers-training-custom-settings) - -You may still run into the max length limit if a single spaCy token is very -long, like a long URL or a noisy string, or if you're using a **pretrained -pipeline** like `en_core_web_trf` with a fixed `window` size for -`strided_spans`. In this case, you need to modify either your texts or your -pipeline so that you have shorter spaCy tokens. Some options: - -- Preprocess your texts to clean up noise and split long tokens with whitespace -- Add a `token_splitter` to the beginning of your pipeline to break up - tokens that are longer than a specified length: - - ```python - config={"min_length": 20, "split_length": 5} - nlp.add_pipe("token_splitter", config=config, first=True) - ``` - - In this example, tokens that are at least 20 characters long will be split up - into smaller tokens of 5 characters each, resulting in strided spans that - correspond to fewer transformer tokens. - ## Static vectors {#static-vectors} If your pipeline includes a **word vectors table**, you'll be able to use the @@ -564,7 +556,7 @@ word vector tables using the `include_static_vectors` flag. ```ini [tagger.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 128 attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] rows = [5000,2500,2500,2500] @@ -573,7 +565,7 @@ include_static_vectors = true -The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in +The configuration system will look up the string `"spacy.MultiHashEmbed.v2"` in the `architectures` [registry](/api/top-level#registry), and call the returned object with the rest of the arguments from the block. This will result in a call to the @@ -694,14 +686,14 @@ You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config and pass in optional config overrides, like the path to the raw text file: ```cli -$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl +$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw_text text.jsonl ``` The following defaults are used for the `[pretraining]` block and merged into your existing config when you run [`init config`](/api/cli#init-config) or [`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed, you can [configure](#pretraining-configure) the settings and hyperparameters or -change the [objective](#pretraining-details). +change the [objective](#pretraining-objectives). 
```ini %%GITHUB_SPACY/spacy/default_config_pretraining.cfg @@ -720,9 +712,11 @@ given you a 10% error reduction, pretraining with spaCy might give you another The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific subnetwork** within one of your components, and add additional layers to build a network for a temporary task that forces the model to learn something about -sentence structure and word cooccurrence statistics. Pretraining produces a -**binary weights file** that can be loaded back in at the start of training. The -weights file specifies an initial set of weights. Training then proceeds as +sentence structure and word cooccurrence statistics. + +Pretraining produces a **binary weights file** that can be loaded back in at the +start of training, using the configuration option `initialize.init_tok2vec`. +The weights file specifies an initial set of weights. Training then proceeds as normal. You can only pretrain one subnetwork from your pipeline at a time, and the @@ -755,7 +749,41 @@ component = "textcat" layer = "tok2vec" ``` -#### Pretraining objectives {#pretraining-details} +#### Connecting pretraining to training {#pretraining-training} + +To benefit from pretraining, your training step needs to know to initialize +its `tok2vec` component with the weights learned from the pretraining step. +You do this by setting `initialize.init_tok2vec` to the filename of the +`.bin` file that you want to use from pretraining. + +A pretraining step that runs for 5 epochs with an output path of `pretrain/`, +as an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. +To make use of the final output, you could fill in this value in your config +file: + +```ini +### config.cfg + +[paths] +init_tok2vec = "pretrain/model4.bin" + +[initialize] +init_tok2vec = ${paths.init_tok2vec} +``` + + + +The outputs of `spacy pretrain` are not the same data format as the +pre-packaged static word vectors that would go into +[`initialize.vectors`](/api/data-formats#config-initialize). +The pretraining output consists of the weights that the `tok2vec` +component should start with in an existing pipeline, so it goes in +`initialize.init_tok2vec`. + + + + +#### Pretraining objectives {#pretraining-objectives} > ```ini > ### Characters objective diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 269ac5e17..4bee31ed0 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -77,7 +77,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md' | Dependency Parsing System | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | -| spaCy RoBERTa (2020) | 95.5 | 94.3 | +| spaCy RoBERTa (2020) | 95.1 | 93.7 | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 | @@ -92,6 +92,31 @@ results. Project template: +### Speed comparison {#benchmarks-speed} + +We compare the speed of different NLP libraries, measured in words per second +(WPS) - higher is better. The evaluation was performed on 10,000 Reddit +comments. + +
+ +| Library | Pipeline | WPS CPU words per second on CPU, higher is better | WPS GPU words per second on GPU, higher is better | +| ------- | ----------------------------------------------- | -------------------------------------------------------------: | -------------------------------------------------------------: | +| spaCy | [`en_core_web_lg`](/models/en#en_core_web_lg) | 10,014 | 14,954 | +| spaCy | [`en_core_web_trf`](/models/en#en_core_web_trf) | 684 | 3,768 | +| Stanza | `en_ewt` | 878 | 2,180 | +| Flair | `pos`(`-fast`) & `ner`(`-fast`) | 323 | 1,184 | +| UDPipe | `english-ewt-ud-2.5` | 1,101 | _n/a_ | + +
+ +**End-to-end processing speed** on raw unannotated text. Project template: +[`benchmarks/speed`](%%GITHUB_PROJECTS/benchmarks/speed). + +
+ +
+ diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 2df8cdaa0..54ab62467 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included. > $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS > ``` -| Name | Description | -| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. | -| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. | -| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). | -| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. | -| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. | +| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. | +| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). | +| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. | +| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. | +| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). | ### conda {#conda} @@ -130,9 +131,9 @@ which provides a numpy-compatible interface for GPU arrays. spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`, `spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]`, -`spacy[cuda102]`, `spacy[cuda110]` or `spacy[cuda111]`. If you know your cuda -version, using the more explicit specifier allows cupy to be installed via -wheel, saving some compilation time. The specifiers should install +`spacy[cuda102]`, `spacy[cuda110]`, `spacy[cuda111]` or `spacy[cuda112]`. 
If you +know your cuda version, using the more explicit specifier allows cupy to be +installed via wheel, saving some compilation time. The specifiers should install [`cupy`](https://cupy.chainer.org). ```bash @@ -170,26 +171,17 @@ $ git clone https://github.com/explosion/spaCy # clone spaCy $ cd spaCy # navigate into dir $ python -m venv .env # create environment in .env $ source .env/bin/activate # activate virtual env -$ pip install . # compile and install spaCy +$ pip install -r requirements.txt # install requirements +$ pip install --no-build-isolation --editable . # compile and install spaCy ``` To install with extras: ```bash -$ pip install .[lookups,cuda102] # install spaCy with extras +$ pip install --no-build-isolation --editable .[lookups,cuda102] ``` -To install all dependencies required for development: - -```bash -$ pip install -r requirements.txt -``` - -Compared to a regular install via pip, the -[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes -developer dependencies such as Cython and the libraries required to run the test -suite. See the [quickstart widget](#quickstart) to get the right commands for -your platform and Python version. +How to install compilers and related build tools: @@ -227,7 +219,7 @@ source code and recompiling frequently. ```bash $ pip install -r requirements.txt $ python setup.py build_ext --inplace -j N - $ pip install --no-build-isolation --editable . + $ python setup.py develop ``` ### Building an executable {#executable} @@ -264,36 +256,6 @@ You can configure the build process with the following environment variables: | `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.6`. | | `WHEELHOUSE` | Directory to store the wheel files during compilation. Defaults to `./wheelhouse`. | -#### Additional options for developers {#source-developers} - -Some additional options may be useful for spaCy developers who are editing the -source code and recompiling frequently. - -- Install in editable mode. Changes to `.py` files will be reflected as soon as - the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require - the `pip install` or `python setup.py build_ext` command below to be run - again. Before installing in editable mode, be sure you have removed any - previous installs with `pip uninstall spacy`, which you may need to run - multiple times to remove all traces of earlier installs. - - ```diff - pip install -U pip setuptools wheel - - pip install . - + pip install -r requirements.txt - + pip install --no-build-isolation --editable . - ``` - -- Build in parallel using `N` CPUs to speed up compilation and then install in - editable mode: - - ```diff - pip install -U pip setuptools wheel - - pip install . - + pip install -r requirements.txt - + python setup.py build_ext --inplace -j N - + python setup.py develop - ``` - ### Run tests {#run-tests} spaCy comes with an [extensive test suite](%%GITHUB_SPACY/spacy/tests). In order @@ -323,7 +285,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test ## Troubleshooting guide {#troubleshooting} This section collects some of the most common errors you may come across when -installing, loading and using spaCy, as well as their solutions. +installing, loading and using spaCy, as well as their solutions. Also see the +[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226), +which is updated more frequently and covers more transitory issues. 
> #### Help us improve this guide > @@ -350,62 +314,6 @@ language's `Language` class instead, for example - - -``` -no such option: --no-cache-dir -``` - -The `download` command uses pip to install the pipeline packages and sets the -`--no-cache-dir` flag to prevent it from requiring too much memory. -[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching) -requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest -version of pip. To see which version you have installed, run `pip --version`. - - - - - -``` -sre_constants.error: bad character range -``` - -In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions -for tokenization to make it up to 2-3 times faster. But this also means that -it's very important now that you run spaCy with a wide unicode build of Python. -This means that the build has 1114111 unicode characters available, instead of -only 65535 in a narrow unicode build. You can check this by running the -following command: - -```bash -$ python -c "import sys; print(sys.maxunicode)" -``` - -If you're running a narrow unicode build, reinstall Python and use a wide -unicode build instead. You can also rebuild Python and set the -`--enable-unicode=ucs4` flag. - - - - - -``` -ValueError: unknown locale: UTF-8 -``` - -This error can sometimes occur on OSX and is likely related to a still -unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy -to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then -run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both -lines** for `LC_ALL` and `LANG`. - -```bash -$ export LC_ALL=en_US.UTF-8 -$ export LANG=en_US.UTF-8 -``` - - - ``` diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 13e474bfe..2e23b3684 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -15,7 +15,7 @@ next: /usage/projects > ```python > from thinc.api import Model, chain > -> @spacy.registry.architectures.register("model.v1") +> @spacy.registry.architectures("model.v1") > def build_model(width: int, classes: int) -> Model: > tok2vec = build_tok2vec(width) > output_layer = build_output_layer(width, classes) @@ -137,7 +137,7 @@ nO = null @architectures = "spacy.Tok2Vec.v2" [components.textcat.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 64 rows = [2000, 2000, 1000, 1000, 1000, 1000] attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] @@ -151,7 +151,7 @@ maxout_pieces = 3 depth = 2 [components.textcat.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -169,7 +169,7 @@ factory = "textcat" labels = [] [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -204,7 +204,7 @@ factory = "tok2vec" @architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" # ... [components.tok2vec.model.encode] @@ -220,7 +220,7 @@ architecture: ```ini ### config.cfg (excerpt) [components.tok2vec.model.embed] -@architectures = "spacy.CharacterEmbed.v1" +@architectures = "spacy.CharacterEmbed.v2" # ... 
[components.tok2vec.model.encode] @@ -537,15 +537,17 @@ two major steps required: pass through the `nlp` pipeline. -Run this example use-case by using our project template. It includes all the +Run this example use-case by using our project template. It includes all the code to create the ML model and the pipeline component from scratch. -It also contains two config files to train the model: +It also contains two config files to train the model: one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. -The project applies the relation extraction component to identify biomolecular -interactions in a sample dataset, but you can easily swap in your own dataset +The project applies the relation extraction component to identify biomolecular +interactions in a sample dataset, but you can easily swap in your own dataset for your experiments in any other domain. + + #### Step 1: Implementing the Model {#component-rel-model} We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a @@ -561,7 +563,7 @@ matrix** (~~Floats2d~~) of predictions: ```python ### The model architecture -@spacy.registry.architectures.register("rel_model.v1") +@spacy.registry.architectures("rel_model.v1") def create_relation_model(...) -> Model[List[Doc], Floats2d]: model = ... # 👈 model will go here return model @@ -587,7 +589,7 @@ transforms the instance tensor into a final tensor holding the predictions: ```python ### The model architecture {highlight="6"} -@spacy.registry.architectures.register("rel_model.v1") +@spacy.registry.architectures("rel_model.v1") def create_relation_model( create_instance_tensor: Model[List[Doc], Floats2d], classification_layer: Model[Floats2d, Floats2d], @@ -611,7 +613,7 @@ The `classification_layer` could be something like a ```python ### The classification layer -@spacy.registry.architectures.register("rel_classification_layer.v1") +@spacy.registry.architectures("rel_classification_layer.v1") def create_classification_layer( nO: int = None, nI: int = None ) -> Model[Floats2d, Floats2d]: @@ -636,7 +638,7 @@ that has the full implementation. > @architectures = "rel_instance_tensor.v1" > > [model.create_instance_tensor.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" +> @architectures = "spacy.HashEmbedCNN.v2" > # ... > > [model.create_instance_tensor.pooling] @@ -648,7 +650,7 @@ that has the full implementation. ```python ### The layer that creates the instance tensor -@spacy.registry.architectures.register("rel_instance_tensor.v1") +@spacy.registry.architectures("rel_instance_tensor.v1") def create_tensors( tok2vec: Model[List[Doc], List[Floats2d]], pooling: Model[Ragged, Floats2d], @@ -729,7 +731,7 @@ are within a **maximum distance** (in number of tokens) of each other: ```python ### Candidate generation -@spacy.registry.misc.register("rel_instance_generator.v1") +@spacy.registry.misc("rel_instance_generator.v1") def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: candidates = [] @@ -824,14 +826,14 @@ will predict scores for each label. We add convenience methods to easily retrieve and add to them. ```python -### The constructor (continued) +### The constructor (continued) def __init__(self, vocab, model, name="rel"): """Create a component instance.""" # ... 
self.cfg = {"labels": []} @property - def labels(self) -> Tuple[str]: + def labels(self) -> Tuple[str, ...]: """Returns the labels currently added to the component.""" return tuple(self.cfg["labels"]) @@ -1041,11 +1043,11 @@ def make_relation_extractor(nlp, name, model): ``` -Run this example use-case by using our project template. It includes all the +Run this example use-case by using our project template. It includes all the code to create the ML model and the pipeline component from scratch. -It contains two config files to train the model: +It contains two config files to train the model: one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. -The project applies the relation extraction component to identify biomolecular -interactions, but you can easily swap in your own dataset for your experiments +The project applies the relation extraction component to identify biomolecular +interactions, but you can easily swap in your own dataset for your experiments in any other domain. diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 80a8eab1b..f8f47ab53 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -512,7 +512,7 @@ nlp = spacy.load("en_core_web_sm", disable=["parser"]) spaCy features an extremely fast statistical entity recognition system, that assigns labels to contiguous spans of tokens. The default -[trained pipelines](/models) can indentify a variety of named and numeric +[trained pipelines](/models) can identify a variety of named and numeric entities, including companies, locations, organizations and products. You can add arbitrary classes to the entity recognition system, and update the model with new examples. @@ -550,7 +550,7 @@ on a token, it will return an empty string. > - `I` – Token is **inside** a multi-token entity. > - `L` – Token is the **last** token of a multi-token entity. > - `U` – Token is a single-token **unit** entity. -> - `O` – Toke is **outside** an entity. +> - `O` – Token is **outside** an entity. ```python ### {executable="true"} @@ -585,7 +585,7 @@ print(ent_francisco) # ['Francisco', 'I', 'GPE'] To ensure that the sequence of token annotations remains consistent, you have to set entity annotations **at the document level**. However, you can't write directly to the `token.ent_iob` or `token.ent_type` attributes, so the easiest -way to set entities is to assign to the [`doc.ents`](/api/doc#ents) attribute +way to set entities is to use the [`doc.set_ents`](/api/doc#set_ents) function and create the new entity as a [`Span`](/api/span). 
```python @@ -599,18 +599,28 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('Before', ents) # The model didn't recognize "fb" as an entity :( -fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity -doc.ents = list(doc.ents) + [fb_ent] +# Create a span for the new entity +fb_ent = Span(doc, 0, 1, label="ORG") +orig_ents = list(doc.ents) -ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] +# Option 1: Modify the provided entity spans, leaving the rest unmodified +doc.set_ents([fb_ent], default="unmodified") + +# Option 2: Assign a complete list of ents to doc.ents +doc.ents = orig_ents + [fb_ent] + +ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents] print('After', ents) -# [('fb', 0, 2, 'ORG')] 🎉 +# [('fb', 0, 1, 'ORG')] 🎉 ``` -Keep in mind that you need to create a `Span` with the start and end index of -the **token**, not the start and end index of the entity in the document. In -this case, "fb" is token `(0, 1)` – but at the document level, the entity will -have the start and end indices `(0, 2)`. +Keep in mind that `Span` is initialized with the start and end **token** +indices, not the character offsets. To create a span from character offsets, use +[`Doc.char_span`](/api/doc#char_span): + +```python +fb_ent = doc.char_span(0, 2, label="ORG") +``` #### Setting entity annotations from array {#setting-from-array} @@ -645,9 +655,10 @@ write efficient native code. ```python # cython: infer_types=True +from spacy.typedefs cimport attr_t from spacy.tokens.doc cimport Doc -cpdef set_entity(Doc doc, int start, int end, int ent_type): +cpdef set_entity(Doc doc, int start, int end, attr_t ent_type): for i in range(start, end): doc.c[i].ent_type = ent_type doc.c[start].ent_iob = 3 @@ -776,6 +787,7 @@ rather than performance: ```python def tokenizer_pseudo_code( + text, special_cases, prefix_search, suffix_search, @@ -829,12 +841,14 @@ def tokenizer_pseudo_code( tokens.append(substring) substring = "" tokens.extend(reversed(suffixes)) + for match in matcher(special_cases, text): + tokens.replace(match, special_cases[match]) return tokens ``` The algorithm can be summarized as follows: -1. Iterate over whitespace-separated substrings. +1. Iterate over space-separated substrings. 2. Look for a token match. If there is a match, stop processing and keep this token. 3. Check whether we have an explicitly defined special case for this substring. @@ -848,6 +862,8 @@ The algorithm can be summarized as follows: 8. Look for "infixes" – stuff like hyphens etc. and split the substring into tokens on all infixes. 9. Once we can't consume any more of the string, handle it as a single token. +10. Make a final pass over the text to check for special cases that include + spaces or that were missed due to the incremental processing of affixes. @@ -952,7 +968,7 @@ domain. There are six things you may need to define: quotes, open brackets, etc. 3. A function `suffix_search`, to handle **succeeding punctuation**, such as commas, periods, close quotes, etc. -4. A function `infixes_finditer`, to handle non-whitespace separators, such as +4. A function `infix_finditer`, to handle non-whitespace separators, such as hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like numbers. 
@@ -1153,7 +1169,20 @@ class WhitespaceTokenizer: def __call__(self, text): words = text.split(" ") - return Doc(self.vocab, words=words) + spaces = [True] * len(words) + # Avoid zero-length tokens + for i, word in enumerate(words): + if word == "": + words[i] = " " + spaces[i] = False + # Remove the final trailing space + if words[-1] == " ": + words = words[0:-1] + spaces = spaces[0:-1] + else: + spaces[-1] = False + + return Doc(self.vocab, words=words, spaces=spaces) nlp = spacy.blank("en") nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) @@ -1232,7 +1261,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that takes the `nlp` object and returns a tokenizer. Here, we're registering a function called `whitespace_tokenizer` in the -[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to +[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to construct your tokenizer during training, you can pass in your Python file by setting `--code functions.py` when you run [`spacy train`](/api/cli#train). @@ -1469,7 +1498,7 @@ that time, the `Doc` will already be tokenized. This process of splitting a token requires more settings, because you need to specify the text of the individual tokens, optional per-token attributes and how -the should be attached to the existing syntax tree. This can be done by +the tokens should be attached to the existing syntax tree. This can be done by supplying a list of `heads` – either the token to attach the newly split token to, or a `(token, subtoken)` tuple if the newly split token should be attached to another subtoken. In this case, "New" should be attached to "York" (the diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 8c8875b9e..d1c9a0a81 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -33,7 +33,9 @@ spaCy currently provides support for the following languages. You can help by improving the existing [language data](/usage/linguistic-features#language-data) and extending the tokenization patterns. [See here](https://github.com/explosion/spaCy/issues/3056) for details on how to -contribute to development. +contribute to development. Also see the +[training documentation](/usage/training) for how to train your own pipelines on +your data. > #### Usage note > @@ -267,7 +269,7 @@ best-matching package compatible with your spaCy installation. > > ```diff > - python -m spacy download en -> + python -m spacy dowmload en_core_web_sm +> + python -m spacy download en_core_web_sm > ``` > > ```diff @@ -297,19 +299,33 @@ nlp = spacy.load("en_core_web_sm") doc = nlp("This is a sentence.") ``` +If you're in a **Jupyter notebook** or similar environment, you can use the `!` +prefix to +[execute commands](https://ipython.org/ipython-doc/3/interactive/tutorial.html#system-shell-commands). +Make sure to **restart your kernel** or runtime after installation (just like +you would when installing other Python packages) to make sure that the installed +pipeline package can be found. + +```cli +!python -m spacy download en_core_web_sm +``` + ### Installation via pip {#download-pip} To download a trained pipeline directly using [pip](https://pypi.python.org/pypi/pip), point `pip install` to the URL or local -path of the archive file. To find the direct link to a package, head over to the +path of the wheel file or archive. Installing the wheel is usually more +efficient. 
To find the direct link to a package, head over to the [releases](https://github.com/explosion/spacy-models/releases), right click on the archive link and copy it to your clipboard. ```bash # With external URL +$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl $ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz # With local file +$ pip install /Users/you/en_core_web_sm-3.0.0-py3-none-any.whl $ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz ``` @@ -349,6 +365,27 @@ pipeline data. You can place the **pipeline package directory** anywhere on your local file system. +### Installation from Python {#download-python} + +Since the [`spacy download`](/api/cli#download) command installs the pipeline as +a **Python package**, we always recommend running it from the command line, just +like you install other Python packages with `pip install`. However, if you need +to, or if you want to integrate the download process into another CLI command, +you can also import and call the `download` function used by the CLI via Python. + + + +Keep in mind that the `download` command installs a Python package into your +environment. In order for it to be found after installation, you will need to +**restart or reload** your Python process so that new packages are recognized. + + + +```python +import spacy +spacy.cli.download("en_core_web_sm") +``` + ### Using trained pipelines with spaCy {#usage} To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with @@ -362,7 +399,7 @@ the package name or a path to the data directory: > > ```diff > - python -m spacy download en -> + python -m spacy dowmload en_core_web_sm +> + python -m spacy download en_core_web_sm > ``` ```python @@ -377,7 +414,7 @@ doc = nlp("This is a sentence.") You can use the [`info`](/api/cli#info) command or [`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline -packages's meta data before loading it. Each `Language` object with a loaded +package's meta data before loading it. Each `Language` object with a loaded pipeline also exposes the pipeline's meta data as the attribute `meta`. For example, `nlp.meta['version']` will return the package version. @@ -433,8 +470,8 @@ URLs. ```text ### requirements.txt -spacy>=2.2.0,<3.0.0 -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm +spacy>=3.0.0,<4.0.0 +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm ``` Specifying `#egg=` with the package name tells pip which package to expect from @@ -471,6 +508,5 @@ logic around spaCy's loader, you can use [pytest](http://pytest.readthedocs.io/en/latest/)'s [`importorskip()`](https://docs.pytest.org/en/latest/builtin.html#_pytest.outcomes.importorskip) method to only run a test if a specific pipeline package or version is -installed. Each pipeline package package exposes a `__version__` attribute which -you can also use to perform your own version compatibility checks before loading -it. +installed. Each pipeline package exposes a `__version__` attribute which you can +also use to perform your own version compatibility checks before loading it. 
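As a rough sketch of such a guarded test, assuming your own test module (the package name and version prefix below are illustrative, not requirements):

```python
import pytest

def test_en_core_web_sm_is_compatible():
    # Skip this test entirely if the pipeline package isn't installed
    package = pytest.importorskip("en_core_web_sm")
    # Every pipeline package exposes __version__ for custom compatibility checks
    assert package.__version__.startswith("3.")
```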
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index b9824ea04..0264a2825 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -54,9 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."] In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a (potentially very large) iterable of texts as a stream. Because we're only accessing the named entities in `doc.ents` (set by the `ner` component), we'll -disable all other statistical components (the `tagger` and `parser`) during -processing. `nlp.pipe` yields `Doc` objects, so we can iterate over them and -access the named entity predictions: +disable all other components during processing. `nlp.pipe` yields `Doc` objects, +so we can iterate over them and access the named entity predictions: > #### ✏️ Things to try > @@ -73,7 +72,7 @@ texts = [ ] nlp = spacy.load("en_core_web_sm") -for doc in nlp.pipe(texts, disable=["tagger", "parser"]): +for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]): # Do something with the doc here print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -92,6 +91,85 @@ have to call `list()` on it first:
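For example, a minimal sketch that materializes the generator, assuming the `nlp` object and `texts` list from the snippet above:

```python
# nlp.pipe returns a generator, so call list() if you need all Doc objects at once
docs = list(nlp.pipe(texts))
assert len(docs) == len(texts)
```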
+You can use the `as_tuples` option to pass additional context along with each +doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then +the input should be a sequence of `(text, context)` tuples and the output will +be a sequence of `(doc, context)` tuples. For example, you can pass metadata in +the context and save it in a [custom attribute](#custom-components-attributes): + +```python +### {executable="true"} +import spacy +from spacy.tokens import Doc + +if not Doc.has_extension("text_id"): + Doc.set_extension("text_id", default=None) + +text_tuples = [ + ("This is the first text.", {"text_id": "text1"}), + ("This is the second text.", {"text_id": "text2"}) +] + +nlp = spacy.load("en_core_web_sm") +doc_tuples = nlp.pipe(text_tuples, as_tuples=True) + +docs = [] +for doc, context in doc_tuples: + doc._.text_id = context["text_id"] + docs.append(doc) + +for doc in docs: + print(f"{doc._.text_id}: {doc.text}") +``` + +### Multiprocessing {#multiprocessing} + +spaCy includes built-in support for multiprocessing with +[`nlp.pipe`](/api/language#pipe) using the `n_process` option: + +```python +# Multiprocessing with 4 processes +docs = nlp.pipe(texts, n_process=4) + +# With as many processes as CPUs (use with caution!) +docs = nlp.pipe(texts, n_process=-1) +``` + +Depending on your platform, starting many processes with multiprocessing can add +a lot of overhead. In particular, the default start method `spawn` used in +macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models +because the model data is copied in memory for each new process. See the +[Python docs on multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) +for further details. + +For shorter tasks and in particular with `spawn`, it can be faster to use a +smaller number of processes with a larger batch size. The optimal `batch_size` +setting will depend on the pipeline components, the length of your documents, +the number of processes and how much memory is available. + +```python +# Default batch size is `nlp.batch_size` (typically 1000) +docs = nlp.pipe(texts, n_process=2, batch_size=2000) +``` + + + +Multiprocessing is not generally recommended on GPU because RAM is too limited. +If you want to try it out, be aware that it is only possible using `spawn` due +to limitations in CUDA. + + + + + +In Linux, transformer models may hang or deadlock with multiprocessing due to an +[issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One +suggested workaround is to use `spawn` instead of `fork` and another is to limit +the number of threads before loading any models using +`torch.set_num_threads(1)`. + + + ## Pipelines and built-in components {#pipelines} spaCy makes it very easy to create your own pipelines consisting of reusable @@ -144,10 +222,12 @@ nlp = spacy.load("en_core_web_sm") ``` ... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the -pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize -`spacy.lang.en.English`, and create each pipeline component and add it to the -processing pipeline. It'll then load in the model data from the data directory -and return the modified `Language` class for you to use as the `nlp` object. +pipeline +`["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]`. spaCy +will then initialize `spacy.lang.en.English`, and create each pipeline component +and add it to the processing pipeline. 
It'll then load in the model data from +the data directory and return the modified `Language` class for you to use as +the `nlp` object. @@ -171,7 +251,7 @@ the binary data: ```python ### spacy.load under the hood lang = "en" -pipeline = ["tok2vec", "tagger", "parser", "ner"] +pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"] data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0" cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English @@ -186,7 +266,7 @@ component** on the `Doc`, in order. Since the model data is loaded, the components can access it to assign annotations to the `Doc` object, and subsequently to the `Token` and `Span` which are only views of the `Doc`, and don't own any data themselves. All components return the modified document, -which is then processed by the component next in the pipeline. +which is then processed by the next component in the pipeline. ```python ### The pipeline under the hood @@ -201,9 +281,9 @@ list of human-readable component names. ```python print(nlp.pipeline) -# [('tok2vec', ), ('tagger', ), ('parser', ), ('ner', )] +# [('tok2vec', ), ('tagger', ), ('parser', ), ('ner', ), ('attribute_ruler', ), ('lemmatizer', )] print(nlp.pipe_names) -# ['tok2vec', 'tagger', 'parser', 'ner'] +# ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'] ``` ### Built-in pipeline components {#built-in} @@ -223,21 +303,22 @@ available pipeline components and component functions. > ruler = nlp.add_pipe("entity_ruler") > ``` -| String name | Component | Description | -| ----------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | -| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | -| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | -| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | -| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | -| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | -| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words. | -| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | -| `attribute_ruler` | [`AttributeRuler`](/api/attributeruler) | Assign token attribute mappings and rule-based exceptions. | -| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | -| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | -| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | -| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | +| String name | Component | Description | +| -------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | +| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | +| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. 
| +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | +| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | +| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories: exactly one category is predicted per document. | +| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document. | +| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words. | +| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | +| `attribute_ruler` | [`AttributeRuler`](/api/attributeruler) | Assign token attribute mappings and rule-based exceptions. | +| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | +| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | ### Disabling, excluding and modifying components {#disabling} @@ -299,7 +380,7 @@ blocks. ```python ### Disable for block # 1. Use as a context manager -with nlp.select_pipes(disable=["tagger", "parser"]): +with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]): doc = nlp("I won't be tagged and parsed") doc = nlp("I will be tagged and parsed") @@ -323,7 +404,7 @@ The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword argument if you only want to disable components during processing: ```python -for doc in nlp.pipe(texts, disable=["tagger", "parser"]): +for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]): # Do something with the doc here ``` @@ -400,8 +481,8 @@ vectors available – otherwise, it won't be able to make the same predictions. > ``` > > By default, sourced components will be updated with your data during training. -> If you want to preserve the component as-is, you can "freeze" it if the pipeline -> is not using a shared `Tok2Vec` layer: +> If you want to preserve the component as-is, you can "freeze" it if the +> pipeline is not using a shared `Tok2Vec` layer: > > ```ini > [training] @@ -714,7 +795,7 @@ if there's no state to be passed through – spaCy can just take care of this fo you. The following two code examples are equivalent: ```python -# Statless component with @Language.factory +# Stateless component with @Language.factory @Language.factory("my_component") def create_my_component(): def my_component(doc): @@ -1024,10 +1105,10 @@ While you could use a registered function or a file loader like [`srsly.read_json.v1`](/api/top-level#file_readers) as an argument of the component factory, this approach is problematic: the component factory runs **every time the component is created**. This means it will run when creating -the `nlp` object before training, but also every a user loads your pipeline. So -your runtime pipeline would either depend on a local path on your file system, -or it's loaded twice: once when the component is created, and then again when -the data is by `from_disk`. +the `nlp` object before training, but also every time a user loads your +pipeline. 
So your runtime pipeline would either depend on a local path on your +file system, or it's loaded twice: once when the component is created, and then +again when the data is loaded by `from_disk`. > ```ini > ### config.cfg @@ -1243,8 +1324,8 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" -exclusive_classes = false +@architectures = "spacy.TextCatBOW.v2" +exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -1273,7 +1354,7 @@ loss is calculated and to add evaluation scores to the training output. | [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | | [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. | | [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | -| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | +| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | @@ -1323,6 +1404,8 @@ There are three main types of extensions, which can be defined using the [`Span.set_extension`](/api/span#set_extension) and [`Token.set_extension`](/api/token#set_extension) methods. +## Description + 1. **Attribute extensions.** Set a default value for an attribute, which can be overwritten manually at any time. Attribute extensions work like "normal" variables and are the quickest way to store arbitrary information on a `Doc`, @@ -1496,24 +1579,33 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
| Name | Customizes | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `user_hooks` | [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) | +| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) | | `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) | | `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) | ```python ### Add custom similarity hooks +from spacy.language import Language + + class SimilarityModel: - def __init__(self, model): - self._model = model + def __init__(self, name: str, index: int): + self.name = name + self.index = index def __call__(self, doc): doc.user_hooks["similarity"] = self.similarity doc.user_span_hooks["similarity"] = self.similarity doc.user_token_hooks["similarity"] = self.similarity + return doc def similarity(self, obj1, obj2): - y = self._model([obj1.vector, obj2.vector]) - return float(y[0]) + return obj1.vector[self.index] + obj2.vector[self.index] + + +@Language.factory("similarity_component", default_config={"index": 0}) +def create_similarity_component(nlp, name, index: int): + return SimilarityModel(name, index) ``` ## Developing plugins and wrappers {#plugins} diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 492345f2f..6f6cef7c8 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -49,6 +49,7 @@ production. Serve your models and host APIs Distributed and parallel training Track your experiments and results +Upload your pipelines to the Hugging Face Hub ### 1. Clone a project template {#clone} @@ -69,9 +70,9 @@ python -m spacy project clone pipelines/tagger_parser_ud By default, the project will be cloned into the current working directory. You can specify an optional second argument to define the output directory. The -`--repo` option lets you define a custom repo to clone from if you don't want -to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You -can also use any private repo you have access to with Git. +`--repo` option lets you define a custom repo to clone from if you don't want to +use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can +also use any private repo you have access to with Git. ### 2. Fetch the project assets {#assets} @@ -221,6 +222,7 @@ pipelines. | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. 
Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | @@ -289,7 +291,7 @@ files you need and not the whole repo. | Name | Description | | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | -| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. | +| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.
`branch`: The branch to download from. Defaults to `"master"`. | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | | `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | @@ -310,8 +312,8 @@ company-internal and not available over the internet. In that case, you can specify the destination paths and a checksum, and leave out the URL. When your teammates clone and run your project, they can place the files in the respective directory themselves. The [`project assets`](/api/cli#project-assets) command -will alert you about missing files and mismatched checksums, so you can ensure that -others are running your project with the same data. +will alert you about missing files and mismatched checksums, so you can ensure +that others are running your project with the same data. ### Dependencies and outputs {#deps-outputs} @@ -358,9 +360,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps automatically. For instance, if you only run the command `train` that depends on data created by `preprocess` and those files are missing, spaCy will show an error – it won't just re-run `preprocess`. If you're looking for more advanced -data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you -can also use `outputs_no_cache` instead of `outputs` to define outputs that -won't be cached or tracked. +data management, check out the [Data Version Control (DVC) integration](#dvc). +If you're planning on integrating your spaCy project with DVC, you can also use +`outputs_no_cache` instead of `outputs` to define outputs that won't be cached +or tracked. ### Files and directory structure {#project-files} @@ -467,7 +470,9 @@ In your `project.yml`, you can then run the script by calling `python scripts/custom_evaluation.py` with the function arguments. You can also use the `vars` section to define reusable variables that will be substituted in commands, paths and URLs. In this example, the batch size is defined as a -variable will be added in place of `${vars.batch_size}` in the script. +variable will be added in place of `${vars.batch_size}` in the script. Just like +in the [training config](/usage/training##config-overrides), you can also +override settings on the command line – for example using `--vars.batch_size`. > #### Calling into Python > @@ -491,6 +496,29 @@ commands: - 'corpus/eval.json' ``` +You can also use the `env` section to reference **environment variables** and +make their values available to the commands. This can be useful for overriding +settings on the command line and passing through system-level settings. + +> #### Usage example +> +> ```bash +> export GPU_ID=1 +> BATCH_SIZE=128 python -m spacy project run evaluate +> ``` + +```yaml +### project.yml +env: + batch_size: BATCH_SIZE + gpu_id: GPU_ID + +commands: + - name: evaluate + script: + - 'python scripts/custom_evaluation.py ${env.batch_size}' +``` + ### Documenting your project {#custom-docs} > #### Readme Example @@ -730,16 +758,6 @@ workflows, but only one can be tracked by DVC. ### Prodigy {#prodigy} - - -The Prodigy integration will require a nightly version of Prodigy that supports -spaCy v3+. 
You can already use annotations created with Prodigy in spaCy v3 by -exporting your data with -[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running -[`spacy convert`](/api/cli#convert) to convert it to the binary format. - - - [Prodigy](https://prodi.gy) is a modern annotation tool for creating training data for machine learning models, developed by us. It integrates with spaCy out-of-the-box and provides many different @@ -748,17 +766,23 @@ with and without a model in the loop. If Prodigy is installed in your project, you can start the annotation server from your `project.yml` for a tight feedback loop between data development and training. -The following example command starts the Prodigy app using the -[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in -suggestions for the given entity labels produced by a pretrained model. You can -then correct the suggestions manually in the UI. After you save and exit the -server, the full dataset is exported in spaCy's format and split into a training -and evaluation set. + + +This integration requires [Prodigy v1.11](https://prodi.gy/docs/changelog#v1.11) +or higher. If you're using an older version of Prodigy, you can still use your +annotations in spaCy v3 by exporting your data with +[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running +[`spacy convert`](/api/cli#convert) to convert it to the binary format. + + + +The following example shows a workflow for merging and exporting NER annotations +collected with Prodigy and training a spaCy pipeline: > #### Example usage > > ```cli -> $ python -m spacy project run annotate +> $ python -m spacy project run all > ``` @@ -766,36 +790,71 @@ and evaluation set. ### project.yml vars: prodigy: - dataset: 'ner_articles' - labels: 'PERSON,ORG,PRODUCT' - model: 'en_core_web_md' + train_dataset: "fashion_brands_training" + eval_dataset: "fashion_brands_eval" + +workflows: + all: + - data-to-spacy + - train_spacy commands: - - name: annotate - - script: - - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}' - - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}' - - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy' - - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy' - - deps: - - 'assets/raw_data.jsonl' - - outputs: - - 'corpus/train.spacy' - - 'corpus/eval.spacy' + - name: "data-to-spacy" + help: "Merge your annotations and create data in spaCy's binary format" + script: + - "python -m prodigy data-to-spacy corpus/ --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset}" + outputs: + - "corpus/train.spacy" + - "corpus/dev.spacy" + - name: "train_spacy" + help: "Train a named entity recognition model with spaCy" + script: + - "python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy" + deps: + - "corpus/train.spacy" + - "corpus/dev.spacy" + outputs: + - "training/model-best" ``` -You can use the same approach for other types of projects and annotation +> #### Example train curve output +> +> [![Screenshot of train curve terminal output](../images/prodigy_train_curve.jpg)](https://prodi.gy/docs/recipes#train-curve) + +The [`train-curve`](https://prodi.gy/docs/recipes#train-curve) recipe is another +cool workflow you can include in your project. 
It will run the training with +different portions of the data, e.g. 25%, 50%, 75% and 100%. As a rule of thumb, +if accuracy increases in the last segment, this could indicate that collecting +more annotations of the same type might improve the model further. + + +```yaml +### project.yml (excerpt) +- name: "train_curve" + help: "Train the model with Prodigy by using different portions of training examples to evaluate if more annotations can potentially improve the performance" + script: + - "python -m prodigy train-curve --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset} --config configs/${vars.config} --show-plot" +``` + +You can use the same approach for various types of projects and annotation workflows, including -[text classification](https://prodi.gy/docs/recipes#textcat), -[dependency parsing](https://prodi.gy/docs/recipes#dep), +[named entity recognition](https://prodi.gy/docs/named-entity-recognition), +[span categorization](https://prodi.gy/docs/span-categorization), +[text classification](https://prodi.gy/docs/text-classification), +[dependency parsing](https://prodi.gy/docs/dependencies-relations), [part-of-speech tagging](https://prodi.gy/docs/recipes#pos) or fully -[custom recipes](https://prodi.gy/docs/custom-recipes) – for instance, an A/B -evaluation workflow that lets you compare two different models and their -results. +[custom recipes](https://prodi.gy/docs/custom-recipes). You can also use spaCy +project templates to quickly start the annotation server to collect more +annotations and add them to your Prodigy dataset. - +Get started with spaCy and Prodigy using our project template. It includes +commands to create a merged training corpus from your Prodigy annotations, +training and packaging a spaCy pipeline and analyzing if more annotations may +improve performance. + + --- @@ -968,7 +1027,7 @@ your results. > > ```ini > [training.logger] -> @loggers = "spacy.WandbLogger.v1" +> @loggers = "spacy.WandbLogger.v2" > project_name = "monitor_spacy_training" > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` @@ -986,3 +1045,68 @@ creating variants of the config for a simple hyperparameter grid search and logging the results. + +--- + +### Hugging Face Hub {#huggingface_hub} + +The [Hugging Face Hub](https://huggingface.co/) lets you upload models and share +them with others. It hosts models as Git-based repositories which are storage +spaces that can contain all your files. It supports versioning, branches and +custom metadata out-of-the-box, and provides browser-based visualizers for +exploring your models interactively, as well as an API for production use. The +[`spacy-huggingface-hub`](https://github.com/explosion/spacy-huggingface-hub) +package automatically adds the `huggingface-hub` command to your `spacy` CLI if +it's installed. + +> #### Installation +> +> ```cli +> $ pip install spacy-huggingface-hub +> # Check that the CLI is registered +> $ python -m spacy huggingface-hub --help +> ``` + +You can then upload any pipeline packaged with +[`spacy package`](/api/cli#package). Make sure to set `--build wheel` to output +a binary `.whl` file. The uploader will read all metadata from the pipeline +package, including the auto-generated pretty `README.md` and the model details +available in the `meta.json`. For examples, check out the +[spaCy pipelines](https://huggingface.co/spacy) we've uploaded.
+ +```cli +$ huggingface-cli login +$ python -m spacy package ./en_ner_fashion ./output --build wheel +$ cd ./output/en_ner_fashion-0.0.0/dist +$ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl +``` + +After uploading, you will see the live URL of your pipeline packages, as well as +the direct URL to the model wheel you can install via `pip install`. You'll also +be able to test your pipeline interactively from your browser: + +![Screenshot: interactive NER visualizer](../images/huggingface_hub.jpg) + +In your `project.yml`, you can add a command that uploads your trained and +packaged pipeline to the hub. You can either run this as a manual step, or +automatically as part of a workflow. Make sure to set `--build wheel` when +running `spacy package` to build a wheel file for your pipeline package. + + +```yaml +### project.yml +- name: "push_to_hub" + help: "Upload the trained model to the Hugging Face Hub" + script: + - "python -m spacy huggingface-hub push packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}-py3-none-any.whl" + deps: + - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}-py3-none-any.whl" +``` + + + +Get started with uploading your models to the Hugging Face hub using our project +template. It trains a simple pipeline, packages it and uploads it if the +packaged model has changed. This makes it easy to deploy your models end-to-end. + + diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 22bf4f470..74bb10304 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -63,7 +63,7 @@ another token that's at least 10 characters long. spaCy features a rule-matching engine, the [`Matcher`](/api/matcher), that operates over tokens, similar to regular expressions. The rules can refer to -token annotations (e.g. the token `text` or `tag_`, and flags (e.g. `IS_PUNCT`). +token annotations (e.g. the token `text` or `tag_`, and flags like `IS_PUNCT`). The rule matcher also lets you pass in a custom callback to act on matches – for example, to merge entities and apply custom labels. You can also associate patterns with entity IDs, to allow some basic entity linking or disambiguation. @@ -232,15 +232,22 @@ following rich comparison attributes are available: > > # Matches tokens of length >= 10 > pattern2 = [{"LENGTH": {">=": 10}}] +> +> # Match based on morph attributes +> pattern3 = [{"MORPH": {"IS_SUBSET": ["Number=Sing", "Gender=Neut"]}}] +> # "", "Number=Sing" and "Number=Sing|Gender=Neut" will match as subsets +> # "Number=Plur|Gender=Neut" will not match +> # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset > ``` -| Attribute | Description | -| -------------------------- | ------------------------------------------------------------------------------------------------------- | -| `IN` | Attribute value is member of a list. ~~Any~~ | -| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | -| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ | -| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ | -| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. 
~~Union[int, float]~~ | +| Attribute | Description | +| -------------------------- | --------------------------------------------------------------------------------------------------------- | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ | +| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | #### Regular expressions {#regex new="2.1"} @@ -422,7 +429,7 @@ matcher.add("HelloWorld", [pattern]) # 🚨 Raises an error: # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' # Pattern 0: -# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2] +# - [pattern -> 2 -> CASEINSENSITIVE] extra fields not permitted ``` @@ -431,7 +438,8 @@ matcher.add("HelloWorld", [pattern]) To move on to a more realistic example, let's say you're working with a large corpus of blog articles, and you want to match all mentions of "Google I/O" (which spaCy tokenizes as `['Google', 'I', '/', 'O'`]). To be safe, you only -match on the uppercase versions, in case someone has written it as "Google i/o". +match on the uppercase versions, avoiding matches with phrases such as "Google +i/o". ```python ### {executable="true"} @@ -1552,7 +1560,7 @@ doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` -An alternative approach would be to an +An alternative approach would be to use an [extension attribute](/usage/processing-pipelines/#custom-components-attributes) like `._.person_title` and add it to `Span` objects (which includes entity spans in `doc.ents`). The advantage here is that the entity text stays intact and can diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 38e80db40..9dad077e7 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -19,9 +19,8 @@ import Serialization101 from 'usage/101/\_serialization.md' When serializing the pipeline, keep in mind that this will only save out the **binary data for the individual components** to allow spaCy to restore them – not the entire objects. This is a good thing, because it makes serialization -safe. But it also means that you have to take care of storing the language name -and pipeline component names as well, and restoring them separately before you -can load in the data. +safe. But it also means that you have to take care of storing the config, which +contains the pipeline configuration and all the relevant settings. > #### Saving the meta and config > @@ -33,24 +32,21 @@ can load in the data. 
```python ### Serialize +config = nlp.config bytes_data = nlp.to_bytes() -lang = nlp.config["nlp"]["lang"] # "en" -pipeline = nlp.config["nlp"]["pipeline"] # ["tagger", "parser", "ner"] ``` ```python ### Deserialize -nlp = spacy.blank(lang) -for pipe_name in pipeline: - nlp.add_pipe(pipe_name) +lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"]) +nlp = lang_cls.from_config(config) nlp.from_bytes(bytes_data) ``` This is also how spaCy does it under the hood when loading a pipeline: it loads the `config.cfg` containing the language and pipeline information, initializes -the language class, creates and adds the pipeline components based on the -defined [factories](/usage/processing-pipeline#custom-components-factories) and -_then_ loads in the binary data. You can read more about this process +the language class, creates and adds the pipeline components based on the config +and _then_ loads in the binary data. You can read more about this process [here](/usage/processing-pipelines#pipelines). ## Serializing Doc objects efficiently {#docs new="2.2"} @@ -658,8 +654,9 @@ $ python -m spacy package ./en_example_pipeline ./packages ``` This command will create a pipeline package directory and will run -`python setup.py sdist` in that directory to create `.tar.gz` archive of your -package that can be installed using `pip install`. +`python setup.py sdist` in that directory to create a binary `.whl` file or +`.tar.gz` archive of your package that can be installed using `pip install`. +Installing the binary wheel is usually more efficient. ```yaml ### Directory structure diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 16b2b0f5a..bd5ea7751 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -6,6 +6,7 @@ menu: - ['Introduction', 'basics'] - ['Quickstart', 'quickstart'] - ['Config System', 'config'] + - ['Training Data', 'training-data'] - ['Custom Training', 'config-custom'] - ['Custom Functions', 'custom-functions'] - ['Initialization', 'initialization'] @@ -45,6 +46,14 @@ you generate a starter config with the **recommended settings** for your specific use case. It's also available in spaCy as the [`init config`](/api/cli#init-config) command. + + +Upgrade to the [latest version of spaCy](/usage) to use the quickstart widget. +For earlier releases, follow the CLI instructions to generate a compatible +config. + + + > #### Instructions: widget > > 1. Select your requirements and settings. @@ -95,6 +104,14 @@ spaCy's binary `.spacy` format. You can either include the data paths in the $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy ``` +> #### Tip: Enable your GPU +> +> Use the `--gpu-id` option to select the GPU: +> +> ```cli +> $ python -m spacy train config.cfg --gpu-id 0 +> ``` + The recommended config settings generated by the quickstart widget and the @@ -185,7 +202,7 @@ sections of a config file are: For a full overview of spaCy's config format and settings, see the [data format documentation](/api/data-formats#config) and -[Thinc's config system docs](https://thinc.ai/usage/config). The settings +[Thinc's config system docs](https://thinc.ai/docs/usage-config). The settings available for the different architectures are documented with the [model architectures API](/api/architectures). See the Thinc documentation for [optimizers](https://thinc.ai/docs/api-optimizers) and @@ -193,6 +210,8 @@ available for the different architectures are documented with the
+ + ### Config lifecycle at runtime and training {#config-lifecycle} A pipeline's `config.cfg` is considered the "single source of truth", both at @@ -282,8 +301,6 @@ fly without having to save to and load from disk. $ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy ``` - - ### Using variable interpolation {#config-interpolation} Another very useful feature of the config system is that it supports variable @@ -337,6 +354,59 @@ that reference this variable.
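As a small, self-contained illustration of how interpolation resolves, the sketch below parses a config string with Thinc's `Config` class. The section names and values are made up for the example:

```python
from thinc.api import Config

config_str = """
[paths]
train = "corpus/train.spacy"

[corpora]

[corpora.train]
path = ${paths.train}
"""

config = Config().from_str(config_str)
# The variable reference is replaced with the value defined under [paths]
print(config["corpora"]["train"]["path"])  # corpus/train.spacy
```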
+## Preparing Training Data {#training-data} + +Training data for NLP projects comes in many different formats. For some common +formats such as CoNLL, spaCy provides [converters](/api/cli#convert) you can use +from the command line. In other cases you'll have to prepare the training data +yourself. + +When converting training data for use in spaCy, the main thing is to create +[`Doc`](/api/doc) objects just like the results you want as output from the +pipeline. For example, if you're creating an NER pipeline, loading your +annotations and setting them as the `.ents` property on a `Doc` is all you need +to worry about. On disk the annotations will be saved as a +[`DocBin`](/api/docbin) in the +[`.spacy` format](/api/data-formats#binary-training), but the details of that +are handled automatically. + +Here's an example of creating a `.spacy` file from some NER annotations. + +```python +### preprocess.py +import spacy +from spacy.tokens import DocBin + +nlp = spacy.blank("en") +training_data = [ + ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING")]), +] +# the DocBin will store the example documents +db = DocBin() +for text, annotations in training_data: + doc = nlp(text) + ents = [] + for start, end, label in annotations: + span = doc.char_span(start, end, label=label) + ents.append(span) + doc.ents = ents + db.add(doc) +db.to_disk("./train.spacy") +``` + +For more examples of how to convert training data from a wide variety of formats +for use with spaCy, look at the preprocessing steps in the +[tutorial projects](https://github.com/explosion/projects/tree/v3/tutorials). + + + +In spaCy v2, the recommended way to store training data was in +[a particular JSON format](/api/data-formats#json-input), but in v3 this format +is deprecated. It's fine as a readable storage format, but there's no need to +convert your data to JSON before creating a `.spacy` file. + + + ## Customizing the pipeline and training {#config-custom} ### Defining pipeline components {#config-components} @@ -404,11 +474,14 @@ as-is. They are also excluded when calling > #### Note on frozen components > > Even though frozen components are not **updated** during training, they will -> still **run** during training and evaluation. This is very important, because -> they may still impact your model's performance – for instance, a sentence -> boundary detector can impact what the parser or entity recognizer considers a -> valid parse. So the evaluation results should always reflect what your -> pipeline will produce at runtime. +> still **run** during evaluation. This is very important, because they may +> still impact your model's performance – for instance, a sentence boundary +> detector can impact what the parser or entity recognizer considers a valid +> parse. So the evaluation results should always reflect what your pipeline will +> produce at runtime. If you want a frozen component to run (without updating) +> during training as well, so that downstream components can use its +> **predictions**, you can add it to the list of +> [`annotating_components`](/usage/training#annotating-components). ```ini [nlp] @@ -419,13 +492,91 @@ pipeline = ["parser", "ner", "textcat", "custom"] frozen_components = ["parser", "custom"] ``` - + When the components in your pipeline [share an embedding layer](/usage/embeddings-transformers#embedding-layers), the -**performance** of your frozen component will be **degraded** if you continue training -other layers with the same underlying `Tok2Vec` instance. 
As a rule of thumb, -ensure that your frozen components are truly **independent** in the pipeline. +**performance** of your frozen component will be **degraded** if you continue +training other layers with the same underlying `Tok2Vec` instance. As a rule of +thumb, ensure that your frozen components are truly **independent** in the +pipeline. + +To automatically replace a shared token-to-vector listener with an independent +copy of the token-to-vector layer, you can use the `replace_listeners` setting +of a sourced component, pointing to the listener layer(s) in the config. For +more details on how this works under the hood, see +[`Language.replace_listeners`](/api/language#replace_listeners). + +```ini +[training] +frozen_components = ["tagger"] + +[components.tagger] +source = "en_core_web_sm" +replace_listeners = ["model.tok2vec"] +``` + + + +### Using predictions from preceding components {#annotating-components new="3.1"} + +By default, components are updated in isolation during training, which means +that they don't see the predictions of any earlier components in the pipeline. A +component receives [`Example.predicted`](/api/example) as input and compares its +predictions to [`Example.reference`](/api/example) without saving its +annotations in the `predicted` doc. + +Instead, if certain components should **set their annotations** during training, +use the setting `annotating_components` in the `[training]` block to specify a +list of components. For example, the feature `DEP` from the parser could be used +as a tagger feature by including `DEP` in the tok2vec `attrs` and including +`parser` in `annotating_components`: + +```ini +### config.cfg (excerpt) {highlight="7,12"} +[nlp] +pipeline = ["parser", "tagger"] + +[components.tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tagger.model.tok2vec.encode.width} +attrs = ["NORM","DEP"] +rows = [5000,2500] +include_static_vectors = false + +[training] +annotating_components = ["parser"] +``` + +Any component in the pipeline can be included as an annotating component, +including frozen components. Frozen components can set annotations during +training just as they would set annotations during evaluation or when the final +pipeline is run. The config excerpt below shows how a frozen `ner` component and +a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the +entity linker during training: + +```ini +### config.cfg (excerpt) +[nlp] +pipeline = ["sentencizer", "ner", "entity_linker"] + +[components.ner] +source = "en_core_web_sm" + +[training] +frozen_components = ["ner"] +annotating_components = ["sentencizer", "ner"] +``` + +Similarly, a pretrained `tok2vec` layer can be frozen and specified in the list +of `annotating_components` to ensure that a downstream component can use the +embedding layer without updating it. + + + +Be aware that non-frozen annotating components with statistical models will +**run twice** on each batch, once to update the model and once to apply the +now-updated model to the predicted docs. @@ -551,16 +702,16 @@ The default score weights are defined by each pipeline component via the components are weighted equally. If a score weight is set to `null`, it will be excluded from the logs and the score won't be weighted. 
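For reference, here is a hypothetical component factory that declares such weights. The component and score names are made up; `None` is the Python equivalent of `null` in the config:

```python
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory(
    "my_scored_component",
    # "my_score" counts towards the final score, "my_speed" is excluded
    default_score_weights={"my_score": 1.0, "my_speed": None},
)
def create_my_scored_component(nlp: Language, name: str):
    def my_scored_component(doc: Doc) -> Doc:
        return doc
    return my_scored_component
```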
- + -| Name | Description | -| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. | -| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. | -| **Recall** (R) | Percentage of reference annotations recovered. Should increase. | -| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. | -| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. | -| **Words per second** (WPS) | Prediction speed in words per second. Should stay stable. | +| Name | Description | +| ----------------- | ----------------------------------------------------------------------------------------------------------------------- | +| **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. | +| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. | +| **Recall** (R) | Percentage of reference annotations recovered. Should increase. | +| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. | +| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. | +| **Speed** | Prediction speed in words per second (WPS). Should stay stable. | Note that if the development data has raw text, some of the gold-standard entities might not align to the predicted tokenization. These tokenization @@ -922,7 +1073,7 @@ import spacy from spacy.tokens import Doc @spacy.registry.architectures("custom_neural_network.v1") -def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: +def custom_neural_network(output_width: int) -> Model[List[Doc], List[Floats2d]]: return create_model(output_width) ``` @@ -1104,8 +1255,8 @@ any other custom workflows. `corpora.train` and `corpora.dev` are used as conventions within spaCy's default configs, but you can also define any other custom blocks. Each section in the corpora config should resolve to a [`Corpus`](/api/corpus) – for example, using spaCy's built-in -[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy` -file. The `train_corpus` and `dev_corpus` fields in the +[corpus reader](/api/top-level#corpus-readers) that takes a path to a binary +`.spacy` file. The `train_corpus` and `dev_corpus` fields in the [`[training]`](/api/data-formats#config-training) block specify where to find the corpus in your config. This makes it easy to **swap out** different corpora by only changing a single config setting. @@ -1116,21 +1267,23 @@ corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be especially useful if you need to split a single file into corpora for training and evaluation, without loading the same file twice. +By default, the training data is loaded into memory and shuffled before each +epoch. If the corpus is **too large to fit into memory** during training, stream +the corpus using a custom reader as described in the next section. + ### Custom data reading and batching {#custom-code-readers-batchers} Some use-cases require **streaming in data** or manipulating datasets on the -fly, rather than generating all data beforehand and storing it to file. 
Instead +fly, rather than generating all data beforehand and storing it to disk. Instead of using the built-in [`Corpus`](/api/corpus) reader, which uses static file paths, you can create and register a custom function that generates -[`Example`](/api/example) objects. The resulting generator can be infinite. When -using this dataset for training, stopping criteria such as maximum number of -steps, or stopping when the loss does not decrease further, can be used. +[`Example`](/api/example) objects. -In this example we assume a custom function `read_custom_data` which loads or -generates texts with relevant text classification annotations. Then, small -lexical variations of the input text are created before generating the final -[`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets -you register the function creating the custom reader in the `readers` +In the following example we assume a custom function `read_custom_data` which +loads or generates texts with relevant text classification annotations. Then, +small lexical variations of the input text are created before generating the +final [`Example`](/api/example) objects. The `@spacy.registry.readers` decorator +lets you register the function creating the custom reader in the `readers` [registry](/api/top-level#registry) and assign it a string name, so it can be used in your config. All arguments on the registered function become available as **config settings** – in this case, `source`. @@ -1173,6 +1326,80 @@ Remember that a registered function should always be a function that spaCy +If the corpus is **too large to load into memory** or the corpus reader is an +**infinite generator**, use the setting `max_epochs = -1` to indicate that the +train corpus should be streamed. With this setting the train corpus is merely +streamed and batched, not shuffled, so any shuffling needs to be implemented in +the corpus reader itself. In the example below, a corpus reader that generates +sentences containing even or odd numbers is used with an unlimited number of +examples for the train corpus and a limited number of examples for the dev +corpus. The dev corpus should always be finite and fit in memory during the +evaluation step. `max_steps` and/or `patience` are used to determine when the +training should stop. 
+ +> #### config.cfg +> +> ```ini +> [corpora.dev] +> @readers = "even_odd.v1" +> limit = 100 +> +> [corpora.train] +> @readers = "even_odd.v1" +> limit = -1 +> +> [training] +> max_epochs = -1 +> patience = 500 +> max_steps = 2000 +> ``` + +```python +### functions.py +from typing import Callable, Iterable, Iterator +from spacy import util +import random +from spacy.training import Example +from spacy import Language + + +@util.registry.readers("even_odd.v1") +def create_even_odd_corpus(limit: int = -1) -> Callable[[Language], Iterable[Example]]: + return EvenOddCorpus(limit) + + +class EvenOddCorpus: + def __init__(self, limit): + self.limit = limit + + def __call__(self, nlp: Language) -> Iterator[Example]: + i = 0 + while i < self.limit or self.limit < 0: + r = random.randint(0, 1000) + cat = r % 2 == 0 + text = "This is sentence " + str(r) + yield Example.from_dict( + nlp.make_doc(text), {"cats": {"EVEN": cat, "ODD": not cat}} + ) + i += 1 +``` + +> #### config.cfg +> +> ```ini +> [initialize.components.textcat.labels] +> @readers = "spacy.read_labels.v1" +> path = "labels/textcat.json" +> require = true +> ``` + +If the train corpus is streamed, the initialize step peeks at the first 100 +examples in the corpus to find the labels for each component. If this isn't +sufficient, you'll need to [provide the labels](#initialization-labels) for each +component in the `[initialize]` block. [`init labels`](/api/cli#init-labels) can +be used to generate JSON files in the correct format, which you can extend with +the full label set. + We can also customize the **batching strategy** by registering a new batcher function in the `batchers` [registry](/api/top-level#registry). A batcher turns a stream of items into a stream of batches. spaCy has several useful built-in @@ -1418,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue. ## Internal training API {#api} - + spaCy gives you full control over the training loop. However, for most use cases, it's recommended to train your pipelines via the @@ -1430,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with +### Training from a Python script {#api-train new="3.2"} + +If you want to run the training from a Python script instead of using the +[`spacy train`](/api/cli#train) CLI command, you can call into the +[`train`](/api/cli#train-function) helper function directly. It takes the path +to the config file, an optional output directory and an optional dictionary of +[config overrides](#config-overrides). + +```python +from spacy.cli.train import train + +train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"}) +``` + +### Internal training loop API {#api-loop} + + + +This section documents how the training loop and updates to the `nlp` object +work internally. You typically shouldn't have to implement this in Python unless +you're writing your own trainable components. To train a pipeline, use +[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper +function instead. + + + The [`Example`](/api/example) object contains annotated training data, also called the **gold standard**. 
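For instance, here's a minimal sketch of creating one via `Example.from_dict`. The text and the entity offsets are purely illustrative:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
# The Doc created here will hold the predictions. The reference annotations are
# passed in as a dict and used to build the gold-standard side of the Example.
predicted = nlp.make_doc("Apple is looking at buying U.K. startup")
example = Example.from_dict(predicted, {"entities": [(0, 5, "ORG")]})
```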
It's initialized with a [`Doc`](/api/doc) object that will hold the predictions, and another `Doc` object that holds the diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index 8d310f1a4..500e43803 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -180,7 +180,7 @@ entirely **in Markdown**, without having to compromise on easy-to-use custom UI components. We're hoping that the Markdown source will make it even easier to contribute to the documentation. For more details, check out the [styleguide](/styleguide) and -[source](https://github.com/explosion/spaCy/tree/master/website). While +[source](https://github.com/explosion/spacy/tree/v2.x/website). While converting the pages to Markdown, we've also fixed a bunch of typos, improved the existing pages and added some new content: diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index b6c4d7dfb..075e1ce81 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -161,8 +161,8 @@ debugging your tokenizer configuration. spaCy's custom warnings have been replaced with native Python [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of -setting `SPACY_WARNING_IGNORE`, use the [`warnings` -filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +setting `SPACY_WARNING_IGNORE`, use the +[`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) to manage warnings. ```diff @@ -176,7 +176,7 @@ import spacy #### Normalization tables The normalization tables have moved from the language data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to the +[`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang) to the package [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If you're adding data for a new language, the normalization table should be added to `spacy-lookups-data`. See @@ -190,8 +190,8 @@ lexemes will be added to the vocab automatically, just as in small models without vectors. To see the number of unique vectors and number of words with vectors, see -`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` -unique vectors and `684830` words with vectors: +`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` unique +vectors and `684830` words with vectors: ```python { @@ -210,8 +210,8 @@ for orth in nlp.vocab.vectors: _ = nlp.vocab[orth] ``` -If your workflow previously iterated over `nlp.vocab`, a similar alternative -is to iterate over words with vectors instead: +If your workflow previously iterated over `nlp.vocab`, a similar alternative is +to iterate over words with vectors instead: ```diff - lexemes = [w for w in nlp.vocab] @@ -220,9 +220,9 @@ is to iterate over words with vectors instead: Be aware that the set of preloaded lexemes in a v2.2 model is not equivalent to the set of words with vectors. For English, v2.2 `md/lg` models have 1.3M -provided lexemes but only 685K words with vectors. The vectors have been -updated for most languages in v2.2, but the English models contain the same -vectors for both v2.2 and v2.3. +provided lexemes but only 685K words with vectors. The vectors have been updated +for most languages in v2.2, but the English models contain the same vectors for +both v2.2 and v2.3. #### Lexeme.is_oov and Token.is_oov @@ -234,8 +234,7 @@ fixed in the next patch release v2.3.1. 
In v2.3, `Lexeme.is_oov` and `Token.is_oov` are `True` if the lexeme does not -have a word vector. This is equivalent to `token.orth not in -nlp.vocab.vectors`. +have a word vector. This is equivalent to `token.orth not in nlp.vocab.vectors`. Previously in v2.2, `is_oov` corresponded to whether a lexeme had stored probability and cluster features. The probability and cluster features are no @@ -270,8 +269,8 @@ as part of the model vocab. To load the probability table into a provided model, first make sure you have `spacy-lookups-data` installed. To load the table, remove the empty provided -`lexeme_prob` table and then access `Lexeme.prob` for any word to load the -table from `spacy-lookups-data`: +`lexeme_prob` table and then access `Lexeme.prob` for any word to load the table +from `spacy-lookups-data`: ```diff + # prerequisite: pip install spacy-lookups-data @@ -321,9 +320,9 @@ the [train CLI](/api/cli#train), you can use the new `--tag-map-path` option to provide in the tag map as a JSON dict. If you want to export a tag map from a provided model for use with the train -CLI, you can save it as a JSON dict. To only use string keys as required by -JSON and to make it easier to read and edit, any internal integer IDs need to -be converted back to strings: +CLI, you can save it as a JSON dict. To only use string keys as required by JSON +and to make it easier to read and edit, any internal integer IDs need to be +converted back to strings: ```python import spacy diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index aee3c24a6..210565c11 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -303,7 +303,7 @@ lookup-based lemmatization – and **many new languages**! **API:** [`Language`](/api/language) **Code:** -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) +[`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang) **Usage:** [Adding languages](/usage/adding-languages) diff --git a/website/docs/usage/v3-1.md b/website/docs/usage/v3-1.md new file mode 100644 index 000000000..1bac8fd81 --- /dev/null +++ b/website/docs/usage/v3-1.md @@ -0,0 +1,320 @@ +--- +title: What's New in v3.1 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New Features {#features hidden="true"} + +It's been great to see the adoption of the new spaCy v3, which introduced +[transformer-based](/usage/embeddings-transformers) pipelines, a new +[config and training system](/usage/training) for reproducible experiments, +[projects](/usage/projects) for end-to-end workflows, and many +[other features](/usage/v3). Version 3.1 adds more on top of it, including the +ability to use predicted annotations during training, a new `SpanCategorizer` +component for predicting arbitrary and potentially overlapping spans, support +for partial incorrect annotations in the entity recognizer, new trained +pipelines for Catalan and Danish, as well as many bug fixes and improvements. + +### Using predicted annotations during training {#predicted-annotations-training} + +By default, components are updated in isolation during training, which means +that they don't see the predictions of any earlier components in the pipeline. +The new +[`[training.annotating_components]`](/usage/training#annotating-components) +config setting lets you specify pipeline components that should set annotations +on the predicted docs during training. 
This makes it easy to use the predictions +of a previous component in the pipeline as features for a subsequent component, +e.g. the dependency labels in the tagger: + +```ini +### config.cfg (excerpt) {highlight="7,12"} +[nlp] +pipeline = ["parser", "tagger"] + +[components.tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tagger.model.tok2vec.encode.width} +attrs = ["NORM","DEP"] +rows = [5000,2500] +include_static_vectors = false + +[training] +annotating_components = ["parser"] +``` + + + +This project shows how to use the `token.dep` attribute predicted by the parser +as a feature for a subsequent tagger component in the pipeline. + + + +### SpanCategorizer for predicting arbitrary and overlapping spans {#spancategorizer tag="experimental"} + +A common task in applied NLP is extracting spans of texts from documents, +including longer phrases or nested expressions. Named entity recognition isn't +the right tool for this problem, since an entity recognizer typically predicts +single token-based tags that are very sensitive to boundaries. This is effective +for proper nouns and self-contained expressions, but less useful for other types +of phrases or overlapping spans. The new +[`SpanCategorizer`](/api/spancategorizer) component and +[SpanCategorizer](/api/architectures#spancategorizer) architecture let you label +arbitrary and potentially overlapping spans of texts. A span categorizer +consists of two parts: a [suggester function](/api/spancategorizer#suggesters) +that proposes candidate spans, which may or may not overlap, and a labeler model +that predicts zero or more labels for each candidate. The predicted spans are +available via the [`Doc.spans`](/api/doc#spans) container. + + + +This project trains a span categorizer for Indonesian NER. + + + + + +[![Prodigy: example of the new manual spans UI](../images/prodigy_spans-manual.jpg)](https://support.prodi.gy/t/3861) + +The upcoming version of our annotation tool [Prodigy](https://prodi.gy) +(currently available as a [pre-release](https://support.prodi.gy/t/3861) for all +users) features a [new workflow and UI](https://support.prodi.gy/t/3861) for +annotating overlapping and nested spans. You can use it to create training data +for spaCy's `SpanCategorizer` component. + + + +### Update the entity recognizer with partial incorrect annotations {#negative-samples} + +> #### config.cfg (excerpt) +> +> ```ini +> [components.ner] +> factory = "ner" +> incorrect_spans_key = "incorrect_spans" +> moves = null +> update_with_oracle_cut_size = 100 +> ``` + +The [`EntityRecognizer`](/api/entityrecognizer) can now be updated with known +incorrect annotations, which lets you take advantage of partial and sparse data. +For example, you'll be able to use the information that certain spans of text +are definitely **not** `PERSON` entities, without having to provide the complete +gold-standard annotations for the given example. The incorrect span annotations +can be added via the [`Doc.spans`](/api/doc#spans) in the training data under +the key defined as [`incorrect_spans_key`](/api/entityrecognizer#init) in the +component config. 
+ +```python +train_doc = nlp.make_doc("Barack Obama was born in Hawaii.") +# The doc.spans key can be defined in the config +train_doc.spans["incorrect_spans"] = [ + Span(train_doc, 0, 2, label="ORG"), + Span(train_doc, 5, 6, label="PRODUCT") +] +``` + + + +### New pipeline packages for Catalan and Danish {#pipeline-packages} + +spaCy v3.1 adds 5 new pipeline packages, including a new core family for Catalan +and a new transformer-based pipeline for Danish using the +[`danish-bert-botxo`](http://huggingface.co/Maltehb/danish-bert-botxo) weights. +See the [models directory](/models) for an overview of all available trained +pipelines and the [training guide](/usage/training) for details on how to train +your own. + +> Thanks to Carlos Rodríguez Penagos and the +> [Barcelona Supercomputing Center](https://temu.bsc.es/) for their +> contributions for Catalan and to Kenneth Enevoldsen for Danish. For additional +> Danish pipelines, check out [DaCy](https://github.com/KennethEnevoldsen/DaCy). + +| Package | Language | UPOS | Parser LAS |  NER F | +| ------------------------------------------------- | -------- | ---: | ---------: | -----: | +| [`ca_core_news_sm`](/models/ca#ca_core_news_sm) | Catalan | 98.2 | 87.4 | 79.8 | +| [`ca_core_news_md`](/models/ca#ca_core_news_md) | Catalan | 98.3 | 88.2 | 84.0 | +| [`ca_core_news_lg`](/models/ca#ca_core_news_lg) | Catalan | 98.5 | 88.4 | 84.2 | +| [`ca_core_news_trf`](/models/ca#ca_core_news_trf) | Catalan | 98.9 | 93.0 | 91.2 | +| [`da_core_news_trf`](/models/da#da_core_news_trf) | Danish | 98.0 | 85.0 | 82.9 | + +### Resizable text classification architectures {#resizable-textcat} + +Previously, the [`TextCategorizer`](/api/textcategorizer) architectures could +not be resized, meaning that you couldn't add new labels to an already trained +model. In spaCy v3.1, the [TextCatCNN](/api/architectures#TextCatCNN) and +[TextCatBOW](/api/architectures#TextCatBOW) architectures are now resizable, +while ensuring that the predictions for the old labels remain the same. + +### CLI command to assemble pipeline from config {#assemble} + +The [`spacy assemble`](/api/cli#assemble) command lets you assemble a pipeline +from a config file without additional training. It can be especially useful for +creating a blank pipeline with a custom tokenizer, rule-based components or word +vectors. + +```cli +$ python -m spacy assemble config.cfg ./output +``` + +### Pretty pipeline package READMEs {#package-readme} + +The [`spacy package`](/api/cli#package) command now auto-generates a pretty +`README.md` based on the pipeline information defined in the `meta.json`. This +includes a table with a general overview, as well as the label scheme and +accuracy figures, if available. For an example, see the +[model releases](https://github.com/explosion/spacy-models/releases). + +### Support for streaming large or infinite corpora {#streaming-corpora} + +> #### config.cfg (excerpt) +> +> ```ini +> [training] +> max_epochs = -1 +> ``` + +The training process now supports streaming large or infinite corpora +out-of-the-box, which can be controlled via the +[`[training.max_epochs]`](/api/data-formats#training) config setting. Setting it +to `-1` means that the train corpus should be streamed rather than loaded into +memory with no shuffling within the training loop. For details on how to +implement a custom corpus loader, e.g. to stream in data from a remote storage, +see the usage guide on +[custom data reading](/usage/training#custom-code-readers-batchers).
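For illustration, such a custom loader could look roughly like the sketch below. The registry name `stream_jsonl.v1` and the JSONL layout with `text` and `cats` fields are assumptions made up for this example:

```python
import srsly
from spacy import util
from spacy.training import Example


@util.registry.readers("stream_jsonl.v1")
def create_jsonl_stream(path: str):
    def stream(nlp):
        # srsly.read_jsonl yields one parsed record at a time, so the full
        # corpus is never held in memory
        for record in srsly.read_jsonl(path):
            doc = nlp.make_doc(record["text"])
            yield Example.from_dict(doc, {"cats": record["cats"]})
    return stream
```

As noted above, the stream isn't shuffled within the training loop, so any shuffling would have to happen inside the reader itself.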
+ +When streaming a corpus, only the first 100 examples will be used for +[initialization](/usage/training#config-lifecycle). This is no problem if you're +training a component like the text classifier with data that specifies all +available labels in every example. If necessary, you can use the +[`init labels`](/api/cli#init-labels) command to pre-generate the labels for +your components using a representative sample so the model can be initialized +correctly before training. + +### New lemmatizers for Catalan and Italian {#pos-lemmatizers} + +The trained pipelines for [Catalan](/models/ca) and [Italian](/models/it) now +include lemmatizers that use the predicted part-of-speech tags as part of the +lookup lemmatization for higher lemmatization accuracy. If you're training your +own pipelines for these languages and you want to include a lemmatizer, make +sure you have the +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package +installed, which provides the relevant tables. + +### Upload your pipelines to the Hugging Face Hub {#huggingface-hub} + +The [Hugging Face Hub](https://huggingface.co/) lets you upload models and share +them with others, and it now supports spaCy pipelines out-of-the-box. The new +[`spacy-huggingface-hub`](https://github.com/explosion/spacy-huggingface-hub) +package automatically adds the `huggingface-hub` command to your `spacy` CLI. It +lets you upload any pipelines packaged with [`spacy package`](/api/cli#package) +and `--build wheel` and takes care of auto-generating all required meta +information. + +After uploading, you'll get a live URL for your model page that includes all +details, files and interactive visualizers, as well as a direct URL to the wheel +file that you can install via `pip install`. For examples, check out the +[spaCy pipelines](https://huggingface.co/spacy) we've uploaded. + +```cli +$ pip install spacy-huggingface-hub +$ huggingface-cli login +$ python -m spacy package ./en_ner_fashion ./output --build wheel +$ cd ./output/en_ner_fashion-0.0.0/dist +$ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl +``` + +You can also integrate the upload command into your +[project template](/usage/projects#huggingface_hub) to automatically upload your +packaged pipelines after training. + + + +Get started with uploading your models to the Hugging Face hub using our project +template. It trains a simple pipeline, packages it and uploads it if the +packaged model has changed. This makes it easy to deploy your models end-to-end. + + + +## Notes about upgrading from v3.0 {#upgrading} + +### Pipeline package version compatibility {#version-compat} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with spaCy v3.0, you will see a +warning telling you that the pipeline may be incompatible. This doesn't +necessarily have to be true, but we recommend running your pipelines against +your test suite or evaluation data to make sure there are no unexpected results. +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. 
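If your upgrade process is scripted, the download can also be triggered from Python. The package name here is only an example:

```python
import spacy
from spacy.cli import download

# Re-download a trained pipeline to pick up the latest compatible version
download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
```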
To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). + +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.0.0,<3.1.0", ++ "spacy_version": ">=3.0.0,<3.2.0", +``` + +### Updating v3.0 configs + +To update a config from spaCy v3.0 with the new v3.1 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```bash +python -m spacy init fill-config config-v3.0.cfg config-v3.1.cfg +``` + +In many cases (`spacy train`, `spacy.load()`), the new defaults will be filled +in automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). + +### Sourcing pipeline components with vectors {#source-vectors} + +If you're sourcing a pipeline component that requires static vectors (for +example, a tagger or parser from an `md` or `lg` pretrained pipeline), be sure +to include the source model's vectors in the setting `[initialize.vectors]`. In +spaCy v3.0, a bug allowed vectors to be loaded implicitly through `source`, +however in v3.1 this setting must be provided explicitly as +`[initialize.vectors]`: + +```ini +### config.cfg (excerpt) +[components.ner] +source = "en_core_web_md" + +[initialize] +vectors = "en_core_web_md" +``` + + + +Each pipeline can only store one set of static vectors, so it's not possible to +assemble a pipeline with components that were trained on different static +vectors. + + + +[`spacy train`](/api/cli#train) and [`spacy assemble`](/api/cli#assemble) will +provide warnings if the source and target pipelines don't contain the same +vectors. If you are sourcing a rule-based component like an entity ruler or +lemmatizer that does not use the vectors as a model feature, then this warning +can be safely ignored. + +### Warnings {#warnings} + +Logger warnings have been converted to Python warnings. Use +[`warnings.filterwarnings`](https://docs.python.org/3/library/warnings.html#warnings.filterwarnings) +or the new helper method `spacy.errors.filter_warning(action, error_msg='')` to +manage warnings. diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 5aa5507f7..980f06172 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -10,6 +10,11 @@ menu: ## Summary {#summary hidden="true"} +> #### 📖 Looking for the old docs? +> +> To help you make the transition from v2.x to v3.0, we've uploaded the old +> website to [**v2.spacy.io**](https://v2.spacy.io/docs). + Want to make the transition from spaCy v2 to spaCy v3 as smooth as possible for @@ -67,6 +72,16 @@ improvements**. The [API docs](/api) include additional deprecation notes. New methods and functions that were introduced in this version are marked with the tag 3. 
+ + + + + + + + + + ### Transformer-based pipelines {#features-transformers} > #### Example @@ -103,11 +118,11 @@ import Benchmarks from 'usage/\_benchmarks-models.md' | Package | Language | Transformer | Tagger | Parser |  NER | | ------------------------------------------------ | -------- | --------------------------------------------------------------------------------------------- | -----: | -----: | ---: | -| [`en_core_web_trf`](/models/en#en_core_web_trf) | English | [`roberta-base`](https://huggingface.co/roberta-base) | 97.8 | 95.0 | 89.4 | +| [`en_core_web_trf`](/models/en#en_core_web_trf) | English | [`roberta-base`](https://huggingface.co/roberta-base) | 97.8 | 95.2 | 89.9 | | [`de_dep_news_trf`](/models/de#de_dep_news_trf) | German | [`bert-base-german-cased`](https://huggingface.co/bert-base-german-cased) | 99.0 | 95.8 | - | | [`es_dep_news_trf`](/models/es#es_dep_news_trf) | Spanish | [`bert-base-spanish-wwm-cased`](https://huggingface.co/dccuchile/bert-base-spanish-wwm-cased) | 98.2 | 94.6 | - | -| [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf) | French | [`camembert-base`](https://huggingface.co/camembert-base) | 95.7 | 94.9 | - | -| [`zh_core_web_trf`](/models/zh#zh_core_news_trf) | Chinese | [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 92.5 | 77.2 | 75.6 | +| [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf) | French | [`camembert-base`](https://huggingface.co/camembert-base) | 95.7 | 94.4 | - | +| [`zh_core_web_trf`](/models/zh#zh_core_news_trf) | Chinese | [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 92.5 | 76.6 | 75.4 | @@ -305,14 +320,15 @@ add to your pipeline and customize for your use case: > nlp.add_pipe("lemmatizer") > ``` -| Name | Description | -| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | -| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | -| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | -| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | -| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | -| [`TrainablePipe`](/api/pipe) | Base class for trainable pipeline components. | +| Name | Description | +| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | +| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | +| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | +| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. 
| +| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | +| [`TrainablePipe`](/api/pipe) | Base class for trainable pipeline components. | +| [`Multi-label TextCategorizer`](/api/textcategorizer) | Trainable component for multi-label text classification. | @@ -446,6 +462,7 @@ The following methods, attributes and commands are new in spaCy v3.0. | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | | [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Doc.spans`](/api/doc#spans) | Named span groups to store and access collections of potentially overlapping spans. Uses the new [`SpanGroup`](/api/spangroup) data structure. | | [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | @@ -574,8 +591,12 @@ Note that spaCy v3.0 now requires **Python 3.6+**. rule-based lemmas. You can now add it to your pipeline explicitly and set its mode on initialization. - Various keyword arguments across functions and methods are now explicitly - declared as _keyword-only_ arguments. Those arguments are documented - accordingly across the API reference. + declared as **keyword-only** arguments. Those arguments are documented + accordingly across the API reference using the keyword-only tag. +- The `textcat` pipeline component is now only applicable for classification of + mutually exclusives classes - i.e. one predicted class per input sentence or + document. To perform multi-label classification, use the new + `textcat_multilabel` component instead. ### Removed or renamed API {#incompat-removed} @@ -587,6 +608,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | `GoldParse` | [`Example`](/api/example) | | `GoldCorpus` | [`Corpus`](/api/corpus) | | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `KnowledgeBase.get_candidates` | [`KnowledgeBase.get_alias_candidates`](/api/kb#get_alias_candidates) | | `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | | `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | | `spacy init-model` | [`spacy init vectors`](/api/cli#init-vectors) | @@ -594,11 +616,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**. 
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | | `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | -The following deprecated methods, attributes and arguments were removed in v3.0. -Most of them have been **deprecated for a while** and many would previously -raise errors. Many of them were also mostly internals. If you've been working -with more recent versions of spaCy v2.x, it's **unlikely** that your code relied -on them. +The following methods, attributes and arguments were removed in v3.0. Most of +them have been **deprecated for a while** and many would previously raise +errors. Many of them were also mostly internals. If you've been working with +more recent versions of spaCy v2.x, it's **unlikely** that your code relied on +them. | Removed | Replacement | | ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -615,10 +637,10 @@ on them. ### Downloading and loading trained pipelines {#migrating-downloading-models} -Symlinks and shortcuts like `en` are now officially deprecated. There are -[many different trained pipelines](/models) with different capabilities and not -just one "English model". In order to download and load a package, you should -always use its full name – for instance, +Symlinks and shortcuts like `en` have been deprecated for a while, and are now +not supported anymore. There are [many different trained pipelines](/models) +with different capabilities and not just one "English model". In order to +download and load a package, you should always use its full name – for instance, [`en_core_web_sm`](/models/en#en_core_web_sm). ```diff @@ -832,6 +854,19 @@ pipeline component, the [`AttributeRuler`](/api/attributeruler). See the you have tag maps and morph rules in the v2.x format, you can load them into the attribute ruler before training using the `[initialize]` block of your config. +### Using Lexeme Tables + +To use tables like `lexeme_prob` when training a model from scratch, you need +to add an entry to the `initialize` block in your config. Here's what that +looks like for the existing trained pipelines: + +```ini +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +``` + > #### What does the initialization do? > > The `[initialize]` block is used when @@ -1042,7 +1077,7 @@ nlp.initialize(lambda: examples) for i in range(20): random.shuffle(examples) for batch in minibatch(examples, size=8): - nlp.update(examples) + nlp.update(batch) ``` `Language.begin_training` and `TrainablePipe.begin_training` have been renamed @@ -1064,8 +1099,10 @@ setting up the label scheme. The [`spacy package`](/api/cli#package) command now automatically builds the installable `.tar.gz` sdist of the Python package, so you don't have to run this -step manually anymore. You can disable the behavior by setting the `--no-sdist` -flag. +step manually anymore. To disable the behavior, you can set `--build none`. You +can also choose to build a binary wheel (which installs more efficiently) by +setting `--build wheel`, or to build both the sdist and wheel by setting +`--build sdist,wheel`. 
```diff python -m spacy package ./output ./packages @@ -1155,3 +1192,16 @@ This means that spaCy knows how to initialize `my_component`, even if your package isn't imported. + +#### Using GPUs in Jupyter notebooks {#jupyter-notebook-gpu} + +In Jupyter notebooks, run [`prefer_gpu`](/api/top-level#spacy.prefer_gpu), +[`require_gpu`](/api/top-level#spacy.require_gpu) or +[`require_cpu`](/api/top-level#spacy.require_cpu) in the same cell as +[`spacy.load`](/api/top-level#spacy.load) to ensure that the model is loaded on +the correct device. + +Due to a bug related to `contextvars` (see the +[bug report](https://github.com/ipython/ipython/issues/11565)), the GPU settings +may not be preserved correctly across cells, resulting in models being loaded on +the wrong device or only partially on GPU. diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index cc73e7e67..072718f91 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -328,6 +328,15 @@ position. } ``` +```python +### ENT input with knowledge base links +{ + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "https://www.wikidata.org/entity/Q95"}], + "title": None +} +``` + ## Using displaCy in a web application {#webapp} If you want to use the visualizers as part of a web application, for example to diff --git a/website/gatsby-config.js b/website/gatsby-config.js index ea88e4890..1d919dc33 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -19,11 +19,11 @@ const universe = require('./meta/universe.json') const DEFAULT_TEMPLATE = path.resolve('./src/templates/index.js') -const isNightly = !!+process.env.SPACY_NIGHTLY || site.nightlyBranches.includes(process.env.BRANCH) -const favicon = isNightly ? `src/images/icon_nightly.png` : `src/images/icon.png` -const binderBranch = isNightly ? 'nightly' : site.binderBranch -const siteUrl = isNightly ? site.siteUrlNightly : site.siteUrl -const domain = isNightly ? site.domainNightly : site.domain +const domain = process.env.BRANCH || site.domain +const siteUrl = `https://${domain}` +const isNightly = site.nightlyBranches.includes(domain) +const isLegacy = site.legacy || !!+process.env.SPACY_LEGACY +const favicon = `src/images/icon${isNightly ? '_nightly' : isLegacy ? '_legacy' : ''}.png` const branch = isNightly ? 'develop' : 'master' // Those variables are going to be replaced in the Markdown, e.g. 
%%GITHUB_SPACY @@ -53,7 +53,8 @@ module.exports = { counts: getCounts(models.languages), universe, nightly: isNightly, - binderBranch, + legacy: isLegacy, + binderBranch: domain, siteUrl, }, diff --git a/website/meta/languages.json b/website/meta/languages.json index 918d62240..2ba117d53 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -1,85 +1,208 @@ { "languages": [ - { "code": "af", "name": "Afrikaans" }, - { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, - { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, - { "code": "bn", "name": "Bengali", "has_examples": true }, - { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, - { "code": "cs", "name": "Czech", "has_examples": true }, + { + "code": "af", + "name": "Afrikaans" + }, + { + "code": "ar", + "name": "Arabic", + "example": "هذه جملة", + "has_examples": true + }, + { + "code": "bg", + "name": "Bulgarian", + "example": "Това е изречение", + "has_examples": true + }, + { + "code": "bn", + "name": "Bengali", + "has_examples": true + }, + { + "code": "ca", + "name": "Catalan", + "example": "Això és una frase.", + "has_examples": true, + "models": [ + "ca_core_news_sm", + "ca_core_news_md", + "ca_core_news_lg", + "ca_core_news_trf" + ] + }, + { + "code": "cs", + "name": "Czech", + "has_examples": true + }, { "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true, - "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] + "models": [ + "da_core_news_sm", + "da_core_news_md", + "da_core_news_lg", + "da_core_news_trf" + ] }, { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"], + "models": [ + "de_core_news_sm", + "de_core_news_md", + "de_core_news_lg", + "de_dep_news_trf" + ], "example": "Dies ist ein Satz.", "has_examples": true }, { "code": "el", "name": "Greek", - "models": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"], + "models": [ + "el_core_news_sm", + "el_core_news_md", + "el_core_news_lg" + ], "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, { "code": "en", "name": "English", - "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"], + "models": [ + "en_core_web_sm", + "en_core_web_md", + "en_core_web_lg", + "en_core_web_trf" + ], "example": "This is a sentence.", "has_examples": true }, { "code": "es", "name": "Spanish", - "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"], + "models": [ + "es_core_news_sm", + "es_core_news_md", + "es_core_news_lg", + "es_dep_news_trf" + ], "example": "Esto es una frase.", "has_examples": true }, - { "code": "et", "name": "Estonian" }, - { "code": "eu", "name": "Basque", "has_examples": true }, - { "code": "fa", "name": "Persian", "has_examples": true }, - { "code": "fi", "name": "Finnish", "has_examples": true }, + { + "code": "et", + "name": "Estonian" + }, + { + "code": "eu", + "name": "Basque", + "has_examples": true + }, + { + "code": "fa", + "name": "Persian", + "has_examples": true + }, + { + "code": "fi", + "name": "Finnish", + "has_examples": true + }, { "code": "fr", "name": "French", - "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"], + "models": [ + "fr_core_news_sm", + "fr_core_news_md", + "fr_core_news_lg", + "fr_dep_news_trf" + ], "example": "C'est une phrase.", "has_examples": true }, - { "code": "ga", 
"name": "Irish" }, - { "code": "gu", "name": "Gujarati", "has_examples": true }, - { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, - { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, - { "code": "hr", "name": "Croatian", "has_examples": true }, - { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, - { "code": "hy", "name": "Armenian", "has_examples": true }, + { + "code": "ga", + "name": "Irish" + }, + { + "code": "gu", + "name": "Gujarati", + "has_examples": true + }, + { + "code": "he", + "name": "Hebrew", + "example": "זהו משפט.", + "has_examples": true + }, + { + "code": "hi", + "name": "Hindi", + "example": "यह एक वाक्य है।", + "has_examples": true + }, + { + "code": "hr", + "name": "Croatian", + "has_examples": true + }, + { + "code": "hu", + "name": "Hungarian", + "example": "Ez egy mondat.", + "has_examples": true + }, + { + "code": "hy", + "name": "Armenian", + "has_examples": true + }, { "code": "id", "name": "Indonesian", "example": "Ini adalah sebuah kalimat.", "has_examples": true }, - { "code": "is", "name": "Icelandic" }, + { + "code": "is", + "name": "Icelandic" + }, { "code": "it", "name": "Italian", - "models": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"], + "models": [ + "it_core_news_sm", + "it_core_news_md", + "it_core_news_lg" + ], "example": "Questa è una frase.", "has_examples": true }, { "code": "ja", "name": "Japanese", - "models": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"], + "models": [ + "ja_core_news_sm", + "ja_core_news_md", + "ja_core_news_lg" + ], "dependencies": [ - { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" }, - { "name": "Mecab", "url": "https://github.com/taku910/mecab" }, + { + "name": "Unidic", + "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" + }, + { + "name": "Mecab", + "url": "https://github.com/taku910/mecab" + }, { "name": "SudachiPy", "url": "https://github.com/WorksApplications/SudachiPy" @@ -88,7 +211,11 @@ "example": "これは文章です。", "has_examples": true }, - { "code": "kn", "name": "Kannada", "has_examples": true }, + { + "code": "kn", + "name": "Kannada", + "has_examples": true + }, { "code": "ko", "name": "Korean", @@ -97,13 +224,29 @@ "name": "mecab-ko", "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" }, - { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, - { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } + { + "name": "mecab-ko-dic", + "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" + }, + { + "name": "natto-py", + "url": "https://github.com/buruzaemon/natto-py" + } ], "example": "이것은 문장입니다.", "has_examples": true }, - { "code": "lb", "name": "Luxembourgish", "has_examples": true }, + { + "code": "ky", + "name": "Kyrgyz", + "example": "Адамга эң кыйыны — күн сайын адам болуу", + "has_examples": true + }, + { + "code": "lb", + "name": "Luxembourgish", + "has_examples": true + }, { "code": "lij", "name": "Ligurian", @@ -114,29 +257,58 @@ "code": "lt", "name": "Lithuanian", "has_examples": true, - "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] + "models": [ + "lt_core_news_sm", + "lt_core_news_md", + "lt_core_news_lg" + ] + }, + { + "code": "lv", + "name": "Latvian" }, - { "code": "lv", "name": "Latvian" }, { "code": "mk", "name": "Macedonian", - "has_examples": false, - "models": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"] + "models": [ + 
"mk_core_news_sm", + "mk_core_news_md", + "mk_core_news_lg" + ] + }, + { + "code": "ml", + "name": "Malayalam", + "has_examples": true + }, + { + "code": "mr", + "name": "Marathi" }, - { "code": "ml", "name": "Malayalam", "has_examples": true }, - { "code": "mr", "name": "Marathi" }, { "code": "nb", "name": "Norwegian Bokmål", "example": "Dette er en setning.", "has_examples": true, - "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] + "models": [ + "nb_core_news_sm", + "nb_core_news_md", + "nb_core_news_lg" + ] + }, + { + "code": "ne", + "name": "Nepali", + "has_examples": true }, - { "code": "ne", "name": "Nepali", "has_examples": true }, { "code": "nl", "name": "Dutch", - "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "models": [ + "nl_core_news_sm", + "nl_core_news_md", + "nl_core_news_lg" + ], "example": "Dit is een zin.", "has_examples": true }, @@ -145,12 +317,20 @@ "name": "Polish", "example": "To jest zdanie.", "has_examples": true, - "models": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"] + "models": [ + "pl_core_news_sm", + "pl_core_news_md", + "pl_core_news_lg" + ] }, { "code": "pt", "name": "Portuguese", - "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"], + "models": [ + "pt_core_news_sm", + "pt_core_news_md", + "pt_core_news_lg" + ], "example": "Esta é uma frase.", "has_examples": true }, @@ -159,94 +339,157 @@ "name": "Romanian", "example": "Aceasta este o propoziție.", "has_examples": true, - "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] + "models": [ + "ro_core_news_sm", + "ro_core_news_md", + "ro_core_news_lg" + ] }, { "code": "ru", "name": "Russian", "has_examples": true, - "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }], - "models": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"] + "dependencies": [ + { + "name": "pymorphy2", + "url": "https://github.com/kmike/pymorphy2" + } + ], + "models": [ + "ru_core_news_sm", + "ru_core_news_md", + "ru_core_news_lg" + ] + }, + { + "code": "sa", + "name": "Sanskrit", + "has_examples": true + }, + { + "code": "si", + "name": "Sinhala", + "example": "මෙය වාක්‍යයකි.", + "has_examples": true + }, + { + "code": "sk", + "name": "Slovak", + "has_examples": true + }, + { + "code": "sl", + "name": "Slovenian" }, - { "code": "sa", "name": "Sanskrit", "has_examples": true }, - { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, - { "code": "sk", "name": "Slovak", "has_examples": true }, - { "code": "sl", "name": "Slovenian" }, { "code": "sq", "name": "Albanian", "example": "Kjo është një fjali.", "has_examples": true }, - { "code": "sr", "name": "Serbian", "has_examples": true }, - { "code": "sv", "name": "Swedish", "has_examples": true }, - { "code": "ta", "name": "Tamil", "has_examples": true }, - { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, + { + "code": "sr", + "name": "Serbian", + "has_examples": true + }, + { + "code": "sv", + "name": "Swedish", + "has_examples": true + }, + { + "code": "ta", + "name": "Tamil", + "has_examples": true + }, + { + "code": "te", + "name": "Telugu", + "example": "ఇది ఒక వాక్యం.", + "has_examples": true + }, { "code": "th", "name": "Thai", "dependencies": [ - { "name": "pythainlp", "url": "https://github.com/wannaphongcom/pythainlp" } + { + "name": "pythainlp", + "url": "https://github.com/wannaphongcom/pythainlp" + } ], "example": "นี่คือประโยค", "has_examples": true }, - { "code": "tl", 
"name": "Tagalog" }, - { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, - { "code": "tt", "name": "Tatar", "has_examples": true }, + { + "code": "tl", + "name": "Tagalog" + }, + { + "code": "tn", + "name": "Setswana", + "has_examples": true + }, + { + "code": "tr", + "name": "Turkish", + "example": "Bu bir cümledir.", + "has_examples": true + }, + { + "code": "tt", + "name": "Tatar", + "has_examples": true + }, { "code": "uk", "name": "Ukrainian", "has_examples": true, - "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] + "dependencies": [ + { + "name": "pymorphy2", + "url": "https://github.com/kmike/pymorphy2" + } + ] + }, + { + "code": "ur", + "name": "Urdu", + "example": "یہ ایک جملہ ہے", + "has_examples": true }, - { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "vi", "name": "Vietnamese", - "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] - }, - { - "code": "lij", - "name": "Ligurian", - "example": "Sta chì a l'é unna fraxe.", - "has_examples": true - }, - { - "code": "hy", - "name": "Armenian", - "has_examples": true - }, - { - "code": "gu", - "name": "Gujarati", - "has_examples": true - }, - { - "code": "ml", - "name": "Malayalam", - "has_examples": true - }, - { - "code": "ne", - "name": "Nepali", - "has_examples": true - }, - { - "code": "mk", - "name": "Macedonian" + "dependencies": [ + { + "name": "Pyvi", + "url": "https://github.com/trungtv/pyvi" + } + ] }, { "code": "xx", "name": "Multi-language", - "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"], + "models": [ + "xx_ent_wiki_sm", + "xx_sent_ud_sm" + ], "example": "This is a sentence about Facebook." }, - { "code": "yo", "name": "Yoruba", "has_examples": true }, + { + "code": "yo", + "name": "Yoruba", + "has_examples": true + }, { "code": "zh", "name": "Chinese", - "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"], + "models": [ + "zh_core_web_sm", + "zh_core_web_md", + "zh_core_web_lg", + "zh_core_web_trf" + ], "dependencies": [ { "name": "Jieba", @@ -261,18 +504,57 @@ } ], "licenses": [ - { "id": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/" }, - { "id": "CC BY-SA", "url": "https://creativecommons.org/licenses/by-sa/3.0/" }, - { "id": "CC BY-SA 3.0", "url": "https://creativecommons.org/licenses/by-sa/3.0/" }, - { "id": "CC BY-SA 4.0", "url": "https://creativecommons.org/licenses/by-sa/4.0/" }, - { "id": "CC BY-NC", "url": "https://creativecommons.org/licenses/by-nc/3.0/" }, - { "id": "CC BY-NC 3.0", "url": "https://creativecommons.org/licenses/by-nc/3.0/" }, - { "id": "CC BY-NC 4.0", "url": "https://creativecommons.org/licenses/by-nc/4.0/" }, - { "id": "CC-BY-NC-SA 3.0", "url": "https://creativecommons.org/licenses/by-nc-sa/3.0/" }, - { "id": "GPL", "url": "https://www.gnu.org/licenses/gpl.html" }, - { "id": "GPU GPL 3.0", "url": "https://www.gnu.org/licenses/gpl-3.0.en.html" }, - { "id": "LGPL", "url": "https://www.gnu.org/licenses/lgpl.html" }, - { "id": "MIT", "url": "https://opensource.org/licenses/MIT" }, - { "id": "LGPL-LR", "url": "https://github.com/UniversalDependencies/UD_French-Sequoia/blob/master/LICENSE.txt" } + { + "id": "CC BY 4.0", + "url": "https://creativecommons.org/licenses/by/4.0/" + }, + { + "id": "CC BY-SA", + "url": "https://creativecommons.org/licenses/by-sa/3.0/" + }, + { + "id": "CC BY-SA 3.0", + "url": "https://creativecommons.org/licenses/by-sa/3.0/" + }, + { + "id": "CC BY-SA 4.0", + "url": 
"https://creativecommons.org/licenses/by-sa/4.0/" + }, + { + "id": "CC BY-NC", + "url": "https://creativecommons.org/licenses/by-nc/3.0/" + }, + { + "id": "CC BY-NC 3.0", + "url": "https://creativecommons.org/licenses/by-nc/3.0/" + }, + { + "id": "CC BY-NC 4.0", + "url": "https://creativecommons.org/licenses/by-nc/4.0/" + }, + { + "id": "CC-BY-NC-SA 3.0", + "url": "https://creativecommons.org/licenses/by-nc-sa/3.0/" + }, + { + "id": "GPL", + "url": "https://www.gnu.org/licenses/gpl.html" + }, + { + "id": "GPU GPL 3.0", + "url": "https://www.gnu.org/licenses/gpl-3.0.en.html" + }, + { + "id": "LGPL", + "url": "https://www.gnu.org/licenses/lgpl.html" + }, + { + "id": "MIT", + "url": "https://opensource.org/licenses/MIT" + }, + { + "id": "LGPL-LR", + "url": "https://github.com/UniversalDependencies/UD_French-Sequoia/blob/master/LICENSE.txt" + } ] } diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index d3a0726e6..6fe09f052 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -9,7 +9,8 @@ { "text": "Models & Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": "/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, - { "text": "New in v3.0", "url": "/usage/v3" } + { "text": "New in v3.0", "url": "/usage/v3" }, + { "text": "New in v3.1", "url": "/usage/v3-1" } ] }, { @@ -37,7 +38,8 @@ { "label": "Resources", "items": [ - { "text": "Project Templates", "url": "https://github.com/explosion/projects" } + { "text": "Project Templates", "url": "https://github.com/explosion/projects" }, + { "text": "v2.x Documentation", "url": "https://v2.spacy.io" } ] } ] @@ -93,6 +95,7 @@ { "text": "Morphologizer", "url": "/api/morphologizer" }, { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "SpanCategorizer", "url": "/api/spancategorizer" }, { "text": "Tagger", "url": "/api/tagger" }, { "text": "TextCategorizer", "url": "/api/textcategorizer" }, { "text": "Tok2Vec", "url": "/api/tok2vec" }, @@ -131,6 +134,10 @@ { "text": "Classes", "url": "/api/cython-classes" }, { "text": "Structs", "url": "/api/cython-structs" } ] + }, + { + "label": "Legacy", + "items": [{ "text": "Legacy functions", "url": "/api/legacy" }] } ] } diff --git a/website/meta/site.json b/website/meta/site.json index fcff96b56..b8f1a58ef 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -2,11 +2,9 @@ "title": "spaCy", "description": "spaCy is a free open-source library for Natural Language Processing in Python. 
It features NER, POS tagging, dependency parsing, word vectors and more.", "slogan": "Industrial-strength Natural Language Processing in Python", - "siteUrl": "https://spacy.io", "domain": "spacy.io", - "siteUrlNightly": "https://nightly.spacy.io", - "domainNightly": "nightly.spacy.io", "nightlyBranches": ["nightly.spacy.io"], + "legacy": false, "email": "contact@explosion.ai", "company": "Explosion", "companyUrl": "https://explosion.ai", @@ -28,8 +26,8 @@ "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", - "binderBranch": "live", - "binderVersion": "3.0.0", + "binderBranch": "spacy.io", + "binderVersion": "3.0", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json index 8136b3e96..0ffcbfb33 100644 --- a/website/meta/type-annotations.json +++ b/website/meta/type-annotations.json @@ -43,6 +43,7 @@ "cymem.Pool": "https://github.com/explosion/cymem", "preshed.BloomFilter": "https://github.com/explosion/preshed", "transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding", + "transformers.file_utils.ModelOutput": "https://huggingface.co/transformers/main_classes/output.html#modeloutput", "torch.Tensor": "https://pytorch.org/docs/stable/tensors.html", "numpy.ndarray": "https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html", "Match": "https://docs.python.org/3/library/re.html#match-objects", diff --git a/website/meta/universe.json b/website/meta/universe.json index d5768d73b..80608c77d 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,36 +1,248 @@ { "resources": [ - { + { + "id": "nlpcloud", + "title": "NLPCloud.io", + "slogan": "Production-ready API for spaCy models in production", + "description": "A highly-available hosted API to easily deploy and use spaCy models in production. 
Supports NER, POS tagging, dependency parsing, and tokenization.", + "github": "nlpcloud", + "pip": "nlpcloud", + "code_example": [ + "import nlpcloud", + "", + "client = nlpcloud.Client('en_core_web_lg', '4eC39HqLyjWDarjtT1zdp7dc')", + "client.entities('John Doe is a Go Developer at Google')", + "# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]" + ], + "thumb": "https://avatars.githubusercontent.com/u/77671902", + "image": "https://nlpcloud.io/assets/images/logo.svg", + "code_language": "python", + "author": "NLPCloud.io", + "author_links": { + "github": "nlpcloud", + "twitter": "cloud_nlp", + "website": "https://nlpcloud.io" + }, + "category": ["apis", "nonpython", "standalone"], + "tags": ["api", "deploy", "production"] + }, + { + "id": "denomme", + "title": "denomme : Multilingual Name Detector", + "slogan": "Multilingual Name Detection", + "description": "A SpaCy extension for Spans to extract multilingual names out of documents trained on XLM-roberta backbone", + "github": "meghanabhange/denomme", + "pip": "denomme https://denomme.s3.us-east-2.amazonaws.com/xx_denomme-0.3.1/dist/xx_denomme-0.3.1.tar.gz", + "code_example": [ + "from spacy.lang.xx import MultiLanguage", + "from denomme.name import person_name_component", + "nlp = MultiLanguage()", + "nlp.add_pipe('denomme')", + "doc = nlp('Hi my name is Meghana S.R Bhange and I want to talk Asha')", + "print(doc._.person_name)", + "# ['Meghana S.R Bhange', 'Asha']" + ], + "thumb": "https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png", + "code_language": "python", + "author": "Meghana Bhange", + "author_links": { + "github": "meghanabhange", + "twitter": "_aspiringcat" + }, + "category": ["standalone"], + "tags": ["person-name-detection"] + }, + { + "id": "eMFDscore", + "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python", + "slogan": "Extended Moral Foundation Dictionary Scoring for Python", + "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is built on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.", + "github": "medianeuroscience/emfdscore", + "code_example": [ + "from emfdscore.scoring import score_docs", + "import pandas as pd", + "template_input = pd.read_csv('emfdscore/template_input.csv', header=None)", + "DICT_TYPE = 'emfd'", + "PROB_MAP = 'single'", + "SCORE_METHOD = 'bow'", + "OUT_METRICS = 'vice-virtue'", + "OUT_CSV_PATH = 'single-vv.csv'", + "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)" + ], + "code_language": "python", + "author": "Media Neuroscience Lab", + "author_links": { + "github": "medianeuroscience", + "twitter": "medianeuro" + }, + "category": ["research", "teaching"], + "tags": ["morality", "dictionary", "sentiment"] + }, + { + "id": "skweak", + "title": "skweak", + "slogan": "Weak supervision for NLP", + "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. 
Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.", + "github": "NorskRegnesentral/skweak", + "pip": "skweak", + "code_example": [ + "import spacy, re", + "from skweak import heuristics, gazetteers, aggregation, utils", + "", + "# LF 1: heuristic to detect occurrences of MONEY entities", + "def money_detector(doc):", + " for tok in doc[1:]:", + " if tok.text[0].isdigit() and tok.nbor(-1).is_currency:", + " yield tok.i-1, tok.i+1, 'MONEY'", + "lf1 = heuristics.FunctionAnnotator('money', money_detector)", + "", + "# LF 2: detection of years with a regex", + "lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')", + "", + "# LF 3: a gazetteer with a few names", + "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]", + "trie = gazetteers.Trie(NAMES)", + "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})", + "", + "# We create a corpus (here with a single text)", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')", + "", + "# apply the labelling functions", + "doc = lf3(lf2(lf1(doc)))", + "", + "# and aggregate them", + "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])", + "hmm.fit_and_aggregate([doc])", + "", + "# we can then visualise the final result (in Jupyter)", + "utils.display_entities(doc, 'hmm')" + ], + "code_language": "python", + "url": "https://github.com/NorskRegnesentral/skweak", + "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg", + "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg", + "author": "Pierre Lison", + "author_links": { + "twitter": "plison2", + "github": "plison", + "website": "https://www.nr.no/~plison" + }, + "category": ["pipeline", "standalone", "research", "training"], + "tags": [] + }, + { + "id": "numerizer", + "title": "numerizer", + "slogan": "Convert natural language numerics into ints and floats.", + "description": "A SpaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.", + "github": "jaidevd/numerizer", + "pip": "numerizer", + "code_example": [ + "from spacy import load", + "import numerizer", + "nlp = load('en_core_web_sm') # or any other model", + "doc = nlp('The Hogwarts Express is at platform nine and three quarters')", + "doc._.numerize()", + "# {nine and three quarters: '9.75'}" + ], + "author": "Jaidev Deshpande", + "author_links": { + "github": "jaidevd", + "twitter": "jaidevd" + }, + "category": ["standalone"] + }, + { + "id": "spikex", + "title": "SpikeX - SpaCy Pipes for Knowledge Extraction", + "slogan": "Use SpikeX to build knowledge extraction tools with almost-zero effort", + "description": "SpikeX is a collection of pipes ready to be plugged in a spaCy pipeline. 
It aims to help in building knowledge extraction tools with almost-zero effort.", + "github": "erre-quadro/spikex", + "pip": "spikex", + "code_example": [ + "from spacy import load as spacy_load", + "from spikex.wikigraph import load as wg_load", + "from spikex.pipes import WikiPageX", + "", + "# load a spacy model and get a doc", + "nlp = spacy_load('en_core_web_sm')", + "doc = nlp('An apple a day keeps the doctor away')", + "# load a WikiGraph", + "wg = wg_load('simplewiki_core')", + "# get a WikiPageX and extract all pages", + "wikipagex = WikiPageX(wg)", + "doc = wikipagex(doc)", + "# see all pages extracted from the doc", + "for span in doc._.wiki_spans:", + " print(span._.wiki_pages)" + ], + "category": ["pipeline", "standalone"], + "author": "Erre Quadro", + "author_links": { + "github": "erre-quadro", + "website": "https://www.errequadrosrl.com" + } + }, + { + "id": "spacy-dbpedia-spotlight", + "title": "DBpedia Spotlight for SpaCy", + "slogan": "Use DBpedia Spotlight to link entities inside SpaCy", + "description": "This library links SpaCy with [DBpedia Spotlight](https://www.dbpedia-spotlight.org/). You can easily get the DBpedia entities from your documents, using the public web service or by using your own instance of DBpedia Spotlight. The `doc.ents` are populated with the entities and all their details (URI, type, ...).", + "github": "MartinoMensio/spacy-dbpedia-spotlight", + "pip": "spacy-dbpedia-spotlight", + "code_example": [ + "import spacy", + "import spacy_dbpedia_spotlight", + "# load your model as usual", + "nlp = spacy.load('en_core_web_lg')", + "# add the pipeline stage", + "nlp.add_pipe('dbpedia_spotlight')", + "# get the document", + "doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')", + "# see the entities", + "print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])", + "# inspect the raw data from DBpedia spotlight", + "print(doc.ents[0]._.dbpedia_raw_result)" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, + { "id": "spacy-textblob", "title": "spaCyTextBlob", - "slogan": "Easy sentiment analysis for spaCy using TextBlob", - "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extenstion `._.sentiment` to `Doc`, `Span`, and `Token` objects.", + "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!", + "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png", + "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2, please use `pip install spacytextblob==0.1.7`", "github": "SamEdwardes/spaCyTextBlob", "pip": "spacytextblob", "code_example": [ - "import spacy", - "from spacytextblob.spacytextblob import SpacyTextBlob", - "", - "nlp = spacy.load('en_core_web_sm')", - "spacy_text_blob = SpacyTextBlob()", - "nlp.add_pipe(spacy_text_blob)", - "text = 'I had a really horrible day. It was the worst day ever! 
But every now and then I have a really good day that makes me happy.'",  - "doc = nlp(text)", - "doc._.sentiment.polarity # Polarity: -0.125", - "doc._.sentiment.subjectivity # Sujectivity: 0.9", - "doc._.sentiment.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" + "import spacy", + "from spacytextblob.spacytextblob import SpacyTextBlob", + "", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe('spacytextblob')", + "text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'", + "doc = nlp(text)", + "doc._.polarity # Polarity: -0.125", + "doc._.subjectivity # Subjectivity: 0.9", + "doc._.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" ], "code_language": "python", "url": "https://spacytextblob.netlify.app/", "author": "Sam Edwardes", "author_links": { - "twitter": "TheReaLSamlam", - "github": "SamEdwardes", - "website": "https://samedwardes.com" + "twitter": "TheReaLSamlam", + "github": "SamEdwardes", + "website": "https://samedwardes.com" }, "category": ["pipeline"], "tags": ["sentiment", "textblob"] - }, + }, { "id": "spacy-ray", "title": "spacy-ray", @@ -82,7 +294,7 @@ "", "models = [\"en_core_web_sm\", \"en_core_web_md\"]", "default_text = \"Sundar Pichai is the CEO of Google.\"", - "spacy_streamlit.visualize(models, default_text))" + "spacy_streamlit.visualize(models, default_text)" ], "author": "Ines Montani", "author_links": { @@ -172,6 +384,49 @@ "website": "https://koaning.io" } }, + { + "id": "tokenwiser", + "title": "tokenwiser", + "slogan": "Connect vowpal-wabbit & scikit-learn models to spaCy to run simple classification benchmarks. Comes with many utility functions for spaCy pipelines.", + "github": "koaning/tokenwiser", + "pip": "tokenwiser", + "thumb": "https://koaning.github.io/tokenwiser/token.png", + "image": "https://koaning.github.io/tokenwiser/logo-tokw.png", + "code_example": [ + "import spacy", + "", + "from sklearn.pipeline import make_pipeline", + "from sklearn.feature_extraction.text import CountVectorizer", + "from sklearn.linear_model import LogisticRegression", + "", + "from tokenwiser.component import attach_sklearn_categoriser", + "", + "X = [", + " 'i really like this post',", + " 'thanks for that comment',", + " 'i enjoy this friendly forum',", + " 'this is a bad post',", + " 'i dislike this article',", + " 'this is not well written'", + "]", + "", + "y = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']", + "", + "# Note that we're training a pipeline here via a single-batch `.fit()` method", + "pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)", + "", + "nlp = spacy.load('en_core_web_sm')", + "# This is where we attach our pre-trained model as a pipeline step.", + "attach_sklearn_categoriser(nlp, pipe_name='silly_sentiment', estimator=pipe)" + ], + "category": ["pipeline", "training"], + "author": "Vincent D. 
Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning", + "website": "https://koaning.io" + } + }, { "id": "spacy-stanza", "title": "spacy-stanza", @@ -182,10 +437,10 @@ "thumb": "https://i.imgur.com/myhLjMJ.png", "code_example": [ "import stanza", - "from spacy_stanza import StanzaLanguage", + "import spacy_stanza", "", - "snlp = stanza.Pipeline(lang=\"en\")", - "nlp = StanzaLanguage(snlp)", + "stanza.download(\"en\")", + "nlp = spacy_stanza.load_pipeline(\"en\")", "", "doc = nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")", "for token in doc:", @@ -200,6 +455,32 @@ "website": "https://explosion.ai" } }, + { + "id": "spacy-udpipe", + "title": "spacy-udpipe", + "slogan": "Use the latest UDPipe models directly in spaCy", + "description": "This package wraps the fast and efficient UDPipe language-agnostic NLP pipeline (via its Python bindings), so you can use UDPipe pre-trained models as a spaCy pipeline for 50+ languages out-of-the-box. Inspired by spacy-stanza, this package offers slightly less accurate models that are in turn much faster.", + "github": "TakeLab/spacy-udpipe", + "pip": "spacy-udpipe", + "code_example": [ + "import spacy_udpipe", + "", + "spacy_udpipe.download(\"en\") # download English model", + "", + "text = \"Wikipedia is a free online encyclopedia, created and edited by volunteers around the world.\"", + "nlp = spacy_udpipe.load(\"en\")", + "", + "doc = nlp(text)", + "for token in doc:", + " print(token.text, token.lemma_, token.pos_, token.dep_)" + ], + "category": ["pipeline", "standalone", "models", "research"], + "author": "TakeLab", + "author_links": { + "github": "TakeLab", + "website": "https://takelab.fer.hr/" + } + }, { "id": "spacy-server", "title": "spaCy Server", @@ -235,12 +516,12 @@ "title": "NeuroNER", "slogan": "Named-entity recognition using neural networks", "github": "Franck-Dernoncourt/NeuroNER", + "category": ["models"], "pip": "pyneuroner[cpu]", "code_example": [ "from neuroner import neuromodel", "nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)" ], - "category": ["ner"], "tags": ["standalone"] }, { @@ -282,7 +563,7 @@ "trainer = ListTrainer(chatbot)", "trainer.train([", "'Hi, can I help you?',", - "'Sure, I would like to book a flight to Iceland.", + "'Sure, I would like to book a flight to Iceland.',", "'Your flight has been booked.'", "])", "", @@ -332,7 +613,7 @@ "id": "spacymoji", "slogan": "Emoji handling and meta data as a spaCy pipeline component", "github": "ines/spacymoji", - "description": "spaCy v2.0 extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", + "description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. 
The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", "pip": "spacymoji", "category": ["pipeline"], "tags": ["emoji", "unicode"], @@ -361,6 +642,32 @@ "website": "https://ines.io" } }, + { + "id": "spacyopentapioca", + "title": "spaCyOpenTapioca", + "slogan": "Named entity linking on Wikidata in spaCy via OpenTapioca", + "description": "A spaCy wrapper of OpenTapioca for named entity linking on Wikidata", + "github": "UB-Mannheim/spacyopentapioca", + "pip": "spacyopentapioca", + "code_example": [ + "import spacy", + "nlp = spacy.blank('en')", + "nlp.add_pipe('opentapioca')", + "doc = nlp('Christian Drosten works in Germany.')", + "for span in doc.ents:", + " print((span.text, span.kb_id_, span.label_, span._.description, span._.score))", + "# ('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 3.6533377082098895)", + "# ('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.1099332471902863)", + "## Check also span._.types, span._.aliases, span._.rank" + ], + "category": ["models", "pipeline"], + "tags": ["NER", "NEL"], + "author": "Renat Shigapov", + "author_links": { + "twitter": "_shigapov", + "github": "shigapov" + } + }, { "id": "spacy_hunspell", "slogan": "Add spellchecking and spelling suggestions to your spaCy pipeline using Hunspell", @@ -633,6 +940,54 @@ "category": ["pipeline"], "tags": ["lemmatizer", "danish"] }, + { + "id": "dacy", + "title": "DaCy", + "slogan": "An efficient Pipeline for Danish NLP", + "description": "DaCy is a Danish preprocessing pipeline trained in SpaCy. It has achieved State-of-the-Art performance on Named entity recognition, part-of-speech tagging and dependency parsing for Danish. This repository contains material for using the DaCy, reproducing the results and guides on usage of the package. 
Furthermore, it also contains a series of behavioural test for biases and robustness of Danish NLP pipelines.", + "github": "centre-for-humanities-computing/DaCy", + "pip": "dacy", + "code_example": [ + "import dacy", + "print(dacy.models()) # get a list of dacy models", + "nlp = dacy.load('medium') # load your spacy pipeline", + "", + "# DaCy also includes functionality for adding other Danish models to the pipeline", + "# For instance you can add the BertTone model for classification of sentiment polarity to the pipeline:", + "nlp = add_berttone_polarity(nlp)" + ], + "thumb": "https://github.com/centre-for-humanities-computing/DaCy/blob/main/img/icon_no_title.png?raw=true", + "author": "Centre for Humanities Computing Aarhus", + "author_links": { + "github": "centre-for-humanities-computing", + "website": "https://chcaa.io/#/" + }, + "category": ["pipeline"], + "tags": ["pipeline", "danish"] + }, + { + "id": "textdescriptives", + "title": "TextDescriptives", + "slogan": "Extraction of descriptive stats, readability, and syntactic complexity measures", + "description": "Pipeline component for spaCy v.3 that calculates descriptive statistics, readability metrics, and syntactic complexity (dependency distance).", + "github": "HLasse/TextDescriptives", + "pip": "textdescriptives", + "code_example": [ + "import spacy", + "import textdescriptives as td", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe('textdescriptives')", + "doc = nlp('This is a short test text')", + "doc._.readability # access some of the values", + "td.extract_df(doc) # extract all metrics to DataFrame" + ], + "author": "Lasse Hansen, Kenneth Enevoldsen, Ludvig Olsen", + "author_links": { + "github": "HLasse" + }, + "category": ["pipeline"], + "tags": ["pipeline", "readability", "syntactic complexity", "descriptive statistics"] + }, { "id": "wmd-relax", "slogan": "Calculates word mover's distance insanely fast", @@ -780,6 +1135,26 @@ }, "category": ["visualizers"] }, + { + "id": "deplacy", + "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis", + "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.", + "github": "KoichiYasuoka/deplacy", + "image": "https://i.imgur.com/6uOI4Op.png", + "code_example": [ + "import spacy", + "import deplacy", + "", + "nlp=spacy.load('en_core_web_sm')", + "doc=nlp('I saw a horse yesterday which had no name.')", + "deplacy.render(doc)" + ], + "author": "Koichi Yasuoka", + "author_links": { + "github": "KoichiYasuoka" + }, + "category": ["visualizers"] + }, { "id": "scattertext", "slogan": "Beautiful visualizations of how language differs among document types", @@ -895,7 +1270,7 @@ "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. 
– delegated to another library, `textacy` focuses on the tasks that come before and follow after.", "github": "chartbeat-labs/textacy", "pip": "textacy", - "url": "https://chartbeat-labs.github.io/textacy/", + "url": "https://github.com/chartbeat-labs/textacy", "author": "Burton DeWilde", "author_links": { "github": "bdewilde", @@ -988,20 +1363,19 @@ "url": "https://explosion.ai/demos/sense2vec", "code_example": [ "import spacy", - "from sense2vec import Sense2VecComponent", "", - "nlp = spacy.load('en')", - "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", - "nlp.add_pipe(s2v)", + "nlp = spacy.load(\"en_core_web_sm\")", + "s2v = nlp.add_pipe(\"sense2vec\")", + "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")", "", "doc = nlp(\"A sentence about natural language processing.\")", - "assert doc[3].text == 'natural language processing'", - "freq = doc[3]._.s2v_freq", - "vector = doc[3]._.s2v_vec", - "most_similar = doc[3]._.s2v_most_similar(3)", - "# [(('natural language processing', 'NOUN'), 1.0),", - "# (('machine learning', 'NOUN'), 0.8986966609954834),", - "# (('computer vision', 'NOUN'), 0.8636297583580017)]" + "assert doc[3:6].text == \"natural language processing\"", + "freq = doc[3:6]._.s2v_freq", + "vector = doc[3:6]._.s2v_vec", + "most_similar = doc[3:6]._.s2v_most_similar(3)", + "# [(('machine learning', 'NOUN'), 0.8986967),", + "# (('computer vision', 'NOUN'), 0.8636297),", + "# (('deep learning', 'NOUN'), 0.8573361)]" ], "category": ["pipeline", "standalone", "visualizers"], "tags": ["vectors"], @@ -1074,6 +1448,35 @@ }, "category": ["nonpython"] }, + { + "id": "ruby-spacy", + "title": "ruby-spacy", + "slogan": "Wrapper module for using spaCy from Ruby via PyCall", + "description": "ruby-spacy is a wrapper module for using spaCy from the Ruby programming language via PyCall. This module aims to make it easy and natural for Ruby programmers to use spaCy.", + "github": "yohasebe/ruby-spacy", + "code_example": [ + "require \"ruby-spacy\"", + "require \"terminal-table\"", + "nlp = Spacy::Language.new(\"en_core_web_sm\")", + "doc = nlp.read(\"Apple is looking at buying U.K. startup for $1 billion\")", + "headings = [\"text\", \"lemma\", \"pos\", \"tag\", \"dep\"]", + "rows = []", + "doc.each do |token|", + " rows << [token.text, token.lemma, token.pos, token.tag, token.dep]", + "end", + "table = Terminal::Table.new rows: rows, headings: headings", + "puts table" + ], + "code_language": "ruby", + "url": "https://rubygems.org/gems/ruby-spacy", + "author": "Yoichiro Hasebe", + "author_links": { + "github": "yohasebe", + "twitter": "yohasebe" + }, + "category": ["nonpython"], + "tags": ["ruby"] + }, { "id": "spacy_api", "slogan": "Server/client to load models in a separate, dedicated process", @@ -1279,6 +1682,38 @@ "author": "Bhargav Srinivasa-Desikan", "category": ["books"] }, + { + "type": "education", + "id": "mastering-spacy", + "title": "Mastering spaCy", + "slogan": "Packt, 2021", + "description": "This is your ultimate spaCy book. Master the crucial skills to use spaCy components effectively to create real-world NLP applications with spaCy. Explaining linguistic concepts such as dependency parsing, POS-tagging and named entity extraction with many examples, this book will help you to conquer computational linguistics with spaCy. The book further focuses on ML topics with Keras and Tensorflow. You'll cover popular topics, including intent recognition, sentiment analysis and context resolution; and use them on popular datasets and interpret the results. 
A special hands-on section on chatbot design is included.", + "github": "PacktPublishing/Mastering-spaCy", + "cover": "https://tinyimg.io/i/aWEm0dh.jpeg", + "url": "https://www.amazon.com/Mastering-spaCy-end-end-implementing/dp/1800563353", + "author": "Duygu Altinok", + "author_links": { + "github": "DuyguA", + "website": "https://www.linkedin.com/in/duygu-altinok-4021389a" + }, + "category": ["books"] + }, + { + "type": "education", + "id": "applied-nlp-in-enterprise", + "title": "Applied Natural Language Processing in the Enterprise: Teaching Machines to Read, Write, and Understand", + "slogan": "O'Reilly, 2021", + "description": "Natural language processing (NLP) is one of the hottest topics in AI today. Having lagged behind other deep learning fields such as computer vision for years, NLP only recently gained mainstream popularity. Even though Google, Facebook, and OpenAI have open sourced large pretrained language models to make NLP easier, many organizations today still struggle with developing and productionizing NLP applications. This hands-on guide helps you learn the field quickly.", + "github": "nlpbook/nlpbook", + "cover": "https://i.imgur.com/6RxLBvf.jpg", + "url": "https://www.amazon.com/dp/149206257X", + "author": "Ankur A. Patel", + "author_links": { + "github": "aapatel09", + "website": "https://www.ankurapatel.io" + }, + "category": ["books"] + }, { "type": "education", "id": "learning-path-spacy", @@ -1290,6 +1725,16 @@ "author": "Aaron Kramer", "category": ["courses"] }, + { + "type": "education", + "id": "introduction-into-spacy-3", + "title": "Introduction to spaCy 3", + "slogan": "A free course for beginners by Dr. W.J.B. Mattingly", + "url": "http://spacy.pythonhumanities.com/", + "thumb": "https://spacy.pythonhumanities.com/_static/freecodecamp_small.jpg", + "author": "Dr. W.J.B. Mattingly", + "category": ["courses"] + }, { "type": "education", "id": "spacy-course", @@ -1690,11 +2135,9 @@ "github": "nikitakit/self-attentive-parser", "pip": "benepar", "code_example": [ - "import spacy", - "from benepar.spacy_plugin import BeneparComponent", - "", - "nlp = spacy.load('en')", - "nlp.add_pipe(BeneparComponent('benepar_en'))", + "import benepar, spacy", + "nlp = spacy.load('en_core_web_md')", + "nlp.add_pipe('benepar', config={'model': 'benepar_en3'})", "doc = nlp('The time for action is now. 
It is never too late to do something.')", "sent = list(doc.sents)[0]", "print(sent._.parse_string)", @@ -1825,14 +2268,17 @@ "description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html)", "github": "recognai/spacy-wordnet", "tags": ["wordnet", "synsets"], - "thumb": "https://i.imgur.com/3y2uPUv.jpg", + "thumb": "https://i.imgur.com/ud4C7cj.png", "code_example": [ "import spacy", "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ", "", "# Load an spacy model (supported models are \"es\" and \"en\") ", "nlp = spacy.load('en')", - "nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", + "# Spacy 3.x", + "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", + "# Spacy 2.x", + "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", "token = nlp('prices')[0]", "", "# wordnet object link spacy token with nltk wordnet interface by giving acces to", @@ -2100,6 +2546,39 @@ "github": "richardpaulhudson" } }, + { + "id": "coreferee", + "title": "Coreferee", + "slogan": "Coreference resolution for multiple languages", + "github": "msg-systems/coreferee", + "url": "https://github.com/msg-systems/coreferee", + "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, German and Polish. It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. Please note you will need to [install models](https://github.com/msg-systems/coreferee#getting-started) before running the code example.", + "pip": "coreferee", + "category": ["pipeline", "models", "standalone"], + "tags": ["coreference-resolution", "anaphora"], + "code_example": [ + "import coreferee, spacy", + "nlp = spacy.load('en_core_web_trf')", + "nlp.add_pipe('coreferee')", + "doc = nlp('Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.')", + "doc._.coref_chains.print()", + "# Output:", + "#", + "# 0: he(1), his(6), Peter(9), He(16), his(18)", + "# 1: work(7), it(14)", + "# 2: [He(16); wife(19)], they(21), They(26), they(31)", + "# 3: Spain(29), country(34)", + "#", + "print(doc._.coref_chains.resolve(doc[31]))", + "# Output:", + "#", + "# [Peter, wife]" + ], + "author": "Richard Paul Hudson", + "author_links": { + "github": "richardpaulhudson" + } + }, { "id": "spacy-transformers", "title": "spacy-transformers", @@ -2122,6 +2601,75 @@ "website": "https://explosion.ai" } }, + { + "id": "spacy-huggingface-hub", + "title": "spacy-huggingface-hub", + "slogan": "Push your spaCy pipelines to the Hugging Face Hub", + "description": "This package provides a CLI command for uploading any trained spaCy pipeline packaged with [`spacy package`](https://spacy.io/api/cli#package) to the [Hugging Face Hub](https://huggingface.co). 
It auto-generates all meta information for you, uploads a pretty README (requires spaCy v3.1+) and handles version control under the hood.", + "github": "explosion/spacy-huggingface-hub", + "thumb": "https://i.imgur.com/j6FO9O6.jpg", + "url": "https://github.com/explosion/spacy-huggingface-hub", + "pip": "spacy-huggingface-hub", + "category": ["pipeline", "models"], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, + { + "id": "spacy-clausie", + "title": "spacy-clausie", + "slogan": "Implementation of the ClausIE information extraction system for Python+spaCy", + "github": "mmxgn/spacy-clausie", + "url": "https://github.com/mmxgn/spacy-clausie", + "description": "ClausIE, a novel, clause-based approach to open information extraction, which extracts relations and their arguments from natural language text", + "category": ["pipeline", "scientific", "research"], + "code_example": [ + "import spacy", + "import claucy", + "", + "nlp = spacy.load(\"en\")", + "claucy.add_to_pipe(nlp)", + "", + "doc = nlp(\"AE died in Princeton in 1955.\")", + "", + "print(doc._.clauses)", + "# Output:", + "# ", + "", + "propositions = doc._.clauses[0].to_propositions(as_text=True)", + "", + "print(propositions)", + "# Output:", + "# [AE died in Princeton in 1955, AE died in 1955, AE died in Princeton" + ], + "author": "Emmanouil Theofanis Chourdakis", + "author_links": { + "github": "mmxgn" + } + }, + { + "id": "ipymarkup", + "slogan": "NER, syntax markup visualizations", + "description": "Collection of NLP visualizations for NER and syntax tree markup. Similar to [displaCy](https://explosion.ai/demos/displacy) and [displaCy ENT](https://explosion.ai/demos/displacy-ent).", + "github": "natasha/ipymarkup", + "image": "https://github.com/natasha/ipymarkup/blob/master/table.png?raw=true", + "pip":"pip install ipymarkup", + "code_example": [ + "from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup", + "", + "text = 'В мероприятии примут участие не только российские учёные, но и зарубежные исследователи, в том числе, Крис Хелмбрехт - управляющий директор и совладелец креативного агентства Kollektiv (Германия, США), Ннека Угбома - руководитель проекта Mushroom works (Великобритания), Гергей Ковач - политик и лидер субкультурной партии «Dog with two tails» (Венгрия), Георг Жено - немецкий режиссёр, один из создателей экспериментального театра «Театр.doc», Театра им. 
Йозефа Бойса (Германия).'", + "spans = [(102, 116, 'PER'), (186, 194, 'LOC'), (196, 199, 'LOC'), (202, 214, 'PER'), (254, 268, 'LOC'), (271, 283, 'PER'), (324, 342, 'ORG'), (345, 352, 'LOC'), (355, 365, 'PER'), (445, 455, 'ORG'), (456, 468, 'PER'), (470, 478, 'LOC')]", + "show_span_ascii_markup(text, spans)" + ], + "author": "Alexander Kukushkin", + "author_links": { + "github": "kuk" + }, + "category": ["visualizers"] + }, { "id": "negspacy", "title": "negspaCy", @@ -2139,8 +2687,7 @@ "from negspacy.negation import Negex", "", "nlp = spacy.load(\"en_core_web_sm\")", - "negex = Negex(nlp, ent_types=[\"PERSON','ORG\"])", - "nlp.add_pipe(negex, last=True)", + "nlp.add_pipe(\"negex\", config={\"ent_types\":[\"PERSON\",\"ORG\"]})", "", "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")", "for e in doc.ents:", @@ -2422,11 +2969,10 @@ "github": "thomasthiebaud/spacy-fastlang", "pip": "spacy_fastlang", "code_example": [ - "import spacy", - "from spacy_fastlang import LanguageDetector", + "import spacy", + "import spacy_fastlang", "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(LanguageDetector())", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"language_detector\")", "doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')", "", "assert doc._.language == 'en'", @@ -2482,10 +3028,10 @@ "pip": "pyate", "code_example": [ "import spacy", - "from pyate.term_extraction_pipeline import TermExtractionPipeline", + "import pyate", "", "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(TermExtractionPipeline())", + "nlp.add_pipe(\"combo_basic\") # or any of `basic`, `weirdness`, `term_extractor` or `cvalue`", "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", "", @@ -2514,14 +3060,14 @@ "id": "contextualSpellCheck", "title": "Contextual Spell Check", "slogan": "Contextual spell correction using BERT (bidirectional representations)", - "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. The idea of using BERT was to use the context when correcting NWE. In the coming days, I would like to focus on RWE and optimising the package by implementing it in cython.", + "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. 
The idea of using BERT was to use the context when correcting NWE.", "github": "R1j1t/contextualSpellCheck", "pip": "contextualSpellCheck", "code_example": [ "import spacy", "import contextualSpellCheck", "", - "nlp = spacy.load('en')", + "nlp = spacy.load('en_core_web_sm')", "contextualSpellCheck.add_to_pipe(nlp)", "doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')", "", @@ -2574,14 +3120,14 @@ "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", "pip": "cov-bsv", "code_example": [ - "import cov_bsv", - "", - "nlp = cov_bsv.load()", - "doc = nlp('Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected')", - "", - "print(doc.ents)", - "print(doc._.cov_classification)", - "cov_bsv.visualize_doc(doc)" + "import cov_bsv", + "", + "nlp = cov_bsv.load()", + "doc = nlp('Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected')", + "", + "print(doc.ents)", + "print(doc._.cov_classification)", + "cov_bsv.visualize_doc(doc)" ], "category": ["pipeline", "standalone", "biomedical", "scientific"], "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], @@ -2599,18 +3145,18 @@ "description": "A toolkit for clinical NLP with spaCy. Features include sentence splitting, section detection, and asserting negation, family history, and uncertainty.", "pip": "medspacy", "code_example": [ - "import medspacy", - "from medspacy.ner import TargetRule", - "", - "nlp = medspacy.load()", - "print(nlp.pipe_names)", - "", - "nlp.get_pipe('target_matcher').add([TargetRule('stroke', 'CONDITION'), TargetRule('diabetes', 'CONDITION'), TargetRule('pna', 'CONDITION')])", - "doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')", - "", - "for ent in doc.ents:", - " print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)", - "medspacy.visualization.visualize_ent(doc)" + "import medspacy", + "from medspacy.ner import TargetRule", + "", + "nlp = medspacy.load()", + "print(nlp.pipe_names)", + "", + "nlp.get_pipe('target_matcher').add([TargetRule('stroke', 'CONDITION'), TargetRule('diabetes', 'CONDITION'), TargetRule('pna', 'CONDITION')])", + "doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')", + "", + "for ent in doc.ents:", + " print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)", + "medspacy.visualization.visualize_ent(doc)" ], "category": ["biomedical", "scientific", "research"], "tags": ["clinical"], @@ -2619,14 +3165,14 @@ "github": "medspacy" } }, - { + { "id": "rita-dsl", "title": "RITA DSL", "slogan": "Domain Specific Language for creating language rules", "github": "zaibacu/rita-dsl", "description": "A Domain Specific Language (DSL) for building language patterns. 
These can be later compiled into spaCy patterns, pure regex, or any other format", "pip": "rita-dsl", - "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png", + "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png", "code_language": "python", "code_example": [ "import spacy", @@ -2726,15 +3272,253 @@ "{", " var lexeme = doc.Vocab[word.Text];", " Console.WriteLine($@\"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}\");", - "}" - ], + "}" + ], "code_language": "csharp", "author": "Antonio Miras", "author_links": { "github": "AMArostegui" }, "category": ["nonpython"] - } + }, + { + "id": "ruts", + "title": "ruTS", + "slogan": "A library for statistics extraction from texts in Russian", + "description": "The library allows extracting the following statistics from a text: basic statistics, readability metrics, lexical diversity metrics, morphological statistics", + "github": "SergeyShk/ruTS", + "pip": "ruts", + "code_example": [ + "import spacy", + "import ruts", + "", + "nlp = spacy.load('ru_core_news_sm')", + "nlp.add_pipe('basic', last=True)", + "doc = nlp('мама мыла раму')", + "doc._.basic.get_stats()" + ], + "code_language": "python", + "thumb": "https://habrastorage.org/webt/6z/le/fz/6zlefzjavzoqw_wymz7v3pwgfp4.png", + "image": "https://clipartart.com/images/free-tree-roots-clipart-black-and-white-2.png", + "author": "Sergey Shkarin", + "author_links": { + "twitter": "shk_sergey", + "github": "SergeyShk" + }, + "category": ["pipeline", "standalone"], + "tags": ["Text Analytics", "Russian"] + }, + { + "id": "trunajod", + "title": "TRUNAJOD", + "slogan": "A text complexity library for text analysis built on spaCy", + "description": "With all the basic NLP capabilities provided by spaCy (dependency parsing, POS tagging, tokenizing), `TRUNAJOD` focuses on extracting measurements from texts that might be interesting for different applications and use cases.", + "github": "dpalmasan/TRUNAJOD2.0", + "pip": "trunajod", + "code_example": [ + "import spacy", + "from TRUNAJOD.entity_grid import EntityGrid", + "", + "nlp = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])", + "example_text = (", + " 'El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas'", + " 'sobre el universo, su origen y su funcionamiento. No es sorprendente que '", + " 'todas las civilizaciones y culturas hayan formado sus propias '", + " 'cosmologías. Unas relatan, por ejemplo, que el universo ha'", + " 'sido siempre tal como es, con ciclos que inmutablemente se repiten; '", + " 'otras explican que este universo ha tenido un principio, '", + " 'que ha aparecido por obra creadora de una divinidad.'", + ")", + "doc = nlp(example_text)", + "egrid = EntityGrid(doc)", + "print(egrid.get_egrid())" + ], + "code_language": "python", + "thumb": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_thumb.png", + "image": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_logo.png", + "author": "Diego Palma", + "author_links": { + "github": "dpalmasan" + }, + "category": ["research", "standalone", "scientific"], + "tags": ["Text Analytics", "Coherence", "Cohesion"] + }, + { + "id": "hmrb", + "title": "Hammurabi", + "slogan": "Python Rule Processing Engine 🏺", + "description": "Hammurabi works as a rule engine to parse input using a defined set of rules. 
It uses a simple and readable syntax to define complex rules to handle phrase matching. The syntax supports nested logical statements, regular expressions, reusable or side-loaded variables and match triggered callback functions to modularize your rules. The latest version works with both spaCy 2.X and 3.X. For more information check the documentation on [ReadTheDocs](https://hmrb.readthedocs.io/en/latest/).", + "github": "babylonhealth/hmrb", + "pip": "hmrb", + "code_example": [ + "import spacy", + "from hmrb.core import SpacyCore", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "sentences = \"I love gorillas. Peter loves gorillas. Jane loves Tarzan.\"", + "", + "def conj_be(subj: str) -> str:", + " if subj == \"I\":", + " return \"am\"", + " elif subj == \"you\":", + " return \"are\"", + " else:", + " return \"is\"", + "", + "@spacy.registry.callbacks(\"gorilla_callback\")", + "def gorilla_clb(seq: list, span: slice, data: dict) -> None:", + " subj = seq[span.start].text", + " be = conj_be(subj)", + " print(f\"{subj} {be} a gorilla person.\")", + "@spacy.registry.callbacks(\"lover_callback\")", + "def lover_clb(seq: list, span: slice, data: dict) -> None:", + " print(f\"{seq[span][-1].text} is a love interest of {seq[span.start].text}.\")", + "", + "grammar = \"\"\"", + " Law:", + " - callback: \"loves_gorilla\"", + " (", + " ((pos: \"PROPN\") or (pos: \"PRON\"))", + " (lemma: \"love\")", + " (lemma: \"gorilla\")", + " )", + " Law:", + " - callback: \"loves_someone\"", + " (", + " (pos: \"PROPN\")", + " (lower: \"loves\")", + " (pos: \"PROPN\")", + " )", + "\"\"\"", + "", + "@spacy.registry.augmenters(\"jsonify_span\")", + "def jsonify_span(span):", + " return [{\"lemma\": token.lemma_, \"pos\": token.pos_, \"lower\": token.lower_} for token in span]", + "", + "conf = {", + " \"rules\": grammar,", + " \"callbacks\": {", + " \"loves_gorilla\": \"callbacks.gorilla_callback\",", + " \"loves_someone\": \"callbacks.lover_callback\",", + " },", + " \"map_doc\": \"augmenters.jsonify_span\",", + " \"sort_length\": True,", + "}", + "", + "nlp.add_pipe(\"hmrb\", config=conf)", + "nlp(sentences)" + ], + "code_language": "python", + "thumb": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "image": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "author": "Kristian Boda", + "author_links": { + "github": "bodak", + "twitter": "bodak", + "website": "https://github.com/babylonhealth/" + }, + "category": ["pipeline", "standalone", "scientific", "biomedical"], + "tags": ["babylonhealth", "rule-engine", "matcher"] + }, + { + "id": "forte", + "title": "Forte", + "slogan": "Forte is a toolkit for building Natural Language Processing pipelines, featuring cross-task interaction, adaptable data-model interfaces and composable pipelines.", + "description": "Forte provides a platform to assemble state-of-the-art NLP and ML technologies in a highly-composable fashion, including a wide spectrum of tasks ranging from Information Retrieval, Natural Language Understanding to Natural Language Generation.", + "github": "asyml/forte", + "pip": "forte.spacy stave torch", + "code_example": [ + "from fortex.spacy import SpacyProcessor", + "from forte.processors.stave import StaveProcessor", + "from forte import Pipeline", + "from forte.data.readers import StringReader", + "", + "pipeline = Pipeline()", + "pipeline.set_reader(StringReader())", + "pipeline.add(SpacyProcessor())", + 
"pipeline.add(StaveProcessor())", + "pipeline.run('Running SpaCy with Forte!')" + ], + "code_language": "python", + "url": "https://medium.com/casl-project/forte-building-modular-and-re-purposable-nlp-pipelines-cf5b5c5abbe9", + "thumb": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/forte_graphic.png", + "image": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/logo_h.png", + "author": "Petuum", + "author_links": { + "twitter": "PetuumInc", + "github": "asyml", + "website": "https://petuum.com" + }, + "category": ["pipeline", "standalone"], + "tags": ["pipeline"] + }, + { + "id": "spacy-api-docker-v3", + "slogan": "spaCy v3 REST API, wrapped in a Docker container", + "github": "bbieniek/spacy-api-docker", + "url": "https://hub.docker.com/r/bbieniek/spacyapi/", + "thumb": "https://i.imgur.com/NRnDKyj.jpg", + "code_example": [ + "version: '3'", + "", + "services:", + " spacyapi:", + " image: bbieniek/spacyapi:en_v3", + " ports:", + " - \"127.0.0.1:8080:80\"", + " restart: always" + ], + "code_language": "docker", + "author": "Baltazar Bieniek", + "author_links": { + "github": "bbieniek" + }, + "category": ["apis"] + }, + { + "id": "phruzz_matcher", + "title": "phruzz-matcher", + "slogan": "Phrase matcher using RapidFuzz", + "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.", + "github": "mjvallone/phruzz-matcher", + "pip": "phruzz_matcher", + "code_example": [ + "import spacy", + "from spacy.language import Language", + "from phruzz_matcher.phrase_matcher import PhruzzMatcher", + "", + "famous_people = [", + " \"Brad Pitt\",", + " \"Demi Moore\",", + " \"Bruce Willis\",", + " \"Jim Carrey\",", + "]", + "", + "@Language.factory(\"phrase_matcher\")", + "def phrase_matcher(nlp: Language, name: str):", + " return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)", + "", + "nlp = spacy.blank('es')", + "nlp.add_pipe(\"phrase_matcher\")", + "", + "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")", + "print(f\"doc.ents: {doc.ents}\")", + "", + "#OUTPUT", + "#doc.ents: (brad pit, Demi Moore)" + ], + "thumb": "https://avatars.githubusercontent.com/u/961296?v=4", + "image": "", + "code_language": "python", + "author": "Martin Vallone", + "author_links": { + "github": "mjvallone", + "twitter": "vallotin", + "website": "https://fiqus.coop/" + }, + "category": ["pipeline", "research", "standalone"], + "tags": ["spacy", "python", "nlp", "ner"] + } ], "categories": [ @@ -2766,6 +3550,11 @@ "title": "Scientific", "description": "Frameworks and utilities for scientific text processing" }, + { + "id": "biomedical", + "title": "Biomedical", + "description": "Frameworks and utilities for processing biomedical text" + }, { "id": "visualizers", "title": "Visualizers", diff --git a/website/package-lock.json b/website/package-lock.json index d8444c2b2..11b1e4899 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -15069,6 +15069,11 @@ "parse-url": "^5.0.0" } }, + "github-buttons": { + "version": "2.14.2", + "resolved": "https://registry.npmjs.org/github-buttons/-/github-buttons-2.14.2.tgz", + "integrity": "sha512-DMakrcFRdojVAndkKYVDTHF3Ym09OoWia//IQ7B/MVxC+iQ2DenYfD7IR69ZZ9awM8PNS/9wthr4IyDhkFJ4mg==" + }, "github-from-package": { "version": "0.0.0", "resolved": 
"https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", @@ -22693,6 +22698,14 @@ "resolved": "https://registry.npmjs.org/react-error-overlay/-/react-error-overlay-3.0.0.tgz", "integrity": "sha512-XzgvowFrwDo6TWcpJ/WTiarb9UI6lhA4PMzS7n1joK3sHfBBBOQHUc0U4u57D6DWO9vHv6lVSWx2Q/Ymfyv4hw==" }, + "react-github-btn": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/react-github-btn/-/react-github-btn-1.2.0.tgz", + "integrity": "sha512-/b2TGTeek5Ky+KtuP5BxOaXgb1FGhbwgZNI6rkwkGk7+xtCtsNMkdchOcCnC3qU1JGTWPKzYZWpPBIouVhXAoQ==", + "requires": { + "github-buttons": "^2.8.0" + } + }, "react-helmet": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/react-helmet/-/react-helmet-5.2.0.tgz", diff --git a/website/package.json b/website/package.json index def94a9c2..95336a539 100644 --- a/website/package.json +++ b/website/package.json @@ -49,6 +49,7 @@ "prop-types": "^15.7.2", "react": "^16.8.2", "react-dom": "^16.8.2", + "react-github-btn": "^1.2.0", "react-helmet": "^5.2.0", "react-intersection-observer": "^8.0.1", "remark-react": "^5.0.1" @@ -57,6 +58,7 @@ "build": "npm run python:install && npm run python:setup && gatsby build", "dev": "npm run python:setup && gatsby develop", "dev:nightly": "BRANCH=nightly.spacy.io npm run dev", + "dev:legacy": "SPACY_LEGACY=1 npm run dev", "lint": "eslint **", "clear": "rm -rf .cache", "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"", diff --git a/website/src/components/code.js b/website/src/components/code.js index 336c7dc80..6e9f0c22e 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -14,7 +14,7 @@ import GitHubCode from './github' import classes from '../styles/code.module.sass' const WRAP_THRESHOLD = 30 -const CLI_GROUPS = ['init', 'debug', 'project', 'ray'] +const CLI_GROUPS = ['init', 'debug', 'project', 'ray', 'huggingface-hub'] export default props => (
@@ -121,7 +121,6 @@ function parseArgs(raw) {
 }
 
 function convertLine(line, i) {
-    console.log(line, i)
     const cliRegex = /^(\$ )?python -m spacy/
     if (cliRegex.test(line)) {
         const text = line.replace(cliRegex, '')
diff --git a/website/src/components/embed.js b/website/src/components/embed.js
index 90a640fe2..8d82bfaae 100644
--- a/website/src/components/embed.js
+++ b/website/src/components/embed.js
@@ -8,12 +8,12 @@ import { markdownToReact } from './util'
 
 import classes from '../styles/embed.module.sass'
 
-const YouTube = ({ id, ratio = '16x9' }) => {
-    const embedClassNames = classNames(classes.root, classes.responsive, {
+const YouTube = ({ id, ratio = '16x9', className }) => {
+    const embedClassNames = classNames(classes.root, classes.responsive, className, {
         [classes.ratio16x9]: ratio === '16x9',
         [classes.ratio4x3]: ratio === '4x3',
     })
-    const url = `https://www.youtube.com/embed/${id}`
+    const url = `https://www.youtube-nocookie.com/embed/${id}`
     return (