diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 09de1cd05..fce1a1064 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,8 +1,11 @@
blank_issues_enabled: false
contact_links:
+ - name: ⚠️ Python 3.10 Support
+ url: https://github.com/explosion/spaCy/discussions/9418
+ about: Python 3.10 wheels haven't been released yet; see the link for details.
- name: 🗯 Discussions Forum
url: https://github.com/explosion/spaCy/discussions
- about: Usage questions, general discussion and anything else that isn't a bug report.
+ about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
- name: 📖 spaCy FAQ & Troubleshooting
url: https://github.com/explosion/spaCy/discussions/8226
about: Before you post, check out the FAQ for answers to common community questions!
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index ec11b78bd..b48b2c51b 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -14,6 +14,6 @@ or new feature, or a change to the documentation? -->
## Checklist
-- [ ] I have submitted the spaCy Contributor Agreement.
+- [ ] I confirm that I have the right to submit this contribution under the project's MIT license.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 50e81799e..80c88b0b8 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -25,6 +25,10 @@ steps:
${{ parameters.prefix }} python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
+ - script: python -m mypy spacy
+ displayName: 'Run mypy'
+ condition: ne(variables['python_version'], '3.10')
+
- task: DeleteFiles@1
inputs:
contents: "spacy"
@@ -100,3 +104,14 @@ steps:
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
+
+ - script: |
+ python .github/validate_universe_json.py website/meta/universe.json
+ displayName: 'Test website/meta/universe.json'
+ condition: eq(variables['python_version'], '3.8')
+
+ - script: |
+ ${{ parameters.prefix }} python -m pip install thinc-apple-ops
+ ${{ parameters.prefix }} python -m pytest --pyargs spacy
+ displayName: "Run CPU tests with thinc-apple-ops"
+ condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python_version'], '3.9'))
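
Note on the new `Test website/meta/universe.json` step: the validator script itself is not included in this patch, so the following is only a minimal, hypothetical sketch of what a check like `.github/validate_universe_json.py` could do, assuming `universe.json` holds a `resources` list of project entries (the structure and the required keys here are assumptions, not taken from this diff):

```python
# Hypothetical sketch -- the real .github/validate_universe_json.py is not part
# of this diff; the assumed layout is {"resources": [{"id": ...}, ...]}.
import json
import sys
from pathlib import Path


def validate_universe_json(path: str) -> None:
    # json.loads raises a ValueError subclass on malformed JSON, which is
    # already enough to fail the CI step on its own.
    data = json.loads(Path(path).read_text(encoding="utf8"))
    for entry in data.get("resources", []):
        if "id" not in entry:
            raise ValueError(f"Universe entry is missing an 'id': {entry}")


if __name__ == "__main__":
    validate_universe_json(sys.argv[1])
    print("universe.json parsed and passed the basic checks")
```

In CI this would be invoked as in the step above: `python .github/validate_universe_json.py website/meta/universe.json`.
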
diff --git a/.github/contributors/Jette16.md b/.github/contributors/Jette16.md
new file mode 100644
index 000000000..c064f1d4f
--- /dev/null
+++ b/.github/contributors/Jette16.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Henriette Behr |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 23.09.2021 |
+| GitHub username | Jette16 |
+| Website (optional) | |
diff --git a/.github/contributors/KennethEnevoldsen.md b/.github/contributors/KennethEnevoldsen.md
new file mode 100644
index 000000000..0bbb28d61
--- /dev/null
+++ b/.github/contributors/KennethEnevoldsen.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [X] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------------- |
+| Name | Kenneth Enevoldsen |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021-07-13 |
+| GitHub username | KennethEnevoldsen |
+| Website (optional) | www.kennethenevoldsen.com |
diff --git a/.github/contributors/Pantalaymon.md b/.github/contributors/Pantalaymon.md
new file mode 100644
index 000000000..f017f2947
--- /dev/null
+++ b/.github/contributors/Pantalaymon.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name                           | Valentin-Gabriel Soumah |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021-11-23 |
+| GitHub username | Pantalaymon |
+| Website (optional) | |
diff --git a/.github/contributors/avi197.md b/.github/contributors/avi197.md
new file mode 100644
index 000000000..903d7db4c
--- /dev/null
+++ b/.github/contributors/avi197.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Son Pham |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 09/10/2021 |
+| GitHub username | Avi197 |
+| Website (optional) | |
diff --git a/.github/contributors/bbieniek.md b/.github/contributors/bbieniek.md
new file mode 100644
index 000000000..4050946aa
--- /dev/null
+++ b/.github/contributors/bbieniek.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [X] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Baltazar Bieniek |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021.08.19 |
+| GitHub username | bbieniek |
+| Website (optional) | https://baltazar.bieniek.org.pl/ |
\ No newline at end of file
diff --git a/.github/contributors/connorbrinton.md b/.github/contributors/connorbrinton.md
new file mode 100644
index 000000000..25d03b494
--- /dev/null
+++ b/.github/contributors/connorbrinton.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Connor Brinton |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | July 20th, 2021 |
+| GitHub username | connorbrinton |
+| Website (optional) | |
diff --git a/.github/contributors/ezorita.md b/.github/contributors/ezorita.md
new file mode 100644
index 000000000..e5f3f5283
--- /dev/null
+++ b/.github/contributors/ezorita.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Eduard Zorita |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 06/17/2021 |
+| GitHub username | ezorita |
+| Website (optional) | |
diff --git a/.github/contributors/fgaim.md b/.github/contributors/fgaim.md
new file mode 100644
index 000000000..1c3b409b4
--- /dev/null
+++ b/.github/contributors/fgaim.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Fitsum Gaim |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021-08-07 |
+| GitHub username | fgaim |
+| Website (optional) | |
diff --git a/.github/contributors/hlasse.md b/.github/contributors/hlasse.md
new file mode 100644
index 000000000..b64b3c6a6
--- /dev/null
+++ b/.github/contributors/hlasse.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [X] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------------- |
+| Name | Lasse Hansen |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021-08-11 |
+| GitHub username | HLasse |
+| Website (optional) | www.lassehansen.me |
diff --git a/.github/contributors/jmyerston.md b/.github/contributors/jmyerston.md
new file mode 100644
index 000000000..be5db5453
--- /dev/null
+++ b/.github/contributors/jmyerston.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+| ----------------------------- | ----------------------------------- |
+| Name | Jacobo Myerston |
+| Company name (if applicable) | University of California, San Diego |
+| Title or role (if applicable) | Academic |
+| Date | 07/05/2021 |
+| GitHub username | jmyerston |
+| Website (optional) | diogenet.ucsd.edu |
diff --git a/.github/contributors/mariosasko.md b/.github/contributors/mariosasko.md
new file mode 100644
index 000000000..1f5acc934
--- /dev/null
+++ b/.github/contributors/mariosasko.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [ ] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [x] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Mario Šaško |
+| Company name (if applicable) | TakeLab FER |
+| Title or role (if applicable) | R&D Intern |
+| Date | 2021-07-12 |
+| GitHub username | mariosasko |
+| Website (optional) | |
diff --git a/.github/contributors/nsorros.md b/.github/contributors/nsorros.md
new file mode 100644
index 000000000..a449c52e1
--- /dev/null
+++ b/.github/contributors/nsorros.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Nick Sorros |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2/8/2021 |
+| GitHub username | nsorros |
+| Website (optional) | |
diff --git a/.github/contributors/philipvollet.md b/.github/contributors/philipvollet.md
new file mode 100644
index 000000000..0bf58a701
--- /dev/null
+++ b/.github/contributors/philipvollet.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Philip Vollet |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 22.09.2021 |
+| GitHub username | philipvollet |
+| Website (optional) | |
diff --git a/.github/contributors/shigapov.md b/.github/contributors/shigapov.md
new file mode 100644
index 000000000..3c24c7982
--- /dev/null
+++ b/.github/contributors/shigapov.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ------------------------ |
+| Name | Renat Shigapov |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021-09-09 |
+| GitHub username | shigapov |
+| Website (optional) | |
diff --git a/.github/contributors/swfarnsworth.md b/.github/contributors/swfarnsworth.md
new file mode 100644
index 000000000..c289e6658
--- /dev/null
+++ b/.github/contributors/swfarnsworth.md
@@ -0,0 +1,88 @@
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Steele Farnsworth |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 13 August, 2021 |
+| GitHub username | swfarnsworth |
+| Website (optional) | |
+
diff --git a/.github/contributors/syrull.md b/.github/contributors/syrull.md
new file mode 100644
index 000000000..82cdade12
--- /dev/null
+++ b/.github/contributors/syrull.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Dimitar Ganev |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2021/8/2 |
+| GitHub username | syrull |
+| Website (optional) | |
diff --git a/.github/contributors/thomashacker.md b/.github/contributors/thomashacker.md
new file mode 100644
index 000000000..d88727dc8
--- /dev/null
+++ b/.github/contributors/thomashacker.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Edward Schmuhl |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 09.07.2021 |
+| GitHub username | thomashacker |
+| Website (optional) | |
diff --git a/.github/lock.yml b/.github/lock.yml
deleted file mode 100644
index 593e88397..000000000
--- a/.github/lock.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Configuration for lock-threads - https://github.com/dessant/lock-threads
-
-# Number of days of inactivity before a closed issue or pull request is locked
-daysUntilLock: 30
-
-# Issues and pull requests with these labels will not be locked. Set to `[]` to disable
-exemptLabels: []
-
-# Label to add before locking, such as `outdated`. Set to `false` to disable
-lockLabel: false
-
-# Comment to post before locking. Set to `false` to disable
-lockComment: >
- This thread has been automatically locked since there has not been
- any recent activity after it was closed. Please open a new issue for
- related bugs.
-
-# Limit to only `issues` or `pulls`
-only: issues
diff --git a/.github/validate_universe_json.py b/.github/validate_universe_json.py
new file mode 100644
index 000000000..b96b7b347
--- /dev/null
+++ b/.github/validate_universe_json.py
@@ -0,0 +1,19 @@
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def validate_json(document):
+ universe_file = Path(document)
+ with universe_file.open() as f:
+ universe_data = json.load(f)
+ for entry in universe_data["resources"]:
+ if "github" in entry:
+ assert not re.match(
+ r"^(http:)|^(https:)", entry["github"]
+ ), "Github field should be user/repo, not a url"
+
+
+if __name__ == "__main__":
+ validate_json(str(sys.argv[1]))
diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 9f4f82ae5..8d0282650 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -9,7 +9,7 @@ on:
jobs:
autoblack:
- if: github.repository_owner = 'explosion'
+ if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml
new file mode 100644
index 000000000..e29ce8fe8
--- /dev/null
+++ b/.github/workflows/explosionbot.yml
@@ -0,0 +1,27 @@
+name: Explosion Bot
+
+on:
+ issue_comment:
+ types:
+ - created
+ - edited
+
+jobs:
+ explosion-bot:
+ runs-on: ubuntu-18.04
+ steps:
+ - name: Dump GitHub context
+ env:
+ GITHUB_CONTEXT: ${{ toJson(github) }}
+ run: echo "$GITHUB_CONTEXT"
+ - uses: actions/checkout@v1
+ - uses: actions/setup-python@v1
+ - name: Install and run explosion-bot
+ run: |
+ pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
+ python -m explosionbot
+ env:
+ INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
+ INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
+ ENABLED_COMMANDS: "test_gpu,test_slow"
+ ALLOWED_TEAMS: "spaCy"
diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml
new file mode 100644
index 000000000..c9833cdba
--- /dev/null
+++ b/.github/workflows/lock.yml
@@ -0,0 +1,25 @@
+name: 'Lock Threads'
+
+on:
+ schedule:
+ - cron: '0 0 * * *' # check every day
+ workflow_dispatch:
+
+permissions:
+ issues: write
+
+concurrency:
+ group: lock
+
+jobs:
+ action:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: dessant/lock-threads@v3
+ with:
+ process-only: 'issues'
+ issue-inactive-days: '30'
+ issue-comment: >
+ This thread has been automatically locked since there
+ has not been any recent activity after it was closed.
+ Please open a new issue for related bugs.
diff --git a/.gitignore b/.gitignore
index ac72f2bbf..60036a475 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ keys/
spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
+spacy/tests/universe/universe.json
# Website
website/.cache/
diff --git a/CITATION b/CITATION
deleted file mode 100644
index bdaa90677..000000000
--- a/CITATION
+++ /dev/null
@@ -1,8 +0,0 @@
-@software{spacy,
- author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
- title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
- year = 2020,
- publisher = {Zenodo},
- doi = {10.5281/zenodo.1212303},
- url = {https://doi.org/10.5281/zenodo.1212303}
-}
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 000000000..88c05b2a3
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,16 @@
+cff-version: 1.2.0
+preferred-citation:
+ type: article
+ message: "If you use spaCy, please cite it as below."
+ authors:
+ - family-names: "Honnibal"
+ given-names: "Matthew"
+ - family-names: "Montani"
+ given-names: "Ines"
+ - family-names: "Van Landeghem"
+ given-names: "Sofie"
+ - family-names: "Boyd"
+ given-names: "Adriane"
+ title: "spaCy: Industrial-strength Natural Language Processing in Python"
+ doi: "10.5281/zenodo.1212303"
+ year: 2020
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3a94b9b67..9a7d0744a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -140,29 +140,28 @@ Changes to `.py` files will be effective immediately.
📖 **For more details and instructions, see the documentation on [compiling spaCy from source](https://spacy.io/usage/#source) and the [quickstart widget](https://spacy.io/usage/#section-quickstart) to get the right commands for your platform and Python version.**
-### Contributor agreement
-
-If you've made a contribution to spaCy, you should fill in the
-[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
-your contribution can be used across the project. If you agree to be bound by
-the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
-and include it with your pull request, or submit it separately to
-[`.github/contributors/`](/.github/contributors). The name of the file should be
-your GitHub username, with the extension `.md`. For example, the user
-example_user would create the file `.github/contributors/example_user.md`.
-
### Fixing bugs
When fixing a bug, first create an
-[issue](https://github.com/explosion/spaCy/issues) if one does not already exist.
-The description text can be very short – we don't want to make this too
+[issue](https://github.com/explosion/spaCy/issues) if one does not already
+exist. The description text can be very short – we don't want to make this too
bureaucratic.
-Next, create a test file named `test_issue[ISSUE NUMBER].py` in the
-[`spacy/tests/regression`](spacy/tests/regression) folder. Test for the bug
-you're fixing, and make sure the test fails. Next, add and commit your test file
-referencing the issue number in the commit message. Finally, fix the bug, make
-sure your test passes and reference the issue in your commit message.
+Next, add a test to the relevant file in the
+[`spacy/tests`](spacy/tests) folder. Then add a [pytest
+mark](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers),
+`@pytest.mark.issue(NUMBER)`, to reference the issue number.
+
+```python
+# Assume you're fixing Issue #1234
+@pytest.mark.issue(1234)
+def test_issue1234():
+ ...
+```
+
+Test for the bug you're fixing, and make sure the test fails. Next, add and
+commit your test file. Finally, fix the bug, make sure your test passes and
+reference the issue number in your pull request description.
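+
+While iterating on the fix, it can be handy to run only your new test. One way to
+do this (assuming the test function is named as in the snippet above) is to filter
+by name with `-k`:
+
+```bash
+python -m pytest spacy/tests -k test_issue1234
+```
+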
📖 **For more information on how to add tests, check out the [tests README](spacy/tests/README.md).**
@@ -185,7 +184,6 @@ Each time a `git commit` is initiated, `black` and `flake8` will run automatical
In case of error, or when `black` modified a file, the modified file needs to be `git add` once again and a new
`git commit` has to be issued.
-
### Code formatting
[`black`](https://github.com/ambv/black) is an opinionated Python code
@@ -414,14 +412,7 @@ all test files and test functions need to be prefixed with `test_`.
When adding tests, make sure to use descriptive names, keep the code short and
concise and only test for one behavior at a time. Try to `parametrize` test
cases wherever possible, use our pre-defined fixtures for spaCy components and
-avoid unnecessary imports.
-
-Extensive tests that take a long time should be marked with `@pytest.mark.slow`.
-Tests that require the model to be loaded should be marked with
-`@pytest.mark.models`. Loading the models is expensive and not necessary if
-you're not actually testing the model performance. If all you need is a `Doc`
-object with annotations like heads, POS tags or the dependency parse, you can
-use the `Doc` constructor to construct it manually.
+avoid unnecessary imports. Extensive tests that take a long time should be marked with `@pytest.mark.slow`.
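+
+As a rough sketch, a parametrized test that uses one of the pre-defined tokenizer
+fixtures might look like this (the example texts and token counts are made up for
+illustration):
+
+```python
+import pytest
+
+
+@pytest.mark.parametrize("text,num_tokens", [("Hello world", 2), ("spaCy is great!", 4)])
+def test_en_tokenizer_token_count(en_tokenizer, text, num_tokens):
+    # en_tokenizer is one of the shared fixtures defined in spacy/tests/conftest.py
+    doc = en_tokenizer(text)
+    assert len(doc) == num_tokens
+```
+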
📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
@@ -438,7 +429,7 @@ simply click on the "Suggest edits" button at the bottom of a page.
## Publishing spaCy extensions and plugins
We're very excited about all the new possibilities for **community extensions**
-and plugins in spaCy v2.0, and we can't wait to see what you build with it!
+and plugins in spaCy v3.0, and we can't wait to see what you build with it!
- An extension or plugin should add substantial functionality, be
**well-documented** and **open-source**. It should be available for users to download
diff --git a/LICENSE b/LICENSE
index 86f501b92..d76864579 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
The MIT License (MIT)
-Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/MANIFEST.in b/MANIFEST.in
index 99fc174bd..b7826e456 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,11 +1,8 @@
-recursive-include include *.h
-recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
+recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
-recursive-exclude spacy/lang *.json
-recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json *.yml
+recursive-include spacy/cli *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp
diff --git a/README.md b/README.md
index 61d5449a4..57d76fb45 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
-💫 **Version 3.0 out now!**
+💫 **Version 3.2 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index ac80b8a10..71a793911 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -12,17 +12,18 @@ trigger:
- "website/*"
- "*.md"
pr:
- paths:
+ paths:
exclude:
- - "website/*"
- "*.md"
+ - "website/docs/*"
+ - "website/src/*"
jobs:
# Perform basic checks for most important errors (syntax etc.) Uses the config
# defined in .flake8 and overwrites the selected codes.
- job: "Validate"
pool:
- vmImage: "ubuntu-18.04"
+ vmImage: "ubuntu-latest"
steps:
- task: UsePythonVersion@0
inputs:
@@ -38,41 +39,50 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
- imageName: "ubuntu-18.04"
+ imageName: "ubuntu-latest"
python.version: "3.6"
# Python36Windows:
- # imageName: "vs2017-win2016"
+ # imageName: "windows-latest"
# python.version: "3.6"
# Python36Mac:
- # imageName: "macos-10.14"
+ # imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
- # imageName: "ubuntu-18.04"
+ # imageName: "ubuntu-latest"
# python.version: "3.7"
Python37Windows:
- imageName: "vs2017-win2016"
+ imageName: "windows-latest"
python.version: "3.7"
# Python37Mac:
- # imageName: "macos-10.14"
+ # imageName: "macos-latest"
# python.version: "3.7"
# Python38Linux:
- # imageName: "ubuntu-18.04"
+ # imageName: "ubuntu-latest"
# python.version: "3.8"
# Python38Windows:
- # imageName: "vs2017-win2016"
+ # imageName: "windows-latest"
# python.version: "3.8"
Python38Mac:
- imageName: "macos-10.14"
+ imageName: "macos-latest"
python.version: "3.8"
Python39Linux:
- imageName: "ubuntu-18.04"
- python.version: "3.9"
- Python39Windows:
- imageName: "vs2017-win2016"
- python.version: "3.9"
- Python39Mac:
- imageName: "macos-10.14"
+ imageName: "ubuntu-latest"
python.version: "3.9"
+ # Python39Windows:
+ # imageName: "windows-latest"
+ # python.version: "3.9"
+ # Python39Mac:
+ # imageName: "macos-latest"
+ # python.version: "3.9"
+ Python310Linux:
+ imageName: "ubuntu-latest"
+ python.version: "3.10"
+ Python310Windows:
+ imageName: "windows-latest"
+ python.version: "3.10"
+ Python310Mac:
+ imageName: "macos-latest"
+ python.version: "3.10"
maxParallel: 4
pool:
vmImage: $(imageName)
diff --git a/build-constraints.txt b/build-constraints.txt
index 23e660096..cf5fe3284 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -2,4 +2,5 @@
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md
new file mode 100644
index 000000000..eba466c46
--- /dev/null
+++ b/extra/DEVELOPER_DOCS/Code Conventions.md
@@ -0,0 +1,546 @@
+# Code Conventions
+
+For a general overview of code conventions for contributors, see the [section in the contributing guide](https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md#code-conventions).
+
+1. [Code compatibility](#code-compatibility)
+2. [Auto-formatting](#auto-formatting)
+3. [Linting](#linting)
+4. [Documenting code](#documenting-code)
+5. [Type hints](#type-hints)
+6. [Structuring logic](#structuring-logic)
+7. [Naming](#naming)
+8. [Error handling](#error-handling)
+9. [Writing tests](#writing-tests)
+
+## Code compatibility
+
+spaCy supports **Python 3.6** and above, so all code should be written to be compatible with 3.6. This means that there are certain new syntax features that we won't be able to use until we drop support for older Python versions. Some newer features have backports that we can conditionally install for older versions, although we only want to do this if it's absolutely necessary. If we need to use conditional imports based on the Python version or other custom compatibility-specific helpers, those should live in `compat.py`.
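+
+As a purely illustrative sketch (not code copied from spaCy itself), a conditional
+import for a feature that only landed in a newer Python version could look like
+this:
+
+```python
+import sys
+
+if sys.version_info >= (3, 8):
+    from typing import Literal  # part of the standard library from Python 3.8
+else:
+    from typing_extensions import Literal  # backport for Python 3.6/3.7
+```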
+
+## Auto-formatting
+
+spaCy uses `black` for auto-formatting (which is also available as a pre-commit hook). It's recommended to configure your editor to perform this automatically, either triggered manually or whenever you save a file. We also have a GitHub action that regularly formats the code base and submits a PR if changes are available. Note that auto-formatting is currently only available for `.py` (Python) files, not for `.pyx` (Cython).
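+
+If you want to run it by hand, a typical invocation (assuming `black` is installed
+in your environment) is:
+
+```bash
+python -m black spacy
+```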
+
+As a rule of thumb, if the auto-formatting produces output that looks messy, it can often indicate that there's a better way to structure the code to make it more concise.
+
+```diff
+- range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")(
+- min_size=1, max_size=3
+- )
++ suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1")
++ range_suggester = suggester_factory(min_size=1, max_size=3)
+```
+
+In some specific cases, e.g. in the tests, it can make sense to disable auto-formatting for a specific block. You can do this by wrapping the code in `# fmt: off` and `# fmt: on`:
+
+```diff
++ # fmt: off
+text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
+deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
+ "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
+ "poss", "nsubj", "ccomp", "punct"]
++ # fmt: on
+```
+
+## Linting
+
+[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code style. It scans one or more files and outputs errors and warnings. This feedback can help you stick to general standards and conventions, and can be very useful for spotting potential mistakes and inconsistencies in your code. Code you write should be compatible with our flake8 rules and not cause any warnings.
+
+```bash
+flake8 spacy
+```
+
+The most common problems surfaced by linting are:
+
+- **Trailing or missing whitespace.** This is related to formatting and should be fixed automatically by running `black`.
+- **Unused imports.** Those should be removed if the imports aren't actually used. If they're required, e.g. to expose them so they can be imported from the given module, you can add a comment and `# noqa: F401` exception (see details below).
+- **Unused variables.** This can often indicate bugs, e.g. a variable that's declared and not correctly passed on or returned. To prevent ambiguity here, your code shouldn't contain unused variables. If you're unpacking a list of tuples and end up with variables you don't need, you can call them `_` to indicate that they're unused.
+- **Redefinition of function.** This can also indicate bugs, e.g. a copy-pasted function that you forgot to rename and that now replaces the original function.
+- **Repeated dictionary keys.** This either indicates a bug or unnecessary duplication.
+- **Comparison with `True`, `False`, `None`**. This is mostly a stylistic thing: when checking whether a value is `True`, `False` or `None`, you should be using `is` instead of `==`. For example, `if value is None`.
+
+### Ignoring linter rules for special cases
+
+To ignore a given line, you can add a comment like `# noqa: F401`, specifying the code of the error or warning we want to ignore. It's also possible to ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. In general, you should always **specify the code(s)** you want to ignore – otherwise, you may end up missing actual problems.
+
+```python
+# The imported class isn't used in this file, but imported here, so it can be
+# imported *from* here by another module.
+from .submodule import SomeClass # noqa: F401
+
+try:
+ do_something()
+except: # noqa: E722
+ # This bare except is justified, for some specific reason
+ do_something_else()
+```
+
+## Documenting code
+
+All functions and methods you write should be documented with a docstring inline. The docstring can contain a simple summary, and an overview of the arguments and their (simplified) types. Modern editors will show this information to users when they call the function or method in their code.
+
+If it's part of the public API and there's a documentation section available, we usually add the link as `DOCS:` at the end. This allows us to keep the docstrings simple and concise, while also providing additional information and examples if necessary.
+
+```python
+def has_pipe(self, name: str) -> bool:
+ """Check if a component name is present in the pipeline. Equivalent to
+ `name in nlp.pipe_names`.
+
+ name (str): Name of the component.
+ RETURNS (bool): Whether a component of the name exists in the pipeline.
+
+ DOCS: https://spacy.io/api/language#has_pipe
+ """
+ ...
+```
+
+We specifically chose this approach of maintaining the docstrings and API reference separately, instead of auto-generating the API docs from the docstrings like other packages do. We want to be able to provide extensive explanations and examples in the documentation and use our own custom markup for it that would otherwise clog up the docstrings. We also want to be able to update the documentation independently of the code base. It's slightly more work, but it's absolutely worth it in terms of user and developer experience.
+
+### Inline code comments
+
+We don't expect you to add inline comments for everything you're doing – this should be obvious from reading the code. If it's not, the first thing to check is whether your code can be improved to make it more explicit. That said, if your code includes complex logic or aspects that may be unintuitive at first glance (or even included a subtle bug that you ended up fixing), you should leave a quick comment that provides more context.
+
+```diff
+token_index = indices[value]
++ # Index describes Token.i of last token but Span indices are inclusive
+span = doc[prev_token_index:token_index + 1]
+```
+
+```diff
++ # To create the components we need to use the final interpolated config
++ # so all values are available (if component configs use variables).
++ # Later we replace the component config with the raw config again.
+interpolated = filled.interpolate() if not filled.is_interpolated else filled
+```
+
+Don't be shy about including comments for tricky parts that _you_ found hard to implement or get right – those may come in handy for the next person working on this code, or even future you!
+
+If your change implements a fix to a specific issue, it can often be helpful to include the issue number in the comment, especially if it's a relatively straightforward adjustment:
+
+```diff
++ # Ensure object is a Span, not a Doc (#1234)
+if isinstance(obj, Doc):
+ obj = obj[obj.start:obj.end]
+```
+
+### Including TODOs
+
+It's fine to include code comments that indicate future TODOs, using the `TODO:` prefix. Modern editors typically format this in a different color, so it's easy to spot. TODOs don't necessarily have to be things that are absolutely critical to fix right now – those should already be addressed in your pull request once it's ready for review. But they can include notes about potential future improvements.
+
+```diff
++ # TODO: this is currently pretty slow
+dir_checksum = hashlib.md5()
+for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
+ dir_checksum.update(sub_file.read_bytes())
+```
+
+If any of the TODOs you've added are important and should be fixed soon, you should add a task for this on Explosion's internal Ora board or an issue on the public issue tracker to make sure we don't forget to address it.
+
+## Type hints
+
+We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
+
+If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return type.
+
+```diff
+- def func(some_arg: dict) -> None:
++ def func(some_arg: Dict[str, Any]) -> None:
+ ...
+```
+
+```python
+def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
+ def callback(arg1: str, arg2: int) -> List[str]:
+ ...
+
+ return callback
+```
+
+For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
+
+```python
+def build_tagger_model(
+ tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
+) -> Model[List[Doc], List[Floats2d]]:
+ ...
+```
+
+If you need to use a type hint that refers to something later declared in the same module, or the class that a method belongs to, you can use a string value instead:
+
+```python
+class SomeClass:
+ def from_bytes(self, data: bytes) -> "SomeClass":
+ ...
+```
+
+In some cases, you won't be able to import a class from a different module to use it as a type hint because it'd cause circular imports. For instance, `spacy/util.py` includes various helper functions that return an instance of `Language`, but we can't import it, because `spacy/language.py` imports `util` itself. In this case, we can provide `"Language"` as a string and make the import conditional on `typing.TYPE_CHECKING` so it only runs when the code is evaluated by a type checker:
+
+```python
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from .language import Language
+
+def load_model(name: str) -> "Language":
+ ...
+```
+
+## Structuring logic
+
+### Positional and keyword arguments
+
+We generally try to avoid writing functions and methods with too many arguments, and use keyword-only arguments wherever possible. Python lets you define arguments as keyword-only by separating them with a `, *`. If you're writing functions with additional arguments that customize the behavior, you typically want to make those arguments keyword-only, so their names have to be provided explicitly.
+
+```diff
+- def do_something(name: str, validate: bool = False):
++ def do_something(name: str, *, validate: bool = False):
+ ...
+
+- do_something("some_name", True)
++ do_something("some_name", validate=True)
+```
+
+This makes the function calls easier to read, because it's immediately clear what the additional values mean. It also makes it easier to extend arguments or change their order later on, because you don't end up with any function calls that depend on a specific positional order.
+
+### Avoid mutable default arguments
+
+A common Python gotcha is [mutable default arguments](https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments): if your argument defines a mutable default value like `[]` or `{}` and the function then mutates it, the default value is created _once_ when the function is defined and the same object is mutated every time the function is called. This can be pretty unintuitive when you first encounter it. We therefore avoid writing logic that does this.
+
+If your arguments need to default to an empty list or dict, you can use the `SimpleFrozenList` and `SimpleFrozenDict` helpers provided by spaCy. They are simple frozen implementations that raise an error if they're mutated, which prevents bugs caused by logic that accidentally mutates default arguments.
+
+```diff
+- def to_bytes(self, *, exclude: List[str] = []):
++ def to_bytes(self, *, exclude: List[str] = SimpleFrozenList()):
+ ...
+```
+
+```diff
+def do_something(values: List[str] = SimpleFrozenList()):
+ if some_condition:
+- values.append("foo") # raises an error
++ values = [*values, "foo"]
+ return values
+```
+
+### Don't use `try`/`except` for control flow
+
+We strongly discourage using `try`/`except` blocks for anything that's not third-party error handling or error handling that we otherwise have little control over. There's almost always a way to anticipate the _actual_ problem and **check for it explicitly**, which makes the code easier to follow and understand, and prevents bugs:
+
+```diff
+- try:
+- token = doc[i]
+- except IndexError:
+- token = doc[-1]
+
++ if i < len(doc):
++ token = doc[i]
++ else:
++ token = doc[-1]
+```
+
+Even if you end up having to check for multiple conditions explicitly, this is still preferred over a catch-all `try`/`except`. It can be very helpful to think about the exact scenarios you need to cover, and what could go wrong at each step, which often leads to better code and fewer bugs. `try`/`except` blocks can also easily mask _other_ bugs and problems that raise the same errors you're catching, which is obviously bad.
+
+If you have to use `try`/`except`, make sure to only include what's **absolutely necessary** in the `try` block and define the exception(s) explicitly. Otherwise, you may end up masking very different exceptions caused by other bugs.
+
+```diff
+- try:
+- value1 = get_some_value()
+- value2 = get_some_other_value()
+- score = external_library.compute_some_score(value1, value2)
+- except:
+- score = 0.0
+
++ value1 = get_some_value()
++ value2 = get_some_other_value()
++ try:
++ score = external_library.compute_some_score(value1, value2)
++ except ValueError:
++ score = 0.0
+```
+
+### Avoid lambda functions
+
+`lambda` functions can be useful for defining simple anonymous functions in a single line, but they also introduce problems: for instance, they require [additional logic](https://stackoverflow.com/questions/25348532/can-python-pickle-lambda-functions) in order to be pickled and are pretty ugly to type-annotate. So we typically avoid them in the code base and only use them in the serialization handlers and within tests for simplicity. Instead of `lambda`s, check if your code can be refactored to not need them, or use helper functions instead.
+
+```diff
+- split_string: Callable[[str], List[str]] = lambda value: [v.strip() for v in value.split(",")]
+
++ def split_string(value: str) -> List[str]:
++ return [v.strip() for v in value.split(",")]
+```
+
+### Iteration and comprehensions
+
+We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
+
+```diff
+- filtered = filter(lambda x: x in ["foo", "bar"], values)
++ filtered = (x for x in values if x in ["foo", "bar"])
+- filtered = list(filter(lambda x: x in ["foo", "bar"], values))
++ filtered = [x for x in values if x in ["foo", "bar"]]
+
+- result = map(lambda x: { x: x in ["foo", "bar"]}, values)
++ result = ({x: x in ["foo", "bar"]} for x in values)
+- result = list(map(lambda x: { x: x in ["foo", "bar"]}, values))
++ result = [{x: x in ["foo", "bar"]} for x in values]
+```
+
+If your logic is more complex, it's often better to write a loop instead, even if it adds more lines of code in total. The result will be much easier to follow and understand.
+
+```diff
+- result = [{"key": key, "scores": {f"{i}": score for i, score in enumerate(scores)}} for key, scores in values]
+
++ result = []
++ for key, scores in values:
++ scores_dict = {f"{i}": score for i, score in enumerate(scores)}
++ result.append({"key": key, "scores": scores_dict})
+```
+
+### Composition vs. inheritance
+
+Although spaCy uses a lot of classes, **inheritance is viewed with some suspicion** – it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. Unless you're implementing a new data structure or pipeline component, you typically shouldn't have to use classes at all.
+
+### Don't use `print`
+
+The core library never `print`s anything. While we encourage using `print` statements for simple debugging (it's the most straightforward way of looking at what's happening), make sure to clean them up once you're ready to submit your pull request. If you want to output warnings or debugging information for users, use the respective dedicated mechanisms for this instead (see sections on warnings and logging for details).
+
+The only exceptions are the CLI functions, which pretty-print messages for the user, and methods that are explicitly intended for printing things, e.g. `Language.analyze_pipes` with `pretty=True` enabled. For this, we use our lightweight helper library [`wasabi`](https://github.com/ines/wasabi).
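+
+For example, a CLI helper might print status messages like this (a minimal sketch; the messages themselves are made up):
+
+```python
+from wasabi import msg
+
+msg.info("Loading pipeline")          # neutral status message
+msg.good("Created output directory")  # success message
+msg.warn("No output directory provided, using the default")
+msg.fail("Config file not found", exits=1)  # error message that also exits
+```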
+
+## Naming
+
+Naming is hard and often a topic of long internal discussions. We don't expect you to come up with the perfect names for everything you write – finding the right names is often an iterative and collaborative process. That said, we do try to follow some basic conventions.
+
+Consistent with general Python conventions, we use `CamelCase` for class names including dataclasses, `snake_case` for methods, functions and variables, and `UPPER_SNAKE_CASE` for constants, typically defined at the top of a module. We also avoid using variable names that shadow the names of built-in functions, e.g. `input`, `help` or `list`.
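+
+For example (the names are purely illustrative):
+
+```python
+DEFAULT_BATCH_SIZE = 128  # constant: UPPER_SNAKE_CASE
+
+class TrainingResult:  # class: CamelCase
+    ...
+
+def compute_scores(examples):  # function: snake_case
+    n_examples = len(examples)  # variable: snake_case, doesn't shadow a built-in
+    return {"n_examples": n_examples}
+```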
+
+### Naming variables
+
+Variable names should always make it clear _what exactly_ the variable is and what it's used for. Instances of common classes should use the same consistent names. For example, you should avoid naming a text string (or anything else that's not a `Doc` object) `doc`. The most common class-to-variable mappings are:
+
+| Class | Variable | Example |
+| ---------- | --------------------- | ------------------------------------------- |
+| `Language` | `nlp` | `nlp = spacy.blank("en")` |
+| `Doc` | `doc` | `doc = nlp("Some text")` |
+| `Span` | `span`, `ent`, `sent` | `span = doc[1:4]`, `ent = doc.ents[0]` |
+| `Token` | `token` | `token = doc[0]` |
+| `Lexeme` | `lexeme`, `lex` | `lex = nlp.vocab["foo"]` |
+| `Vocab` | `vocab` | `vocab = Vocab()` |
+| `Example` | `example`, `eg` | `example = Example.from_dict(doc, gold)` |
+| `Config` | `config`, `cfg` | `config = Config().from_disk("config.cfg")` |
+
+We try to avoid introducing too many temporary variables, as these clutter your namespace. It's okay to re-assign to an existing variable, but only if the value has the same type.
+
+```diff
+ents = get_a_list_of_entities()
+ents = [ent for ent in ents if ent.label_ == "PERSON"]
+- ents = {(ent.start, ent.end): ent.label_ for ent in ents}
++ ent_mappings = {(ent.start, ent.end): ent.label_ for ent in ents}
+```
+
+### Naming methods and functions
+
+Try to choose short and descriptive names wherever possible, and use imperative verbs for methods that do something, e.g. `disable_pipes`, `add_patterns` or `get_vector`. Private methods and functions that are not intended to be part of the user-facing API should be prefixed with an underscore `_`. It's often helpful to look at the existing classes for inspiration.
+
+Objects that can be serialized, e.g. data structures and pipeline components, should implement the same consistent methods for serialization. Those usually include at least `to_disk`, `from_disk`, `to_bytes` and `from_bytes`. Some objects can also implement more specific methods like `{to/from}_dict` or `{to/from}_str`.
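+
+A custom serializable object could therefore expose methods along these lines (a sketch of the naming convention only, not an actual spaCy implementation):
+
+```python
+from pathlib import Path
+from typing import Union
+
+class CustomComponent:
+    def to_bytes(self, *, exclude=tuple()) -> bytes:
+        ...
+
+    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()) -> "CustomComponent":
+        ...
+
+    def to_disk(self, path: Union[str, Path], *, exclude=tuple()) -> None:
+        ...
+
+    def from_disk(self, path: Union[str, Path], *, exclude=tuple()) -> "CustomComponent":
+        ...
+```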
+
+## Error handling
+
+We always encourage writing helpful and detailed custom error messages for everything we can anticipate going wrong, and including as much detail as possible. spaCy provides a directory of error messages in `errors.py` with unique codes for each message. This allows us to keep the code base more concise and avoids long and nested blocks of text throughout the code that disrupt the reading flow. The codes make it easy to find references to the same error in different places, and also help identify problems reported by users (since we can just search for the error code).
+
+Errors can be referenced via their code, e.g. `Errors.E123`. Messages can also include placeholders for values that are populated by formatting the string with `.format()`.
+
+```python
+class Errors:
+ E123 = "Something went wrong"
+ E456 = "Unexpected value: {value}"
+```
+
+```diff
+if something_went_wrong:
+- raise ValueError("Something went wrong!")
++ raise ValueError(Errors.E123)
+
+if not isinstance(value, int):
+- raise ValueError(f"Unexpected value: {value}")
++ raise ValueError(Errors.E456.format(value=value))
+```
+
+As a general rule of thumb, all error messages raised within the **core library** should be added to `Errors`. The only place where we write errors and messages as strings is `spacy.cli`, since these functions typically pretty-print and generate a lot of output that'd otherwise be very difficult to separate from the actual logic.
+
+### Re-raising exceptions
+
+If we anticipate possible errors in third-party code that we don't control, or our own code in a very different context, we typically try to provide custom and more specific error messages if possible. If we need to re-raise an exception within a `try`/`except` block, we can re-raise a custom exception.
+
+[Re-raising `from`](https://docs.python.org/3/tutorial/errors.html#exception-chaining) the original caught exception lets us chain the exceptions, so the user sees both the original error, as well as the custom message with a note "The above exception was the direct cause of the following exception".
+
+```diff
+try:
+ run_third_party_code_that_might_fail()
+except ValueError as e:
++ raise ValueError(Errors.E123) from e
+```
+
+In some cases, it makes sense to suppress the original exception, e.g. if we know what it is and know that it's not particularly helpful. In that case, we can raise `from None`. This prevents clogging up the user's terminal with multiple and irrelevant chained exceptions.
+
+```diff
+try:
+ run_our_own_code_that_might_fail_confusingly()
+except ValueError:
++ raise ValueError(Errors.E123) from None
+```
+
+### Avoid using naked `assert`
+
+During development, it can sometimes be helpful to add `assert` statements throughout your code to make sure that the values you're working with are what you expect. However, as you clean up your code, those should either be removed or replaced by more explicit error handling:
+
+```diff
+- assert score >= 0.0
++ if score < 0.0:
++     raise ValueError(Errors.E789.format(score=score))
+```
+
+Otherwise, the user will get to see a naked `AssertionError` with no further explanation, which is very unhelpful. Instead of adding an error message to `assert`, it's always better to `raise` more explicit errors for specific conditions. If you're checking for something that _has to be right_ and would otherwise be a bug in spaCy, you can express this in the error message:
+
+```python
+E161 = ("Found an internal inconsistency when predicting entity links. "
+ "This is likely a bug in spaCy, so feel free to open an issue: "
+ "https://github.com/explosion/spaCy/issues")
+```
+
+### Warnings
+
+Instead of raising an error, some parts of the code base can raise warnings to notify the user of a potential problem. This is done using Python's `warnings.warn` and the messages defined in `Warnings` in `errors.py`. Whether or not warnings are shown can be controlled by the user, including custom filters for disabling specific warnings using a regular expression matching our internal codes, e.g. `W123`.
+
+```diff
+- print("Warning: No examples provided for validation")
++ warnings.warn(Warnings.W123)
+```
+
+When adding warnings, make sure you're not calling `warnings.warn` repeatedly, e.g. in a loop, which will clog up the terminal output. Instead, you can collect the potential problems first and then raise a single warning. If the problem is critical, consider raising an error instead.
+
+```diff
++ n_empty = 0
+for spans in lots_of_annotations:
+ if len(spans) == 0:
+-        warnings.warn(Warnings.W456)
++        n_empty += 1
++ warnings.warn(Warnings.W456.format(count=n_empty))
+```
+
+### Logging
+
+Log statements can be added via spaCy's `logger`, which uses Python's native `logging` module under the hood. We generally only use logging for debugging information that **the user may choose to see** in debugging mode or that's **relevant during training** but not at runtime.
+
+```diff
++ logger.info("Set up nlp object from config")
+config = nlp.config.interpolate()
+```
+
+`spacy train` and similar CLI commands will enable all log statements of level `INFO` by default (which is not the case at runtime). This allows outputting specific information within certain parts of the core library during training, without having it shown at runtime. `DEBUG`-level logs are only shown if the user enables `--verbose` logging during training. They can be used to provide more specific and potentially more verbose details, especially in areas that can indicate bugs or problems, or to surface more details about what spaCy does under the hood. You should only use logging statements if absolutely necessary and important.
+
+## Writing tests
+
+spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests for spaCy modules and classes live in their own directories of the same name and all test files should be prefixed with `test_`. Tests included in the core library only cover the code and do not depend on any trained pipelines. When implementing a new feature or fixing a bug, it's usually good to start by writing some tests that describe what _should_ happen. As you write your code, you can then keep running the relevant tests until all of them pass.
+
+### Test suite structure
+
+When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`.
+
+Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
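+
+For example, a regression test for a hypothetical issue #1234 might look like this:
+
+```python
+@pytest.mark.issue(1234)
+def test_issue1234():
+    # Reconstruct the reported problem with as little state as possible
+    doc = Doc(Vocab(), words=["hello", "world"])
+    assert len(doc) == 2
+```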
+
+The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
+
+### Constructing objects and state
+
+Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
+
+Tests should focus on exactly what they're testing and avoid dependencies on other unrelated library functionality wherever possible. If all your test needs is a `Doc` object with certain annotations set, you should always construct it manually:
+
+```python
+def test_doc_creation_with_pos():
+ doc = Doc(Vocab(), words=["hello", "world"], pos=["NOUN", "VERB"])
+ assert doc[0].pos_ == "NOUN"
+ assert doc[1].pos_ == "VERB"
+```
+
+### Parametrizing tests
+
+If you need to run the same test function over different input examples, you usually want to parametrize the test cases instead of using a loop within your test. This lets you keep a better separation between test cases and test logic, and it'll result in more useful output because `pytest` will be able to tell you which exact test case failed.
+
+The `@pytest.mark.parametrize` decorator takes two arguments: a string defining one or more comma-separated arguments that should be passed to the test function and a list of corresponding test cases (or a list of tuples to provide multiple arguments).
+
+```python
+@pytest.mark.parametrize("words", [["hello", "world"], ["this", "is", "a", "test"]])
+def test_doc_length(words):
+ doc = Doc(Vocab(), words=words)
+ assert len(doc) == len(words)
+```
+
+```python
+@pytest.mark.parametrize("text,expected_len", [("hello world", 2), ("I can't!", 4)])
+def test_token_length(en_tokenizer, text, expected_len): # en_tokenizer is a fixture
+ doc = en_tokenizer(text)
+ assert len(doc) == expected_len
+```
+
+You can also stack `@pytest.mark.parametrize` decorators, although this is not recommended unless it's absolutely needed or required for the test. When stacking decorators, keep in mind that this will run the test with all possible combinations of the respective parametrized values, which is often not what you want and can slow down the test suite.
+
+### Handling failing tests
+
+`xfail` means that a test **should pass but currently fails**, i.e. is expected to fail. You can mark a test as currently xfailing by adding the `@pytest.mark.xfail` decorator. This should only be used for tests that don't yet work, not for logic that causes errors we raise on purpose (see the section on testing errors for this). It's often very helpful to implement tests for edge cases that we don't yet cover and mark them as `xfail`. You can also provide a `reason` keyword argument to the decorator with an explanation of why the test currently fails.
+
+```diff
++ @pytest.mark.xfail(reason="Issue #225 - not yet implemented")
+def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
+ doc = en_tokenizer("Will this road take me to Puddleton?\u2014No.")
+ assert doc[8].text == "\u2014"
+```
+
+When you run the test suite, you may come across tests that are reported as `xpass`. This means that they're marked as `xfail` but didn't actually fail. This is worth looking into: sometimes, it can mean that we have since fixed a bug that caused the test to previously fail, so we can remove the decorator. In other cases, especially when it comes to machine learning model implementations, it can also indicate that the **test is flaky**: it sometimes passes and sometimes fails. This can be caused by a bug, or by constraints being too narrowly defined. If a test shows different behavior depending on whether it's run in isolation or not, this can indicate that it reacts to global state set in a previous test, which is not ideal and should be avoided.
+
+### Writing slow tests
+
+If a test is useful but potentially quite slow, you can mark it with the `@pytest.mark.slow` decorator. This is a special marker we introduced and tests decorated with it only run if you run the test suite with `--slow`, but not as part of the main CI process. Before introducing a slow test, double-check that there isn't another and more efficient way to test for the behavior. You should also consider adding a simpler test with maybe only a subset of the test cases that can always run, so we at least have some coverage.
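+
+For example, a hypothetical slow test might look like this:
+
+```python
+@pytest.mark.slow
+@pytest.mark.parametrize("n_docs", [1000, 10000])
+def test_tokenizer_many_docs(en_tokenizer, n_docs):
+    docs = [en_tokenizer("This is a sentence.") for _ in range(n_docs)]
+    assert all(len(doc) == 5 for doc in docs)
+```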
+
+### Skipping tests
+
+The `@pytest.mark.skip` decorator lets you skip tests entirely. You only want to do this for failing tests that may be slow to run or cause memory errors or segfaults, which would otherwise terminate the entire process and wouldn't be caught by `xfail`. We also sometimes use the `skip` decorator for old and outdated regression tests that we want to keep around but that don't apply anymore. When using the `skip` decorator, make sure to provide the `reason` keyword argument with a quick explanation of why you chose to skip this test.
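+
+For example (a hypothetical skipped test):
+
+```python
+@pytest.mark.skip(reason="Outdated: behavior is covered by the newer tokenizer tests")
+def test_some_old_behavior():
+    ...
+```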
+
+### Testing errors and warnings
+
+`pytest` lets you check whether a given error is raised by using the `pytest.raises` contextmanager. This is very useful when implementing custom error handling, so make sure you're not only testing for the correct behavior but also for errors resulting from incorrect inputs. If you're testing errors, you should always check for `pytest.raises` explicitly and not use `xfail`.
+
+```python
+words = ["a", "b", "c", "d", "e"]
+ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+with pytest.raises(ValueError):
+ Doc(Vocab(), words=words, ents=ents)
+```
+
+You can also use the `pytest.warns` contextmanager to check that a given warning type is raised. The first argument is the warning type or `None` (which will capture a list of warnings that you can `assert` is empty).
+
+```python
+def test_phrase_matcher_validation(en_vocab):
+ doc1 = Doc(en_vocab, words=["Test"], deps=["ROOT"])
+ doc2 = Doc(en_vocab, words=["Test"])
+ matcher = PhraseMatcher(en_vocab, validate=True)
+ with pytest.warns(UserWarning):
+ # Warn about unnecessarily parsed document
+ matcher.add("TEST1", [doc1])
+ with pytest.warns(None) as record:
+        matcher.add("TEST2", [doc2])
+ assert not record.list
+```
+
+Keep in mind that your tests will fail if you're using the `pytest.warns` contextmanager with a given warning and the warning is _not_ shown. So you should only use it to check that spaCy handles and outputs warnings correctly. If your test outputs a warning that's expected but not relevant to what you're testing, you can use the `@pytest.mark.filterwarnings` decorator and ignore specific warnings starting with a given code:
+
+```python
+@pytest.mark.filterwarnings("ignore:\\[W036")
+def test_matcher_empty(en_vocab):
+ matcher = Matcher(en_vocab)
+ matcher(Doc(en_vocab, words=["test"]))
+```
+
+### Testing trained pipelines
+
+Our regular test suite does not depend on any of the trained pipelines, since their outputs can vary and aren't generally required to test the library functionality. We test pipelines separately using the tests included in the [`spacy-models`](https://github.com/explosion/spacy-models) repository, which run whenever we train a new suite of models. The tests here mostly focus on making sure that the packages can be loaded and that the predictions seem reasonable, and they include checks for common bugs we encountered previously. If your test does not primarily focus on verifying a model's predictions, it should be part of the core library tests and construct the required objects manually, instead of being added to the models tests.
+
+Keep in mind that specific predictions may change, and we can't test for all incorrect predictions reported by users. Different models make different mistakes, so even a model that's significantly more accurate overall may end up making wrong predictions that it previously didn't. However, some surprising incorrect predictions may indicate deeper bugs that we definitely want to investigate.
diff --git a/extra/DEVELOPER_DOCS/Language.md b/extra/DEVELOPER_DOCS/Language.md
new file mode 100644
index 000000000..f4fc85095
--- /dev/null
+++ b/extra/DEVELOPER_DOCS/Language.md
@@ -0,0 +1,150 @@
+# Language
+
+> Reference: `spacy/language.py`
+
+1. [Constructing the `nlp` object from a config](#1-constructing-the-nlp-object-from-a-config)
+ - [A. Overview of `Language.from_config`](#1a-overview)
+ - [B. Component factories](#1b-how-pipeline-component-factories-work-in-the-config)
+ - [C. Sourcing a component](#1c-sourcing-a-pipeline-component)
+ - [D. Tracking components as they're modified](#1d-tracking-components-as-theyre-modified)
+ - [E. spaCy's config utility function](#1e-spacys-config-utility-functions)
+2. [Initialization](#initialization)
+ - [A. Initialization for training](#2a-initialization-for-training): `init_nlp`
+ - [B. Initializing the `nlp` object](#2b-initializing-the-nlp-object): `Language.initialize`
+ - [C. Initializing the vocab](#2c-initializing-the-vocab): `init_vocab`
+
+## 1. Constructing the `nlp` object from a config
+
+### 1A. Overview
+
+Most of the functions referenced in the config are regular functions with arbitrary arguments registered via the function registry. However, the pipeline components are a bit special: they not only receive the arguments passed in via the config file, but also the current `nlp` object and the string `name` of the individual component instance (so a user can have multiple components created with the same factory, e.g. `ner_one` and `ner_two`). This name can then be used by the components to add to the losses and scores. This special requirement means that pipeline components can't just be resolved via the config the "normal" way: we need to retrieve the component functions manually and pass them their arguments, plus the `nlp` and `name`.
+
+The `Language.from_config` classmethod takes care of constructing the `nlp` object from a config. It's the single place where this happens and what `spacy.load` delegates to under the hood. Its main responsibilities are:
+
+- **Load and validate the config**, and optionally **auto-fill** all missing values that we either have defaults for in the config template or that registered function arguments define defaults for. This helps ensure backwards-compatibility, because we're able to add a new argument `foo: str = "bar"` to an existing function, without breaking configs that don't specify it.
+- **Execute relevant callbacks** for pipeline creation, e.g. optional functions called before and after creation of the `nlp` object and pipeline.
+- **Initialize language subclass and create tokenizer**. The `from_config` classmethod will always be called on a language subclass, e.g. `English`, not on `Language` directly. Initializing the subclass takes a callback to create the tokenizer.
+- **Set up the pipeline components**. Components can either refer to a component factory or a `source`, i.e. an existing pipeline that's loaded and that the component is then copied from. We also need to ensure that we update the information about which components are disabled.
+- **Manage listeners.** If sourced components "listen" to other components (`tok2vec`, `transformer`), we need to ensure that the references are valid. If the config specifies that listeners should be replaced by copies (e.g. to give the `ner` component its own `tok2vec` model instead of listening to the shared `tok2vec` component in the pipeline), we also need to take care of that.
+
+Note that we only resolve and load **selected sections** in `Language.from_config`, i.e. only the parts that are relevant at runtime, which is `[nlp]` and `[components]`. We don't want to be resolving anything related to training or initialization, since this would mean loading and constructing unnecessary functions, including functions that require information that isn't necessarily available at runtime, like `paths.train`.
+
+### 1B. How pipeline component factories work in the config
+
+As opposed to regular registered functions that refer to a registry and function name (e.g. `"@misc": "foo.v1"`), pipeline components follow a different format and refer to their component `factory` name. This corresponds to the name defined via the `@Language.component` or `@Language.factory` decorator. We need this decorator to define additional meta information for the components, like their default config and score weights.
+
+```ini
+[components.my_component]
+factory = "foo"
+some_arg = "bar"
+other_arg = ${paths.some_path}
+```
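+
+On the Python side, a matching factory could be registered roughly like this (a sketch; the `foo` component and its arguments mirror the hypothetical config above):
+
+```python
+from spacy.language import Language
+from spacy.tokens import Doc
+
+@Language.factory("foo", default_config={"some_arg": "bar", "other_arg": ""})
+def create_foo_component(nlp: Language, name: str, some_arg: str, other_arg: str):
+    # In addition to the config arguments, the factory receives the current
+    # nlp object and the name of the component instance
+    def foo_component(doc: Doc) -> Doc:
+        return doc
+
+    return foo_component
+```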
+
+This means that we need to create and resolve the `config["components"]` separately from the rest of the config. There are some important considerations and things we need to manage explicitly to avoid unexpected behavior:
+
+#### Variable interpolation
+
+When a config is resolved, references to variables are replaced, so that the functions receive the correct value instead of just the variable name. To interpolate a config, we need it in its entirety: we couldn't just interpolate a subsection that refers to variables defined in a different subsection. So we first interpolate the entire config.
+
+However, the `nlp.config` should include the original config with variables intact – otherwise, loading a pipeline and saving it to disk will destroy all logic implemented via variables and hard-code the values all over the place. This means that when we create the components, we need to keep two versions of the config: the interpolated config with the "real" values and the `raw_config` including the variable references.
+
+#### Factory registry
+
+Component factories are special and use the `@Language.factory` or `@Language.component` decorator to register themselves and their meta. When the decorator runs, it performs some basic validation, stores the meta information for the factory on the `Language` class (default config, scores etc.) and then adds the factory function to `registry.factories`. The `component` decorator can be used for registering simple functions that just take a `Doc` object and return it, so in that case, we create the factory for the user automatically.
+
+There's one important detail to note about how factories are registered via entry points: A package that wants to expose spaCy components still needs to register them via the `@Language` decorators so we have the component meta information and can perform required checks. All we care about here is that the decorated function is **loaded and imported**. When it is, the `@Language` decorator takes care of everything, including actually registering the component factory.
+
+Normally, adding to the registry via an entry point will just add the function to the registry under the given name. But for `spacy_factories`, we don't actually want that: all we care about is that the function decorated with `@Language` is imported so the decorator runs. So we only exploit Python's entry point system to automatically import the function, and the `spacy_factories` entry point group actually adds to a **separate registry**, `registry._factories`, under the hood. Its only purpose is that the functions are imported. The decorator then runs, creates the factory if needed and adds it to the `registry.factories` registry.
+
+#### Language-specific factories
+
+spaCy supports registering factories on the `Language` base class, as well as language-specific subclasses like `English` or `German`. This allows providing different factories depending on the language, e.g. a different default lemmatizer. The `Language.get_factory_name` classmethod constructs the factory name as `{lang}.{name}` if a language is available (i.e. if it's a subclass) and falls back to `{name}` otherwise. So `@German.factory("foo")` will add a factory `de.foo` under the hood. If you call `nlp.add_pipe("foo")`, we first check if there's a factory for `{nlp.lang}.foo` and if not, we fall back to checking for a factory `foo`.
+
+#### Creating a pipeline component from a factory
+
+`Language.add_pipe` takes care of adding a pipeline component, given its factory name and its config (see the usage sketch after the following list). If no source pipeline to copy the component from is provided, it delegates to `Language.create_pipe`, which sets up the actual component function.
+
+- Validate the config and make sure that the factory was registered via the decorator and that we have meta for it.
+- Update the component config with any defaults specified by the component's `default_config`, if available. This is done by merging the values we receive into the defaults. It ensures that you can still add a component without having to specify its _entire_ config including more complex settings like `model`. If no `model` is defined, we use the default.
+- Check if we have a language-specific factory for the given `nlp.lang` and if not, fall back to the global factory.
+- Construct the component config, consisting of whatever arguments were provided, plus the current `nlp` object and `name`, which are default expected arguments of all factories. We also add a reference to the `@factories` registry, so we can resolve the config via the registry, like any other config. With the added `nlp` and `name`, it should now include all expected arguments of the given function.
+- Fill the config to make sure all unspecified defaults from the function arguments are added and update the `raw_config` (uninterpolated with variables intact) with that information, so the component config we store in `nlp.config` is up to date. We do this by adding the `raw_config` _into_ the filled config – otherwise, the references to variables would be overwritten.
+- Resolve the config and create all functions it refers to (e.g. `model`). This gives us the actual component function that we can insert into the pipeline.
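+
+From the user's perspective, this is the machinery behind a call like the following (the component and argument names are hypothetical):
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+# "foo" refers to a registered component factory, e.g. via @Language.factory
+nlp.add_pipe("foo", name="my_foo", config={"some_arg": "baz"})
+```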
+
+### 1C. Sourcing a pipeline component
+
+```ini
+[components.ner]
+source = "en_core_web_sm"
+```
+
+spaCy also allows ["sourcing" a component](https://spacy.io/usage/processing-pipelines#sourced-components), which will copy it over from an existing pipeline. In this case, `Language.add_pipe` will delegate to `Language.create_pipe_from_source`. In order to copy a component effectively and validate it, the source pipeline first needs to be loaded. This is done in `Language.from_config`, so a source pipeline only has to be loaded once if multiple components source from it. Sourcing a component will perform the following checks and modifications:
+
+- For each sourced pipeline component loaded in `Language.from_config`, a hash of the vectors data from the source pipeline is stored in the pipeline meta so we're able to check whether the vectors match and warn if not (since different vectors that are used as features in components can lead to degraded performance). Because the vectors are not loaded at the point when components are sourced, the check is postponed to `init_vocab` as part of `Language.initialize`.
+- If the sourced pipeline component is loaded through `Language.add_pipe(source=)`, the vectors are already loaded and can be compared directly. The check compares the shape and keys first and finally falls back to comparing the actual byte representation of the vectors (which is slower).
+- Ensure that the component is available in the pipeline.
+- Interpolate the entire config of the source pipeline so all variables are replaced and the component's config that's copied over doesn't include references to variables that are not available in the destination config.
+- Add the source `vocab.strings` to the destination's `vocab.strings` so we don't end up with unavailable strings in the final pipeline (which would also include labels used by the sourced component).
+
+Note that there may be other incompatibilities that we're currently not checking for and that could cause a sourced component to not work in the destination pipeline. We're interested in adding more checks here but there'll always be a small number of edge cases we'll never be able to catch, including a sourced component depending on other pipeline state that's not available in the destination pipeline.
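+
+For reference, the runtime equivalent of sourcing via the config is passing `source` to `nlp.add_pipe` directly (a minimal sketch that assumes the `en_core_web_sm` package is installed):
+
+```python
+import spacy
+
+source_nlp = spacy.load("en_core_web_sm")
+nlp = spacy.blank("en")
+# Copy the trained ner component (and its strings) from the source pipeline
+nlp.add_pipe("ner", source=source_nlp)
+```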
+
+### 1D. Tracking components as they're modified
+
+The `Language` class implements methods for removing, replacing or renaming pipeline components. Whenever we make these changes, we need to update the information stored on the `Language` object to ensure that it matches the current state of the pipeline. If a user just writes to `nlp.config` manually, we obviously can't ensure that the config matches the reality – but since we offer modification via the pipe methods, it's expected that spaCy keeps the config in sync under the hood. Otherwise, saving a modified pipeline to disk and loading it back wouldn't work. The internal attributes we need to keep in sync here are:
+
+| Attribute | Type | Description |
+| ------------------------ | ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Language._components` | `List[Tuple[str, Callable]]` | All pipeline components as `(name, func)` tuples. This is used as the source of truth for `Language.pipeline`, `Language.pipe_names` and `Language.components`. |
+| `Language._pipe_meta` | `Dict[str, FactoryMeta]` | The meta information of a component's factory, keyed by component name. This can include multiple components referring to the same factory meta. |
+| `Language._pipe_configs` | `Dict[str, Config]` | The component's config, keyed by component name. |
+| `Language._disabled` | `Set[str]` | Names of components that are currently disabled. |
+| `Language._config` | `Config` | The underlying config. This is only internals and will be used as the basis for constructing the config in the `Language.config` property. |
+
+In addition to the actual component settings in `[components]`, the config also allows specifying component-specific arguments via the `[initialize.components]` block, which are passed to the component's `initialize` method during initialization if it's available. So we also need to keep this in sync in the underlying config.
+
+### 1E. spaCy's config utility functions
+
+When working with configs in spaCy, make sure to use the utility functions provided by spaCy if available, instead of calling the respective `Config` methods. The utilities take care of providing spaCy-specific error messages and ensure a consistent order of config sections by setting the `section_order` argument. This ensures that exported configs always have the same consistent format.
+
+- `util.load_config`: load a config from a file
+- `util.load_config_from_str`: load a config from a string representation
+- `util.copy_config`: deepcopy a config
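+
+For example (a minimal sketch):
+
+```python
+from spacy import util
+
+config = util.load_config("config.cfg")
+config_copy = util.copy_config(config)
+```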
+
+## 2. Initialization
+
+Initialization is a separate step of the [config lifecycle](https://spacy.io/usage/training#config-lifecycle) that's not performed at runtime. It's implemented via the `training.initialize.init_nlp` helper and calls into the `Language.initialize` method, which sets up the pipeline and component models before training. The `initialize` method takes a callback that returns a sample of examples, which is used to initialize the component models, add all required labels and perform shape inference if applicable.
+
+Components can also define custom initialization settings via the `[initialize.components]` block, e.g. if they require external data like lookup tables to be loaded in. All config settings defined here will be passed to the component's `initialize` method, if it implements one. Components are expected to handle their own serialization after they're initialized so that any data or settings they require are saved with the pipeline and will be available from disk when the pipeline is loaded back at runtime.
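+
+For illustration, a custom component's `initialize` method might look roughly like this (a sketch; the `data_path` setting is hypothetical and would be provided via an `[initialize.components.my_component]` block in the config):
+
+```python
+from pathlib import Path
+from typing import Callable, Iterable, Optional
+
+from spacy.language import Language
+from spacy.tokens import Doc
+from spacy.training import Example
+
+
+class MyComponent:
+    def __init__(self, name: str):
+        self.name = name
+        self.lookup_table = {}
+
+    def __call__(self, doc: Doc) -> Doc:
+        return doc
+
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        data_path: Optional[str] = None,
+    ) -> None:
+        # data_path comes from [initialize.components.my_component] and is only
+        # needed at initialization time; the loaded data should be serialized
+        # with the component afterwards
+        if data_path is not None:
+            for line in Path(data_path).open(encoding="utf8"):
+                key, _, value = line.strip().partition("\t")
+                self.lookup_table[key] = value
+```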
+
+### 2A. Initialization for training
+
+The `init_nlp` function is called before training and returns an initialized `nlp` object that can be updated with the examples. It only needs the config and does the following:
+
+- Load and validate the config. In order to validate certain settings like the `seed`, we also interpolate the config to get the final value (because in theory, a user could provide this via a variable).
+- Set up the GPU allocation, if required.
+- Create the `nlp` object from the raw, uninterpolated config, which delegates to `Language.from_config`. Since this method may modify and auto-fill the config and pipeline component settings, we then use the interpolated version of `nlp.config` going forward, to ensure that what we're training with is up to date.
+- Resolve the `[training]` block of the config and perform validation, e.g. to check that the corpora are available.
+- Determine the components that should be frozen (not updated during training) or resumed (sourced components from a different pipeline that should be updated from the examples and not reset and re-initialized). To resume training, we can call the `nlp.resume_training` method.
+- Initialize the `nlp` object via `nlp.initialize` and pass it a `get_examples` callback that returns the training corpus (used for shape inference, setting up labels etc.). If the training corpus is streamed, we only provide a small sample of the data, which can potentially be infinite. `nlp.initialize` will delegate to the components as well and pass the data sample forward.
+- Check the listeners and warn about component dependencies, e.g. if a frozen component listens to a component that is retrained, or vice versa (which can degrade results).
+
+### 2B. Initializing the `nlp` object
+
+The `Language.initialize` method does the following:
+
+- **Resolve the config** defined in the `[initialize]` block separately (since everything else is already available in the loaded `nlp` object), based on the fully interpolated config.
+- **Execute callbacks**, i.e. `before_init` and `after_init`, if they're defined.
+- **Initialize the vocab**, including vocab data, lookup tables and vectors.
+- **Initialize the tokenizer** if it implements an `initialize` method. This is not the case for the default tokenizers, but it allows custom tokenizers to depend on external data resources that are loaded in on initialization.
+- **Initialize all pipeline components** if they implement an `initialize` method and pass them the `get_examples` callback, the current `nlp` object as well as any additional initialization config settings provided in the component-specific block.
+- **Initialize pretraining** if a `[pretraining]` block is available in the config. This allows loading pretrained tok2vec weights in `spacy pretrain`.
+- **Register listeners** if token-to-vector embedding layers of a component model "listen" to a previous component (`tok2vec`, `transformer`) in the pipeline.
+- **Create an optimizer** on the `Language` class, either by adding the optimizer passed as `sgd` to `initialize`, or by creating the optimizer defined in the config's training settings.
+
+### 2C. Initializing the vocab
+
+Vocab initialization is handled in the `training.initialize.init_vocab` helper. It receives the relevant loaded functions and values from the config and takes care of the following:
+
+- Add lookup tables defined in the config initialization, e.g. custom lemmatization tables. Those will be added to `nlp.vocab.lookups` from where they can be accessed by components.
+- Add JSONL-formatted [vocabulary data](https://spacy.io/api/data-formats#vocab-jsonl) to pre-populate the lexical attributes.
+- Load vectors into the pipeline. Vectors are defined as a name or path to a saved `nlp` object containing the vectors, e.g. `en_vectors_web_lg`. It's loaded and the vectors are ported over, while ensuring that all source strings are available in the destination strings. We also warn if there's a mismatch between sourced vectors, since this can lead to problems.
diff --git a/extra/DEVELOPER_DOCS/Listeners.md b/extra/DEVELOPER_DOCS/Listeners.md
new file mode 100644
index 000000000..3a71082e0
--- /dev/null
+++ b/extra/DEVELOPER_DOCS/Listeners.md
@@ -0,0 +1,220 @@
+# Listeners
+
+1. [Overview](#1-overview)
+2. [Initialization](#2-initialization)
+ - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
+ - [B. Shape inference](#2b-shape-inference)
+3. [Internal communication](#3-internal-communication)
+ - [A. During prediction](#3a-during-prediction)
+ - [B. During training](#3b-during-training)
+ - [C. Frozen components](#3c-frozen-components)
+4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
+
+## 1. Overview
+
+Trainable spaCy components typically use some sort of `tok2vec` layer as part of the `model` definition.
+This `tok2vec` layer produces embeddings and is either a standard `Tok2Vec` layer, or a Transformer-based one.
+Both versions can be used either inline/standalone, which means that they are defined and used
+by only one specific component (e.g. NER), or
+[shared](https://spacy.io/usage/embeddings-transformers#embedding-layers),
+in which case the embedding functionality becomes a separate component that can
+feed embeddings to multiple components downstream, using a listener-pattern.
+
+| Type | Usage | Model Architecture |
+| ------------- | ---------- | -------------------------------------------------------------------------------------------------- |
+| `Tok2Vec` | standalone | [`spacy.Tok2Vec`](https://spacy.io/api/architectures#Tok2Vec) |
+| `Tok2Vec` | listener | [`spacy.Tok2VecListener`](https://spacy.io/api/architectures#Tok2VecListener) |
+| `Transformer` | standalone | [`spacy-transformers.Tok2VecTransformer`](https://spacy.io/api/architectures#Tok2VecTransformer) |
+| `Transformer` | listener | [`spacy-transformers.TransformerListener`](https://spacy.io/api/architectures#TransformerListener) |
+
+Here we discuss the listener pattern and its implementation in code in more detail.
+
+## 2. Initialization
+
+### 2A. Linking listeners to the embedding component
+
+To allow sharing a `tok2vec` layer, a separate `tok2vec` component needs to be defined in the config:
+
+```
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+```
+
+A listener can then be set up by making sure the correct `upstream` name is defined, referring to the
+name of the `tok2vec` component (which equals the factory name by default), or `*` as a wildcard:
+
+```
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+upstream = "tok2vec"
+```
+
+When an [`nlp`](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Language.md) object is
+initialized or deserialized, it will make sure to link each `tok2vec` component to its listeners. This is
+implemented in the method `nlp._link_components()` which loops over each
+component in the pipeline and calls `find_listeners()` on a component if it's defined.
+The [`tok2vec` component](https://github.com/explosion/spaCy/blob/master/spacy/pipeline/tok2vec.py)'s implementation
+of this `find_listeners()` method will specifically identify sublayers of a model definition that are of type
+`Tok2VecListener` with a matching upstream name and will then add that listener to the internal `self.listener_map`.
+
+If it's a Transformer-based pipeline, a
+[`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py)
+has a similar implementation but its `find_listeners()` function will specifically look for `TransformerListener`
+sublayers of downstream components.
+
+### 2B. Shape inference
+
+Typically, the output dimension `nO` of a listener's model equals the `nO` (or `width`) of the upstream embedding layer.
+For a standard `Tok2Vec`-based component, this is typically known up-front and defined as such in the config:
+
+```
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+```
+
+A `transformer` component however only knows its `nO` dimension after the HuggingFace transformer
+is set with the function `model.attrs["set_transformer"]`,
+[implemented](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py)
+by `set_pytorch_transformer`.
+This is why, upon linking of the transformer listeners, the `transformer` component also makes sure to set
+the listener's output dimension correctly.
+
+This shape inference mechanism also needs to happen with resumed/frozen components, which means that for some CLI
+commands (`assemble` and `train`), we need to call `nlp._link_components` even before initializing the `nlp`
+object. To cover all use-cases and avoid negative side effects, the code base ensures that performing the
+linking twice is not harmful.
+
+## 3. Internal communication
+
+The internal communication between a listener and its downstream components is organized by sending and
+receiving information across the components - either directly or implicitly.
+The details are different depending on whether the pipeline is currently training, or predicting.
+Either way, the `tok2vec` or `transformer` component always needs to run before the listener.
+
+### 3A. During prediction
+
+When the `Tok2Vec` pipeline component is called, its `predict()` method is executed to produce the results,
+which are then stored by `set_annotations()` in the `doc.tensor` field of the document(s).
+Similarly, the `Transformer` component stores the produced embeddings
+in `doc._.trf_data`. Next, the `forward` pass of a
+[`Tok2VecListener`](https://github.com/explosion/spaCy/blob/master/spacy/pipeline/tok2vec.py)
+or a
+[`TransformerListener`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/listener.py)
+accesses these fields on the `Doc` directly. Both listener implementations have a fallback mechanism for when these
+properties were not set on the `Doc`: in that case an all-zero tensor is produced and returned.
+We need this fallback mechanism to enable shape inference methods in Thinc, but the code
+is slightly risky and at times might hide another bug - so it's a good spot to be aware of.
+
+### 3B. During training
+
+During training, the `update()` methods of the `Tok2Vec` & `Transformer` components don't necessarily set the
+annotations on the `Doc` (though since 3.1 they can if they are part of the `annotating_components` list in the config).
+Instead, we rely on a caching mechanism between the original embedding component and its listener.
+Specifically, the produced embeddings are sent to the listeners by calling `listener.receive()` and uniquely
+identifying the batch of documents with a `batch_id`. This `receive()` call also sends the appropriate `backprop`
+call to ensure that gradients from the downstream component flow back to the trainable `Tok2Vec` or `Transformer`
+network.
+
+We rely on the `nlp` object properly batching the data and sending each batch through the pipeline in sequence,
+which means that only one such batch needs to be kept in memory for each listener.
+When the downstream component runs and the listener should produce embeddings, it accesses the batch in memory,
+runs the backpropagation, and returns the results and the gradients.
+
+There are two ways in which this mechanism can fail, both of which are detected by `verify_inputs()`:
+
+- `E953` if a different batch is in memory than the requested one - signaling some kind of out-of-sync state of the
+ training pipeline.
+- `E954` if no batch is in memory at all - signaling that the pipeline is probably not set up correctly.
+
+#### Training with multiple listeners
+
+One `Tok2Vec` or `Transformer` component may be listened to by several downstream components, e.g.
+a tagger and a parser could be sharing the same embeddings. In this case, we need to be careful about how we do
+the backpropagation. When the `Tok2Vec` or `Transformer` sends out data to the listener with `receive()`, they will
+send an `accumulate_gradient` function call to all listeners, except the last one. This function will keep track
+of the gradients received so far. Only the final listener in the pipeline will get an actual `backprop` call that
+will initiate the backpropagation of the `tok2vec` or `transformer` model with the accumulated gradients.
+
+### 3C. Frozen components
+
+The listener pattern can get particularly tricky in combination with frozen components. To detect components
+with listeners that are not frozen consistently, `init_nlp()` (which is called by `spacy train`) goes through
+the listeners and their upstream components and warns in two scenarios.
+
+#### The Tok2Vec or Transformer is frozen
+
+If the `Tok2Vec` or `Transformer` was already trained,
+e.g. by [pretraining](https://spacy.io/usage/embeddings-transformers#pretraining),
+it could be a valid use-case to freeze the embedding architecture and only train downstream components such
+as a tagger or a parser. This wasn't possible before 3.1, but is supported since then by putting the
+embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components)
+list of the config. It then works like any other "annotating component", because the listener can read the
+precomputed embeddings from the `Doc` attributes.
+
+However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related
+listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`.
+
+#### The listening component is frozen
+
+If a downstream component with a listener is frozen but the underlying `Tok2Vec` or `Transformer` isn't, the
+performance of the frozen component will degrade after training, because the embeddings it was trained on are
+updated underneath it. In this case, a `W087` warning is shown, explaining
+how to use the `replace_listeners` functionality to prevent this problem.
+
+## 4. Replacing listener with standalone
+
+The [`replace_listeners`](https://spacy.io/api/language#replace_listeners) functionality changes the architecture
+of a downstream component from using a listener pattern to a standalone `tok2vec` or `transformer` layer,
+effectively making the downstream component independent of any other components in the pipeline.
+It is implemented by `nlp.replace_listeners()` and typically executed by `nlp.from_config()`.
+First, it fetches the `Model` of the original component that creates the embeddings:
+
+```
+tok2vec = self.get_pipe(tok2vec_name)
+tok2vec_model = tok2vec.model
+```
+
+This is either a [`Tok2Vec` model](https://github.com/explosion/spaCy/blob/master/spacy/ml/models/tok2vec.py) or a
+[`TransformerModel`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py).
+
+In the case of the `tok2vec`, this model can be copied as-is into the configuration and architecture of the
+downstream component. However, for the `transformer`, this doesn't work.
+The reason is that the `TransformerListener` architecture chains the listener with
+[`trfs2arrays`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/trfs2arrays.py):
+
+```
+model = chain(
+    TransformerListener(upstream_name=upstream),
+ trfs2arrays(pooling, grad_factor),
+)
+```
+
+but the standalone `Tok2VecTransformer` has an additional `split_trf_batch` chained in between the model
+and `trfs2arrays`:
+
+```
+model = chain(
+ TransformerModel(name, get_spans, tokenizer_config),
+ split_trf_batch(),
+ trfs2arrays(pooling, grad_factor),
+)
+```
+
+So you can't just take the model from the listener and drop it into the component internally - you need to
+adjust both the model and the config. To facilitate this, `nlp.replace_listeners()` will check whether additional
+[functions](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/_util.py) are
+[defined](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py)
+in `model.attrs`, and if so, it will essentially call these to make the appropriate changes:
+
+```
+replace_func = tok2vec_model.attrs["replace_listener_cfg"]
+new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"])
+...
+new_model = tok2vec_model.attrs["replace_listener"](new_model)
+```
+
+The new config and model are then properly stored on the `nlp` object.
+Note that this functionality (running the replacement for a transformer listener) was broken prior to
+`spacy-transformers` 1.0.5.
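+
+From a user's perspective, the replacement is typically invoked like this (illustrative snippet, assuming the
+`en_core_web_sm` pipeline is installed):
+
+```
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+# Give the tagger its own copy of the tok2vec, replacing the listener
+# referenced at "model.tok2vec" in the tagger's config.
+nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
+```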
diff --git a/extra/DEVELOPER_DOCS/README.md b/extra/DEVELOPER_DOCS/README.md
new file mode 100644
index 000000000..8ff505dc6
--- /dev/null
+++ b/extra/DEVELOPER_DOCS/README.md
@@ -0,0 +1,7 @@
+
+
+# Developer Documentation
+
+This directory includes additional documentation and explanations of spaCy's internals. It's mostly intended for the spaCy core development team and contributors interested in the more complex parts of the library. The documents generally focus on more abstract implementation details and how specific methods and algorithms work, and they assume knowledge of what's already available in the [usage documentation](https://spacy.io/usage) and [API reference](https://spacy.io/api).
+
+If you're looking to contribute to spaCy, make sure to check out the documentation and [contributing guide](https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md) first.
diff --git a/extra/DEVELOPER_DOCS/StringStore-Vocab.md b/extra/DEVELOPER_DOCS/StringStore-Vocab.md
new file mode 100644
index 000000000..866ba2aae
--- /dev/null
+++ b/extra/DEVELOPER_DOCS/StringStore-Vocab.md
@@ -0,0 +1,216 @@
+# StringStore & Vocab
+
+> Reference: `spacy/strings.pyx`
+> Reference: `spacy/vocab.pyx`
+
+## Overview
+
+spaCy represents most strings internally using a `uint64` in Cython, which
+corresponds to a hash. The magic required to make this largely transparent is
+handled by the `StringStore`, and is integrated into the pipelines using the
+`Vocab`, which also connects it to some other information.
+
+These are mostly internal details that average library users should never have
+to think about. On the other hand, when developing a component it's normal to
+interact with the Vocab for lexeme data or word vectors, and it's not unusual
+to add labels to the `StringStore`.
+
+## StringStore
+
+### Overview
+
+The `StringStore` is a `cdef class` that looks a bit like a two-way dictionary,
+though it is not a subclass of anything in particular.
+
+The main functionality of the `StringStore` is that `__getitem__` converts
+hashes into strings or strings into hashes.
+
+The full details of the conversion are complicated. Normally you shouldn't have
+to worry about them, but roughly, the cases below are checked in order and the
+first applicable one determines the return value:
+
+1. 0 and the empty string are special cased to each other
+2. internal symbols use a lookup table (`SYMBOLS_BY_STR`)
+3. normal strings or bytes are hashed
+4. internal symbol IDs in `SYMBOLS_BY_INT` are handled
+5. anything not yet handled is used as a hash to look up a string
+
+For the symbol enums, see [`symbols.pxd`](https://github.com/explosion/spaCy/blob/master/spacy/symbols.pxd).
+
+Almost all strings in spaCy are stored in the `StringStore`. This naturally
+includes tokens, but also includes things like labels (not just NER/POS/dep,
+but also categories etc.), lemmas, lowercase forms, word shapes, and so on. One
+of the main results of this is that tokens can be represented by a compact C
+struct ([`LexemeC`](https://spacy.io/api/cython-structs#lexemec)/[`TokenC`](https://spacy.io/api/cython-structs#tokenc)) that mostly consists of string hashes. This also means that converting
+input for the models is straightforward, and there's not a token mapping step
+like in many machine learning frameworks. Additionally, because the token IDs
+in spaCy are based on hashes, they are consistent across environments or
+models.
+
+One pattern you'll see a lot in spaCy APIs is that `something.value` returns an
+`int` and `something.value_` returns a string. That's implemented using the
+`StringStore`. Typically the `int` is stored in a C struct and the string is
+generated via a property that calls into the `StringStore` with the `int`.
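+
+For instance, `Token.orth` and `Token.orth_` follow this pattern (illustrative example):
+
+```
+import spacy
+
+nlp = spacy.blank("en")
+doc = nlp("I like cheese")
+token = doc[2]
+assert token.orth == nlp.vocab.strings["cheese"]  # the hash stored in the C struct
+assert token.orth_ == "cheese"                    # the string, resolved via the StringStore
+```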
+
+Besides `__getitem__`, the `StringStore` has functions to return specifically a
+string or specifically a hash, regardless of whether the input was a string or
+hash to begin with, though these are only used occasionally.
+
+### Implementation Details: Hashes and Allocations
+
+Hashes are 64-bit and are computed using [murmurhash][] on UTF-8 bytes. There is no
+mechanism for detecting and avoiding collisions. To date there has never been a
+reproducible collision or user report about any related issues.
+
+[murmurhash]: https://github.com/explosion/murmurhash
+
+The empty string is not hashed; it's just converted to/from 0.
+
+A small number of strings use indices into a lookup table (so low integers)
+rather than hashes. This is mostly Universal Dependencies labels or other
+strings considered "core" in spaCy. This was critical in v1, which hadn't
+introduced hashing yet. Since v2 it's important for items in `spacy.attrs`,
+especially lexeme flags, but is otherwise only maintained for backwards
+compatibility.
+
+You can call `strings["mystring"]` with a string the `StringStore` has never seen
+before and it will return a hash. But in order to do the reverse operation, you
+need to call `strings.add("mystring")` first. Without a call to `add` the
+string will not be interned.
+
+Example:
+
+```
+from spacy.strings import StringStore
+
+ss = StringStore()
+hashval = ss["spacy"] # 10639093010105930009
+try:
+ # this won't work
+ ss[hashval]
+except KeyError:
+ print(f"key {hashval} unknown in the StringStore.")
+
+ss.add("spacy")
+assert ss[hashval] == "spacy" # it works now
+
+# There is no `.keys` property, but you can iterate over keys
+# The empty string will never be in the list of keys
+for key in ss:
+ print(key)
+```
+
+In normal use nothing is ever removed from the `StringStore`. In theory this
+means that if you do something like iterate through all hex values of a certain
+length you can have explosive memory usage. In practice this has never been an
+issue. (Note that this is also different from using `sys.intern` to intern
+Python strings, which does not guarantee they won't be garbage collected later.)
+
+Strings are stored in the `StringStore` in a peculiar way: each string uses a
+union that is either an eight-byte `char[]` or a `char*`. Short strings are
+stored directly in the `char[]`, while longer strings are stored in allocated
+memory and prefixed with their length. This is a strategy to reduce indirection
+and memory fragmentation. See `decode_Utf8Str` and `_allocate` in
+`strings.pyx` for the implementation.
+
+### When to Use the StringStore?
+
+While you can ignore the `StringStore` in many cases, there are situations where
+you should make use of it to avoid errors.
+
+Any time you introduce a string that may be set on a `Doc` field that has a hash,
+you should add the string to the `StringStore`. This mainly happens when adding
+labels in components, but there are some other cases:
+
+- syntax iterators, mainly `get_noun_chunks`
+- external data used in components, like the `KnowledgeBase` in the `entity_linker`
+- labels used in tests
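+
+A minimal sketch of what this usually amounts to:
+
+```
+# Illustrative sketch: intern a new label string before it's used as a hash
+# on a Doc, e.g. inside a custom component.
+import spacy
+
+nlp = spacy.blank("en")
+label = "MY_NEW_LABEL"
+nlp.vocab.strings.add(label)
+assert nlp.vocab.strings[nlp.vocab.strings[label]] == label
+```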
+
+## Vocab
+
+The `Vocab` is a core component of a `Language` pipeline. Its main function is
+to manage `Lexeme`s, which are structs that contain information about a token
+that depends only on its surface form, without context. `Lexeme`s store much of
+the data associated with `Token`s. As a side effect of this the `Vocab` also
+manages the `StringStore` for a pipeline and a grab-bag of other data.
+
+These are things stored in the vocab:
+
+- `Lexeme`s
+- `StringStore`
+- `Morphology`: manages info used in `MorphAnalysis` objects
+- `vectors`: basically a dict for word vectors
+- `lookups`: language specific data like lemmas
+- `writing_system`: language specific metadata
+- `get_noun_chunks`: a syntax iterator
+- lex attribute getters: functions like `is_punct`, set in language defaults
+- `cfg`: **not** the pipeline config; this is mostly unused
+- `_unused_object`: a now-unused object, kept around until v4 for compatibility
+
+Some of these, like the Morphology and Vectors, are complex enough that they
+need their own explanations. Here we'll just look at Vocab-specific items.
+
+### Lexemes
+
+A `Lexeme` is a type that mainly wraps a `LexemeC`, a struct consisting of ints
+that identify various context-free token attributes. Lexemes are the core data
+of the `Vocab`, and can be accessed using `__getitem__` on the `Vocab`. The memory
+for storing `LexemeC` objects is managed by a pool that belongs to the `Vocab`.
+
+Note that `__getitem__` on the `Vocab` works much like the `StringStore`, in
+that it accepts a hash or id, with one important difference: if you do a lookup
+using a string, that value is added to the `StringStore` automatically.
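+
+For example (illustrative):
+
+```
+import spacy
+
+nlp = spacy.blank("en")
+lex = nlp.vocab["cheese"]                       # string lookup: "cheese" is interned automatically
+assert lex.orth == nlp.vocab.strings["cheese"]  # the lexeme stores the hash
+assert nlp.vocab[lex.orth].text == "cheese"     # hash lookup returns the same lexeme data
+```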
+
+The attributes stored in a `LexemeC` are:
+
+- orth (the raw text)
+- lower
+- norm
+- shape
+- prefix
+- suffix
+
+Most of these are straightforward. All of them can be customized, and (except
+`orth`) probably should be since the defaults are based on English, but in
+practice this is rarely done at present.
+
+### Lookups
+
+This is basically a dict of dicts that stores lemmas and other language-specific
+lookup data, implemented using a `Table` for each sub-dict.
+
+A `Table` is a subclass of `OrderedDict` used for string-to-string data. It uses
+Bloom filters to speed up misses and has some extra serialization features.
+Tables are not used outside of the lookups.
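+
+A rough sketch of how the container is used (see the [`Lookups` API docs](https://spacy.io/api/lookups) for
+the authoritative reference):
+
+```
+from spacy.lookups import Lookups
+
+lookups = Lookups()
+lookups.add_table("lemma_lookup", {"dogs": "dog", "was": "be"})
+table = lookups.get_table("lemma_lookup")
+assert table["dogs"] == "dog"
+```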
+
+### Lex Attribute Getters
+
+Lexical Attribute Getters like `is_punct` are defined on a per-language basis,
+much like lookups, but take the form of functions rather than string-to-string
+dicts, so they're stored separately.
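+
+Conceptually they work like this (illustrative sketch, using the `lex_attr_getters` argument of the `Vocab`
+constructor):
+
+```
+from spacy.attrs import IS_PUNCT
+from spacy.vocab import Vocab
+
+# A lex attribute getter is a plain function keyed by an attribute ID; it's
+# called once when the lexeme for a string is first created.
+vocab = Vocab(lex_attr_getters={IS_PUNCT: lambda text: text in {".", ",", "!", "?"}})
+assert vocab["!"].is_punct
+```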
+
+### Writing System
+
+This is a dict with three attributes:
+
+- `direction`: ltr or rtl (default ltr)
+- `has_case`: bool (default `True`)
+- `has_letters`: bool (default `True`, `False` only for CJK for now)
+
+Currently these are not used much - the main use is that `direction` is used in
+visualizers, though `rtl` doesn't quite work (see
+[#4854](https://github.com/explosion/spaCy/issues/4854)). In the future they
+could be used when choosing hyperparameters for subwords, controlling word
+shape generation, and similar tasks.
+
+### Other Vocab Members
+
+The Vocab is kind of the default place to store things from `Language.defaults`
+that don't belong to the Tokenizer. The following properties are in the Vocab
+just because they don't have anywhere else to go.
+
+- `get_noun_chunks`
+- `cfg`: This is a dict that just stores `oov_prob` (hardcoded to `-20`)
+- `_unused_object`: Leftover C member, should be removed in next major version
+
+
diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt
index 7bc3d4547..d58da9c4a 100644
--- a/licenses/3rd_party_licenses.txt
+++ b/licenses/3rd_party_licenses.txt
@@ -104,3 +104,26 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+
+
+importlib_metadata
+------------------
+
+* Files: util.py
+
+The implementation of packages_distributions() is adapted from
+importlib_metadata, which is distributed under the following license:
+
+Copyright 2017-2019 Jason R. Coombs, Barry Warsaw
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/pyproject.toml b/pyproject.toml
index 6d2dd2030..f81484d43 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc>=8.0.7,<8.1.0",
+ "thinc>=8.0.12,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pathy",
"numpy>=1.15.0",
diff --git a/requirements.txt b/requirements.txt
index b626c691a..8d7372cfe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,16 @@
# Our libraries
-spacy-legacy>=3.0.7,<3.1.0
+spacy-legacy>=3.0.8,<3.1.0
+spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.7,<8.1.0
+thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
-catalogue>=2.0.4,<2.1.0
-typer>=0.3.0,<0.4.0
+catalogue>=2.0.6,<2.1.0
+typer>=0.3.0,<0.5.0
pathy>=0.3.5
# Third party dependencies
numpy>=1.15.0
@@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
jinja2
+langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
@@ -29,3 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
+mypy==0.910
+types-dataclasses>=0.1.3; python_version < "3.7"
+types-mock>=0.1.1
+types-requests
diff --git a/setup.cfg b/setup.cfg
index afc4c4ed1..586a044ff 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,6 +21,7 @@ classifiers =
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
+ Programming Language :: Python :: 3.10
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@@ -37,19 +38,20 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc>=8.0.7,<8.1.0
+ thinc>=8.0.12,<8.1.0
install_requires =
# Our libraries
- spacy-legacy>=3.0.7,<3.1.0
+ spacy-legacy>=3.0.8,<3.1.0
+ spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc>=8.0.7,<8.1.0
+ thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
- catalogue>=2.0.4,<2.1.0
- typer>=0.3.0,<0.4.0
+ catalogue>=2.0.6,<2.1.0
+ typer>=0.3.0,<0.5.0
pathy>=0.3.5
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
@@ -61,6 +63,7 @@ install_requires =
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+ langcodes>=3.2.0,<4.0.0
[options.entry_points]
console_scripts =
@@ -68,37 +71,45 @@ console_scripts =
[options.extras_require]
lookups =
- spacy_lookups_data>=1.0.2,<1.1.0
+ spacy_lookups_data>=1.0.3,<1.1.0
transformers =
- spacy_transformers>=1.0.1,<1.1.0
+ spacy_transformers>=1.1.2,<1.2.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
- cupy>=5.0.0b4,<10.0.0
+ cupy>=5.0.0b4,<11.0.0
cuda80 =
- cupy-cuda80>=5.0.0b4,<10.0.0
+ cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 =
- cupy-cuda90>=5.0.0b4,<10.0.0
+ cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 =
- cupy-cuda91>=5.0.0b4,<10.0.0
+ cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 =
- cupy-cuda92>=5.0.0b4,<10.0.0
+ cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 =
- cupy-cuda100>=5.0.0b4,<10.0.0
+ cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 =
- cupy-cuda101>=5.0.0b4,<10.0.0
+ cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 =
- cupy-cuda102>=5.0.0b4,<10.0.0
+ cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 =
- cupy-cuda110>=5.0.0b4,<10.0.0
+ cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 =
- cupy-cuda111>=5.0.0b4,<10.0.0
+ cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 =
- cupy-cuda112>=5.0.0b4,<10.0.0
+ cupy-cuda112>=5.0.0b4,<11.0.0
+cuda113 =
+ cupy-cuda113>=5.0.0b4,<11.0.0
+cuda114 =
+ cupy-cuda114>=5.0.0b4,<11.0.0
+cuda115 =
+ cupy-cuda115>=5.0.0b4,<11.0.0
+apple =
+ thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
- sudachipy>=0.4.9
- sudachidict_core>=20200330
+ sudachipy>=0.5.2,!=0.6.1
+ sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =
@@ -122,9 +133,11 @@ exclude =
[tool:pytest]
markers =
- slow
+ slow: mark a test as slow
+ issue: reference specific issue
[mypy]
ignore_missing_imports = True
no_implicit_optional = True
plugins = pydantic.mypy, thinc.mypy
+allow_redefinition = True
diff --git a/setup.py b/setup.py
index fcc124a43..03a1e01dd 100755
--- a/setup.py
+++ b/setup.py
@@ -81,6 +81,7 @@ COPY_FILES = {
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
+ ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
}
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f20c32eb5..ca47edc94 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,7 +5,7 @@ import sys
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
-setup_default_warnings()
+setup_default_warnings() # noqa: E402
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
diff --git a/spacy/about.py b/spacy/about.py
index 499133cc0..c253d5052 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.1.0"
+__version__ = "3.2.1"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 9122de17b..dc8eed7c3 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -1,3 +1,6 @@
+from .errors import Errors
+
+IOB_STRINGS = ("", "I", "O", "B")
IDS = {
"": NULL_ATTR,
@@ -64,7 +67,6 @@ IDS = {
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
-
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
@@ -72,7 +74,6 @@ IDS = {
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
-
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
@@ -87,7 +88,7 @@ IDS = {
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
- "IDX": IDX
+ "IDX": IDX,
}
@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
inty_attrs = {}
if _do_deprecated:
- if 'F' in stringy_attrs:
+ if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
- if 'L' in stringy_attrs:
+ if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
- if 'pos' in stringy_attrs:
+ if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
- if 'morph' in stringy_attrs:
- morphs = stringy_attrs.pop('morph')
- if 'number' in stringy_attrs:
- stringy_attrs.pop('number')
- if 'tenspect' in stringy_attrs:
- stringy_attrs.pop('tenspect')
+ if "morph" in stringy_attrs:
+ morphs = stringy_attrs.pop("morph")
+ if "number" in stringy_attrs:
+ stringy_attrs.pop("number")
+ if "tenspect" in stringy_attrs:
+ stringy_attrs.pop("tenspect")
morph_keys = [
- 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
- 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
- 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
- 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
- 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
- 'NumValue', 'PartType', 'Polite', 'StyleVariant',
- 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
- 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
- 'Polarity', 'PrepCase', 'Animacy' # U20
+ "PunctType",
+ "PunctSide",
+ "Other",
+ "Degree",
+ "AdvType",
+ "Number",
+ "VerbForm",
+ "PronType",
+ "Aspect",
+ "Tense",
+ "PartType",
+ "Poss",
+ "Hyph",
+ "ConjType",
+ "NumType",
+ "Foreign",
+ "VerbType",
+ "NounType",
+ "Gender",
+ "Mood",
+ "Negative",
+ "Tense",
+ "Voice",
+ "Abbr",
+ "Derivation",
+ "Echo",
+ "Foreign",
+ "NameType",
+ "NounType",
+ "NumForm",
+ "NumValue",
+ "PartType",
+ "Polite",
+ "StyleVariant",
+ "PronType",
+ "AdjType",
+ "Person",
+ "Variant",
+ "AdpType",
+ "Reflex",
+ "Negative",
+ "Mood",
+ "Aspect",
+ "Case",
+ "Polarity",
+ "PrepCase",
+ "Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
- if strings_map is not None and isinstance(value, basestring):
- if hasattr(strings_map, 'add'):
+ if int_key == ENT_IOB:
+ if value in IOB_STRINGS:
+ value = IOB_STRINGS.index(value)
+ elif isinstance(value, str):
+ raise ValueError(Errors.E1025.format(value=value))
+ if strings_map is not None and isinstance(value, str):
+ if hasattr(strings_map, "add"):
value = strings_map.add(value)
else:
value = strings_map[value]
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ed1e840a5..fb680d888 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,4 +1,5 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
+from typing import TYPE_CHECKING, overload
import sys
import shutil
from pathlib import Path
@@ -15,6 +16,7 @@ from thinc.util import has_cupy, gpu_is_available
from configparser import InterpolationError
import os
+from ..compat import Literal
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
@@ -260,15 +262,16 @@ def get_checksum(path: Union[Path, str]) -> str:
RETURNS (str): The checksum.
"""
path = Path(path)
+ if not (path.is_file() or path.is_dir()):
+ msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
if path.is_file():
return hashlib.md5(Path(path).read_bytes()).hexdigest()
- if path.is_dir():
+ else:
# TODO: this is currently pretty slow
dir_checksum = hashlib.md5()
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
dir_checksum.update(sub_file.read_bytes())
return dir_checksum.hexdigest()
- msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
@contextmanager
@@ -397,7 +400,11 @@ def git_checkout(
run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories
try:
- shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
+ source_path = tmp_dir / Path(subpath)
+ if not is_subpath_of(tmp_dir, source_path):
+ err = f"'{subpath}' is a path outside of the cloned repository."
+ msg.fail(err, repo, exits=1)
+ shutil.copytree(str(source_path), str(dest))
except FileNotFoundError:
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
msg.fail(err, repo, exits=1)
@@ -445,8 +452,14 @@ def git_sparse_checkout(repo, subpath, dest, branch):
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
run_command(cmd, capture=True)
- # We need Path(name) to make sure we also support subdirectories
- shutil.move(str(tmp_dir / Path(subpath)), str(dest))
+
+ # Get a subdirectory of the cloned path, if appropriate
+ source_path = tmp_dir / Path(subpath)
+ if not is_subpath_of(tmp_dir, source_path):
+ err = f"'{subpath}' is a path outside of the cloned repository."
+ msg.fail(err, repo, exits=1)
+
+ shutil.move(str(source_path), str(dest))
def get_git_version(
@@ -458,12 +471,15 @@ def get_git_version(
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined.
"""
- ret = run_command("git --version", capture=True)
+ try:
+ ret = run_command("git --version", capture=True)
+ except:
+ raise RuntimeError(error)
stdout = ret.stdout.strip()
if not stdout or not stdout.startswith("git version"):
- return (0, 0)
+ return 0, 0
version = stdout[11:].strip().split(".")
- return (int(version[0]), int(version[1]))
+ return int(version[0]), int(version[1])
def _http_to_git(repo: str) -> str:
@@ -477,6 +493,29 @@ def _http_to_git(repo: str) -> str:
return repo
+def is_subpath_of(parent, child):
+ """
+ Check whether `child` is a path contained within `parent`.
+ """
+ # Based on https://stackoverflow.com/a/37095733 .
+
+ # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
+ # we can stop using crusty old os.path functions.
+ parent_realpath = os.path.realpath(parent)
+ child_realpath = os.path.realpath(child)
+ return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
+
+
+@overload
+def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
+ ...
+
+
+@overload
+def string_to_list(value: str, intify: Literal[True]) -> List[int]:
+ ...
+
+
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
"""Parse a comma-separated string to a list and account for various
formatting options. Mostly used to handle CLI arguments that take a list of
@@ -487,7 +526,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
"""
if not value:
- return []
+ return [] # type: ignore[return-value]
if value.startswith("[") and value.endswith("]"):
value = value[1:-1]
result = []
@@ -499,7 +538,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
p = p[1:-1]
p = p.strip()
if intify:
- p = int(p)
+ p = int(p) # type: ignore[assignment]
result.append(p)
return result
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index c84aa6431..04eb7078f 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,4 +1,4 @@
-from typing import Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
@@ -9,7 +9,7 @@ import itertools
from ._util import app, Arg, Opt
from ..training import docs_to_json
-from ..tokens import DocBin
+from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
from ..training.converters import conllu_to_docs
@@ -19,7 +19,7 @@ from ..training.converters import conllu_to_docs
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
-CONVERTERS = {
+CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"conllubio": conllu_to_docs,
"conllu": conllu_to_docs,
"conll": conll_ner_to_docs,
@@ -66,19 +66,16 @@ def convert_cli(
DOCS: https://spacy.io/api/cli#convert
"""
- if isinstance(file_type, FileTypes):
- # We get an instance of the FileTypes from the CLI so we need its string value
- file_type = file_type.value
input_path = Path(input_path)
- output_dir = "-" if output_dir == Path("-") else output_dir
+ output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
- verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
+ verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
convert(
input_path,
output_dir,
- file_type=file_type,
+ file_type=file_type.value,
n_sents=n_sents,
seg_sents=seg_sents,
model=model,
@@ -94,7 +91,7 @@ def convert_cli(
def convert(
- input_path: Union[str, Path],
+ input_path: Path,
output_dir: Union[str, Path],
*,
file_type: str = "json",
@@ -108,13 +105,14 @@ def convert(
lang: Optional[str] = None,
concatenate: bool = False,
silent: bool = True,
- msg: Optional[Printer],
+ msg: Optional[Printer] = None,
) -> None:
+ input_path = Path(input_path)
if not msg:
msg = Printer(no_print=silent)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = []
- for input_loc in walk_directory(Path(input_path), converter):
+ for input_loc in walk_directory(input_path, converter):
with input_loc.open("r", encoding="utf-8") as infile:
input_data = infile.read()
# Use converter function to convert data
@@ -141,7 +139,7 @@ def convert(
else:
db = DocBin(docs=docs, store_user_data=True)
len_docs = len(db)
- data = db.to_bytes()
+ data = db.to_bytes() # type: ignore[assignment]
if output_dir == "-":
_print_docs_to_stdout(data, file_type)
else:
@@ -220,13 +218,12 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
def verify_cli_args(
msg: Printer,
- input_path: Union[str, Path],
+ input_path: Path,
output_dir: Union[str, Path],
- file_type: FileTypes,
+ file_type: str,
converter: str,
ner_map: Optional[Path],
):
- input_path = Path(input_path)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
msg.fail(
f"Can't write .{file_type} data to stdout. Please specify an output directory.",
@@ -244,13 +241,13 @@ def verify_cli_args(
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if converter == "auto" and len(file_types) >= 2:
- file_types = ",".join(file_types)
- msg.fail("All input files must be same type", file_types, exits=1)
+ file_types_str = ",".join(file_types)
+ msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
-def _get_converter(msg, converter, input_path):
+def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 56ee12336..409fac4ed 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -25,7 +25,7 @@ def debug_config_cli(
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
# fmt: on
):
- """Debug a config.cfg file and show validation errors. The command will
+ """Debug a config file and show validation errors. The command will
create all objects in the tree and validate them. Note that some config
validation errors are blocking and will prevent the rest of the config from
being resolved. This means that you may not see all validation errors at
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index b4119abdf..ab7c20d48 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,4 +1,5 @@
-from typing import List, Sequence, Dict, Any, Tuple, Optional, Set
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import cast, overload
from pathlib import Path
from collections import Counter
import sys
@@ -13,10 +14,11 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer
+from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
+from ..compat import Literal
from .. import util
@@ -101,13 +103,14 @@ def debug_data(
# Create the gold corpus to be able to better analyze data
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+
+ nlp.initialize(lambda: train_corpus(nlp))
+ msg.good("Pipeline can be initialized with data")
+
train_dataset = list(train_corpus(nlp))
dev_dataset = list(dev_corpus(nlp))
msg.good("Corpus is loadable")
- nlp.initialize(lambda: train_dataset)
- msg.good("Pipeline can be initialized with data")
-
# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
gold_train_unpreprocessed_data = _compile_gold(
@@ -200,7 +203,7 @@ def debug_data(
has_low_data_warning = False
has_no_neg_warning = False
has_ws_ents_error = False
- has_punct_ents_warning = False
+ has_boundary_cross_ents_warning = False
msg.divider("Named Entity Recognition")
msg.info(f"{len(model_labels)} label(s)")
@@ -227,10 +230,6 @@ def debug_data(
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
has_ws_ents_error = True
- if gold_train_data["punct_ents"]:
- msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
- has_punct_ents_warning = True
-
for label in labels:
if label_counts[label] <= NEW_LABEL_THRESHOLD:
msg.warn(
@@ -244,14 +243,20 @@ def debug_data(
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
+ if gold_train_data["boundary_cross_ents"]:
+ msg.warn(
+ f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
+ )
+ has_boundary_cross_ents_warning = True
+
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if not has_ws_ents_error:
msg.good("No entities consisting of or starting/ending with whitespace")
- if not has_punct_ents_warning:
- msg.good("No entities consisting of or starting/ending with punctuation")
+ if not has_boundary_cross_ents_warning:
+ msg.good("No entities crossing sentence boundaries")
if has_low_data_warning:
msg.text(
@@ -267,15 +272,9 @@ def debug_data(
show=verbose,
)
if has_ws_ents_error:
- msg.text(
- "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
- "with whitespace characters are considered invalid."
- )
-
- if has_punct_ents_warning:
msg.text(
"Entity spans consisting of or starting/ending "
- "with punctuation can not be trained with a noise level > 0."
+ "with whitespace characters are considered invalid."
)
if "textcat" in factory_names:
@@ -377,10 +376,11 @@ def debug_data(
if "tagger" in factory_names:
msg.divider("Part-of-speech Tagging")
- labels = [label for label in gold_train_data["tags"]]
+ label_list = [label for label in gold_train_data["tags"]]
model_labels = _get_labels_from_model(nlp, "tagger")
- msg.info(f"{len(labels)} label(s) in train data")
- missing_labels = model_labels - set(labels)
+ msg.info(f"{len(label_list)} label(s) in train data")
+ labels = set(label_list)
+ missing_labels = model_labels - labels
if missing_labels:
msg.warn(
"Some model labels are not present in the train data. The "
@@ -394,10 +394,11 @@ def debug_data(
if "morphologizer" in factory_names:
msg.divider("Morphologizer (POS+Morph)")
- labels = [label for label in gold_train_data["morphs"]]
+ label_list = [label for label in gold_train_data["morphs"]]
model_labels = _get_labels_from_model(nlp, "morphologizer")
- msg.info(f"{len(labels)} label(s) in train data")
- missing_labels = model_labels - set(labels)
+ msg.info(f"{len(label_list)} label(s) in train data")
+ labels = set(label_list)
+ missing_labels = model_labels - labels
if missing_labels:
msg.warn(
"Some model labels are not present in the train data. The "
@@ -564,7 +565,7 @@ def _compile_gold(
nlp: Language,
make_proj: bool,
) -> Dict[str, Any]:
- data = {
+ data: Dict[str, Any] = {
"ner": Counter(),
"cats": Counter(),
"tags": Counter(),
@@ -573,7 +574,7 @@ def _compile_gold(
"words": Counter(),
"roots": Counter(),
"ws_ents": 0,
- "punct_ents": 0,
+ "boundary_cross_ents": 0,
"n_words": 0,
"n_misaligned_words": 0,
"words_missing_vectors": Counter(),
@@ -608,19 +609,11 @@ def _compile_gold(
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
# "Illegal" whitespace entity
data["ws_ents"] += 1
- if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
- ".",
- "'",
- "!",
- "?",
- ",",
- ]:
- # punctuation entity: could be replaced by whitespace when training with noise,
- # so add a warning to alert the user to this unexpected side effect.
- data["punct_ents"] += 1
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
+ if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+ data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
@@ -669,10 +662,28 @@ def _compile_gold(
return data
-def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
+@overload
+def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
+ ...
+
+
+@overload
+def _format_labels(
+ labels: Iterable[Tuple[str, int]],
+ counts: Literal[True],
+) -> str:
+ ...
+
+
+def _format_labels(
+ labels: Union[Iterable[str], Iterable[Tuple[str, int]]],
+ counts: bool = False,
+) -> str:
if counts:
- return ", ".join([f"'{l}' ({c})" for l, c in labels])
- return ", ".join([f"'{l}'" for l in labels])
+ return ", ".join(
+ [f"'{l}' ({c})" for l, c in cast(Iterable[Tuple[str, int]], labels)]
+ )
+ return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
@@ -688,8 +699,30 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
return count
-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
- if pipe_name not in nlp.pipe_names:
- return set()
- pipe = nlp.get_pipe(pipe_name)
- return set(pipe.labels)
+def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
+ pipe_names = [
+ pipe_name
+ for pipe_name in nlp.pipe_names
+ if nlp.get_pipe_meta(pipe_name).factory == factory_name
+ ]
+ labels: Set[str] = set()
+ for pipe_name in pipe_names:
+ pipe = nlp.get_pipe(pipe_name)
+ labels.update(pipe.labels)
+ return labels
+
+
+def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
+ pipe_names = [
+ pipe_name
+ for pipe_name in nlp.pipe_names
+ if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+ ]
+ labels: Dict[str, Set[str]] = {}
+ for pipe_name in pipe_names:
+ pipe = nlp.get_pipe(pipe_name)
+ assert isinstance(pipe, SpanCategorizer)
+ if pipe.key not in labels:
+ labels[pipe.key] = set()
+ labels[pipe.key].update(pipe.labels)
+ return labels
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 378911a20..0d08d2c5e 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -136,7 +136,7 @@ def evaluate(
def handle_scores_per_type(
- scores: Union[Scorer, Dict[str, Any]],
+ scores: Dict[str, Any],
data: Dict[str, Any] = {},
*,
spans_key: str = "sc",
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 8cc7018ff..e6a1cb616 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -15,7 +15,7 @@ def info_cli(
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
- exclude: Optional[str] = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
+ exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
# fmt: on
):
"""
@@ -61,7 +61,7 @@ def info(
return raw_data
-def info_spacy() -> Dict[str, any]:
+def info_spacy() -> Dict[str, Any]:
"""Generate info about the current spaCy intallation.
RETURNS (dict): The spaCy info.
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 55622452b..d4cd939c2 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -27,9 +27,9 @@ class Optimizations(str, Enum):
@init_cli.command("config")
def init_config_cli(
# fmt: off
- output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
- lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
- pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
+ output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
+ lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
+ pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
@@ -37,15 +37,13 @@ def init_config_cli(
# fmt: on
):
"""
- Generate a starter config.cfg for training. Based on your requirements
+ Generate a starter config file for training. Based on your requirements
specified via the CLI arguments, this command generates a config with the
optimal settings for your use case. This includes the choice of architecture,
pretrained weights and related hyperparameters.
DOCS: https://spacy.io/api/cli#init-config
"""
- if isinstance(optimize, Optimizations): # instance of enum from the CLI
- optimize = optimize.value
pipeline = string_to_list(pipeline)
is_stdout = str(output_file) == "-"
if not is_stdout and output_file.exists() and not force_overwrite:
@@ -57,7 +55,7 @@ def init_config_cli(
config = init_config(
lang=lang,
pipeline=pipeline,
- optimize=optimize,
+ optimize=optimize.value,
gpu=gpu,
pretraining=pretraining,
silent=is_stdout,
@@ -68,15 +66,15 @@ def init_config_cli(
@init_cli.command("fill-config")
def init_fill_config_cli(
# fmt: off
- base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
- output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+ base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
+ output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
# fmt: on
):
"""
- Fill partial config.cfg with default values. Will add all missing settings
+ Fill partial config file with default values. Will add all missing settings
from the default config and will create all objects, check the registered
functions for their default values and update the base config. This command
can be used with a config generated via the training quickstart widget:
@@ -175,8 +173,8 @@ def init_config(
"Pipeline": ", ".join(pipeline),
"Optimize for": optimize,
"Hardware": variables["hardware"].upper(),
- "Transformer": template_vars.transformer.get("name")
- if template_vars.use_transformer
+ "Transformer": template_vars.transformer.get("name") # type: ignore[attr-defined]
+ if template_vars.use_transformer # type: ignore[attr-defined]
else None,
}
msg.info("Generated config template specific for your use case")
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 2a920cdda..d53a61b8e 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -20,6 +20,7 @@ def init_vectors_cli(
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+ mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
@@ -34,7 +35,14 @@ def init_vectors_cli(
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
update_lexemes(nlp, jsonl_loc)
- convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+ convert_vectors(
+ nlp,
+ vectors_loc,
+ truncate=truncate,
+ prune=prune,
+ name=name,
+ mode=mode,
+ )
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
msg.good(
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 5dfe67296..f9d2a9af2 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,7 +1,10 @@
-from typing import Optional, Union, Any, Dict, List, Tuple
+from typing import Optional, Union, Any, Dict, List, Tuple, cast
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
+from thinc.api import Config
+from collections import defaultdict
+from catalogue import RegistryError
import srsly
import sys
@@ -99,6 +102,12 @@ def package(
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
meta = get_meta(input_dir, meta)
+ if meta["requirements"]:
+ msg.good(
+ f"Including {len(meta['requirements'])} package requirement(s) from "
+ f"meta and config",
+ ", ".join(meta["requirements"]),
+ )
if name is not None:
meta["name"] = name
if version is not None:
@@ -139,6 +148,9 @@ def package(
readme = generate_readme(meta)
create_file(readme_path, readme)
create_file(package_path / model_name_v / "README.md", readme)
+ msg.good("Generated README.md from meta.json")
+ else:
+ msg.info("Using existing README.md from pipeline directory")
imports = []
for code_path in code_paths:
imports.append(code_path.stem)
@@ -172,6 +184,64 @@ def has_wheel() -> bool:
return False
+def get_third_party_dependencies(
+ config: Config, exclude: List[str] = util.SimpleFrozenList()
+) -> List[str]:
+ """If the config includes references to registered functions that are
+ provided by third-party packages (spacy-transformers, other libraries), we
+ want to include them in meta["requirements"] so that the package specifies
+ them as dependencies and the user won't have to do it manually.
+
+ We do this by:
+ - traversing the config to check for registered function (@ keys)
+ - looking up the functions and getting their module
+ - looking up the module version and generating an appropriate version range
+
+ config (Config): The pipeline config.
+ exclude (list): List of packages to exclude (e.g. that already exist in meta).
+ RETURNS (list): The versioned requirements.
+ """
+ own_packages = ("spacy", "spacy-legacy", "spacy-nightly", "thinc", "srsly")
+ distributions = util.packages_distributions()
+ funcs = defaultdict(set)
+ # We only want to look at runtime-relevant sections, not [training] or [initialize]
+ for section in ("nlp", "components"):
+ for path, value in util.walk_dict(config[section]):
+ if path[-1].startswith("@"): # collect all function references by registry
+ funcs[path[-1][1:]].add(value)
+ for component in config.get("components", {}).values():
+ if "factory" in component:
+ funcs["factories"].add(component["factory"])
+ modules = set()
+ lang = config["nlp"]["lang"]
+ for reg_name, func_names in funcs.items():
+ for func_name in func_names:
+ # Try the lang-specific version and fall back
+ try:
+ func_info = util.registry.find(reg_name, lang + "." + func_name)
+ except RegistryError:
+ try:
+ func_info = util.registry.find(reg_name, func_name)
+ except RegistryError as regerr:
+ # lang-specific version being absent is not actually an issue
+ raise regerr from None
+ module_name = func_info.get("module") # type: ignore[attr-defined]
+ if module_name: # the code is part of a module, not a --code file
+ modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
+ dependencies = []
+ for module_name in modules:
+ if module_name in distributions:
+ dist = distributions.get(module_name)
+ if dist:
+ pkg = dist[0]
+ if pkg in own_packages or pkg in exclude:
+ continue
+ version = util.get_package_version(pkg)
+ version_range = util.get_minor_version_range(version) # type: ignore[arg-type]
+ dependencies.append(f"{pkg}{version_range}")
+ return dependencies
+
+
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
supported = ["sdist", "wheel", "none"]
for form in formats:
@@ -192,7 +262,7 @@ def create_file(file_path: Path, contents: str) -> None:
def get_meta(
model_path: Union[str, Path], existing_meta: Dict[str, Any]
) -> Dict[str, Any]:
- meta = {
+ meta: Dict[str, Any] = {
"lang": "en",
"name": "pipeline",
"version": "0.0.0",
@@ -202,9 +272,10 @@ def get_meta(
"url": "",
"license": "MIT",
}
- meta.update(existing_meta)
nlp = util.load_model_from_path(Path(model_path))
- meta["spacy_version"] = util.get_model_version_range(about.__version__)
+ meta.update(nlp.meta)
+ meta.update(existing_meta)
+ meta["spacy_version"] = util.get_minor_version_range(about.__version__)
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
@@ -213,6 +284,11 @@ def get_meta(
}
if about.__title__ != "spacy":
meta["parent_package"] = about.__title__
+ meta.setdefault("requirements", [])
+ # Update the requirements with all third-party packages in the config
+ existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
+ reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
+ meta["requirements"].extend(reqs)
return meta
@@ -258,8 +334,8 @@ def generate_readme(meta: Dict[str, Any]) -> str:
license_name = meta.get("license")
sources = _format_sources(meta.get("sources"))
description = meta.get("description")
- label_scheme = _format_label_scheme(meta.get("labels"))
- accuracy = _format_accuracy(meta.get("performance"))
+ label_scheme = _format_label_scheme(cast(Dict[str, Any], meta.get("labels")))
+ accuracy = _format_accuracy(cast(Dict[str, Any], meta.get("performance")))
table_data = [
(md.bold("Name"), md.code(name)),
(md.bold("Version"), md.code(version)),
@@ -331,7 +407,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
continue
col1 = md.bold(md.code(pipe))
col2 = ", ".join(
- [md.code(label.replace("|", "\\|")) for label in labels]
+ [md.code(str(label).replace("|", "\\|")) for label in labels]
) # noqa: W605
label_data.append((col1, col2))
n_labels += len(labels)
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index f4f0d3caf..3c282c73d 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -32,7 +32,7 @@ def profile_cli(
DOCS: https://spacy.io/api/cli#debug-profile
"""
- if ctx.parent.command.name == NAME: # called as top-level command
+ if ctx.parent.command.name == NAME: # type: ignore[union-attr] # called as top-level command
msg.warn(
"The profile command is now available via the 'debug profile' "
"subcommand. You can run python -m spacy debug --help for an "
@@ -42,9 +42,9 @@ def profile_cli(
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
-
if inputs is not None:
- inputs = _read_inputs(inputs, msg)
+ texts = _read_inputs(inputs, msg)
+ texts = list(itertools.islice(texts, n_texts))
if inputs is None:
try:
import ml_datasets
@@ -56,16 +56,13 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
exits=1,
)
- n_inputs = 25000
- with msg.loading("Loading IMDB dataset via Thinc..."):
- imdb_train, _ = ml_datasets.imdb()
- inputs, _ = zip(*imdb_train)
- msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
- inputs = inputs[:n_inputs]
+ with msg.loading("Loading IMDB dataset via ml_datasets..."):
+ imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
+ texts, _ = zip(*imdb_train)
+ msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
with msg.loading(f"Loading pipeline '{model}'..."):
nlp = load_model(model)
msg.good(f"Loaded pipeline '{model}'")
- texts = list(itertools.islice(inputs, n_texts))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
msg.divider("Profile stats")
@@ -87,7 +84,7 @@ def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
if not input_path.exists() or not input_path.is_file():
msg.fail("Not a valid input data file", loc, exits=1)
msg.info(f"Using data from {input_path.parts[-1]}")
- file_ = input_path.open()
+ file_ = input_path.open() # type: ignore[assignment]
for line in file_:
data = srsly.json_loads(line)
text = data["text"]
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index b49e18608..5e0cdfdf2 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -1,18 +1,25 @@
-from typing import Optional
+from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
+import os
import re
import shutil
import requests
+import typer
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
+from .._util import SimpleFrozenDict, parse_config_overrides
-@project_cli.command("assets")
+@project_cli.command(
+ "assets",
+ context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
def project_assets_cli(
# fmt: off
+ ctx: typer.Context, # This is only used to read additional arguments
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+.")
# fmt: on
@@ -24,16 +31,22 @@ def project_assets_cli(
DOCS: https://spacy.io/api/cli#project-assets
"""
- project_assets(project_dir, sparse_checkout=sparse_checkout)
+ overrides = parse_config_overrides(ctx.args)
+ project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
-def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
+def project_assets(
+ project_dir: Path,
+ *,
+ overrides: Dict[str, Any] = SimpleFrozenDict(),
+ sparse_checkout: bool = False,
+) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
overrides (Dict[str, Any]): Optional config overrides.
sparse_checkout (bool): Use sparse checkout for assets provided via Git.
"""
project_path = ensure_path(project_dir)
- config = load_project_config(project_path)
+ config = load_project_config(project_path, overrides=overrides)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
@@ -59,6 +72,15 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
shutil.rmtree(dest)
else:
dest.unlink()
+ if "repo" not in asset["git"] or asset["git"]["repo"] is None:
+ msg.fail(
+ "A git asset must include 'repo', the repository address.", exits=1
+ )
+ if "path" not in asset["git"] or asset["git"]["path"] is None:
+ msg.fail(
+ "A git asset must include 'path' - use \"\" to get the entire repository.",
+ exits=1,
+ )
git_checkout(
asset["git"]["repo"],
asset["git"]["path"],
@@ -108,11 +130,17 @@ def fetch_asset(
the asset failed.
"""
dest_path = (project_path / dest).resolve()
- if dest_path.exists() and checksum:
+ if dest_path.exists():
# If there's already a file, check for checksum
- if checksum == get_checksum(dest_path):
- msg.good(f"Skipping download with matching checksum: {dest}")
- return dest_path
+ if checksum:
+ if checksum == get_checksum(dest_path):
+ msg.good(f"Skipping download with matching checksum: {dest}")
+ return
+ else:
+ # If there's no checksum, at least make sure the file isn't empty; delete zero-byte files so they can be re-downloaded
+ if os.path.getsize(dest_path) == 0:
+ msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
+ os.remove(dest_path)
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():
@@ -129,7 +157,6 @@ def fetch_asset(
msg.good(f"Copied local asset {dest}")
else:
msg.fail(f"Download failed: {dest}", e)
- return
if checksum and checksum != get_checksum(dest_path):
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
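
With the new typer context and parse_config_overrides(ctx.args), `spacy project assets` accepts the same dot-notation overrides as other project commands, and project_assets() can be driven from Python. A hedged sketch; the "vars.version" key is only an example and has to exist in your project.yml:

    # Sketch: calling project_assets with overrides; "vars.version" is a
    # hypothetical key defined in the project.yml "vars" block.
    from pathlib import Path
    from spacy.cli.project.assets import project_assets

    project_assets(
        Path("."),                            # cloned project directory
        overrides={"vars.version": "1.0.0"},  # roughly: --vars.version 1.0.0 on the CLI
        sparse_checkout=False,
    )
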
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 72d4004f8..360ee3428 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -80,9 +80,9 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
repo (str): URL of the repo to clone from.
"""
git_err = (
- f"Cloning spaCy project templates requires Git and the 'git' command. ",
+ f"Cloning spaCy project templates requires Git and the 'git' command. "
f"To clone a project without Git, copy the files from the '{name}' "
- f"directory in the {repo} to {dest} manually.",
+ f"directory in the {repo} to {dest} manually."
)
get_git_version(error=git_err)
if not dest:
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index 7e37712c3..83dc5efbf 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -143,8 +143,8 @@ def run_dvc_commands(
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
- for command in commands:
- command = split_command(command)
+ for c in commands:
+ command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index b88387a9f..6e3cde88c 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -2,7 +2,7 @@ from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
-from .._util import project_cli, Arg
+from .._util import project_cli, Arg, logger
from .._util import load_project_config
from .run import update_lockfile
@@ -39,11 +39,15 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
+ logger.debug(f"CMD: {cmd['name']}.")
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
+ logger.debug(
+ f"URL: {url} for {output_path} with command hash {cmd_hash}"
+ )
yield url, output_path
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
@@ -53,6 +57,8 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# we iterate over the loop again.
commands.pop(i)
break
+ else:
+ logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
else:
# If we didn't break the for loop, break the while loop.
break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index 44050b716..bc779e9cd 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -3,7 +3,7 @@ from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
-from .._util import project_cli, Arg
+from .._util import project_cli, Arg, logger
@project_cli.command("push")
@@ -37,12 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
+ logger.debug(f"CMD: {cmd['name']}")
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
+ logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
+ logger.debug(f"CMD_HASH: {cmd_hash}")
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -51,6 +54,9 @@ def project_push(project_dir: Path, remote: str):
command_hash=cmd_hash,
content_hash=get_content_hash(output_loc),
)
+ logger.debug(
+ f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+ )
yield output_path, url
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 6056458e2..336a4bcb3 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -41,7 +41,7 @@ class RemoteStorage:
raise IOError(f"Cannot push {loc}: does not exist.")
url = self.make_url(path, command_hash, content_hash)
if url.exists():
- return None
+ return url
tmp: Path
with make_tempdir() as tmp:
tar_loc = tmp / self.encode_name(str(path))
@@ -131,8 +131,10 @@ def get_command_hash(
currently installed packages, whatever environment variables have been marked
as relevant, and the command.
"""
- check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
- spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
+ if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
+ spacy_v = GIT_VERSION
+ else:
+ spacy_v = str(get_minor_version(about.__version__) or "")
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
hashes.extend(cmd)
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 5339d2a21..734803bc4 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -1,6 +1,7 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable
from pathlib import Path
from wasabi import msg
+from wasabi.util import locale_escape
import sys
import srsly
import typer
@@ -57,6 +58,7 @@ def project_run(
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
+ overrides (Dict[str, Any]): Optional config overrides.
force (bool): Force re-running, even if nothing changed.
dry (bool): Perform a dry run and don't execute commands.
capture (bool): Whether to capture the output and errors of individual commands.
@@ -68,11 +70,18 @@ def project_run(
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
- validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+ validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
- project_run(project_dir, cmd, force=force, dry=dry, capture=capture)
+ project_run(
+ project_dir,
+ cmd,
+ overrides=overrides,
+ force=force,
+ dry=dry,
+ capture=capture,
+ )
else:
cmd = commands[subcommand]
for dep in cmd.get("deps", []):
@@ -107,7 +116,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
workflows = config.get("workflows", {})
project_loc = "" if is_cwd(project_dir) else project_dir
if subcommand:
- validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+ validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
if subcommand in commands:
help_text = commands[subcommand].get("help")
@@ -127,7 +136,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
print("")
title = config.get("title")
if title:
- print(f"{title}\n")
+ print(f"{locale_escape(title)}\n")
if config_commands:
print(f"Available commands in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
@@ -155,8 +164,8 @@ def run_commands(
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
"""
- for command in commands:
- command = split_command(command)
+ for c in commands:
+ command = split_command(c)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
@@ -212,6 +221,9 @@ def check_rerun(
strict_version (bool):
RETURNS (bool): Whether to re-run the command.
"""
+ # Always rerun if no-skip is set
+ if command.get("no_skip", False):
+ return True
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True
@@ -282,7 +294,7 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
}
-def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
"""Generate the file information for a list of paths (dependencies, outputs).
Includes the file path and the file's checksum.
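
Two behavioural changes above are easy to miss: overrides are now forwarded to every step when a workflow is run, and a command with "no_skip" set always re-runs regardless of the lockfile. A hedged sketch of driving a workflow from Python; the workflow name and override key are placeholders:

    # Sketch: "all" and "vars.gpu_id" are placeholders from a hypothetical project.yml.
    from pathlib import Path
    from spacy.cli.project.run import project_run

    project_run(
        Path("."),
        "all",                           # workflow or command name
        overrides={"vars.gpu_id": "0"},  # now applied to each workflow step
        dry=True,                        # show what would run without executing
    )
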
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 339fb1e96..b78806fec 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -16,7 +16,10 @@ gpu_allocator = null
[nlp]
lang = "{{ lang }}"
-{%- if "tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or (("textcat" in components or "textcat_multilabel" in components) and optimize == "accuracy") -%}
+{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
+{%- set with_accuracy = optimize == "accuracy" -%}
+{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
{%- else -%}
{%- set full_pipeline = components %}
@@ -32,7 +35,7 @@ batch_size = {{ 128 if hardware == "gpu" else 1000 }}
factory = "transformer"
[components.transformer.model]
-@architectures = "spacy-transformers.TransformerModel.v1"
+@architectures = "spacy-transformers.TransformerModel.v3"
name = "{{ transformer["name"] }}"
tokenizer_config = {"use_fast": true}
@@ -198,7 +201,7 @@ no_output_layer = false
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
-
+{% if "tok2vec" in full_pipeline -%}
[components.tok2vec]
factory = "tok2vec"
@@ -223,6 +226,7 @@ width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
+{% endif -%}
{% if "morphologizer" in components %}
[components.morphologizer]
diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml
index dac3a26c1..a7bf9b74a 100644
--- a/spacy/cli/templates/quickstart_training_recommendations.yml
+++ b/spacy/cli/templates/quickstart_training_recommendations.yml
@@ -41,10 +41,10 @@ da:
word_vectors: da_core_news_lg
transformer:
efficiency:
- name: DJSammy/bert-base-danish-uncased_BotXO,ai
+ name: Maltehb/danish-bert-botxo
size_factor: 3
accuracy:
- name: DJSammy/bert-base-danish-uncased_BotXO,ai
+ name: Maltehb/danish-bert-botxo
size_factor: 3
de:
word_vectors: de_core_news_lg
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 2932edd3b..cc22cbba6 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict, Any, Union
from pathlib import Path
from wasabi import msg
import typer
@@ -7,7 +7,7 @@ import sys
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
-from ..training.loop import train
+from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util
@@ -40,14 +40,30 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ overrides = parse_config_overrides(ctx.args)
+ import_code(code_path)
+ train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+
+
+def train(
+ config_path: Union[str, Path],
+ output_path: Optional[Union[str, Path]] = None,
+ *,
+ use_gpu: int = -1,
+ overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+):
+ config_path = util.ensure_path(config_path)
+ output_path = util.ensure_path(output_path)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
- if output_path is not None and not output_path.exists():
- output_path.mkdir(parents=True)
- msg.good(f"Created output directory: {output_path}")
- overrides = parse_config_overrides(ctx.args)
- import_code(code_path)
+ if not output_path:
+ msg.info("No output directory provided")
+ else:
+ if not output_path.exists():
+ output_path.mkdir(parents=True)
+ msg.good(f"Created output directory: {output_path}")
+ msg.info(f"Saving to output directory: {output_path}")
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False)
@@ -56,4 +72,4 @@ def train_cli(
nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline")
msg.divider("Training pipeline")
- train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
+ train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
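
The train CLI is now a thin wrapper around a reusable train() function, so training can be started from Python with the same config overrides as `spacy train`. A short sketch; the corpus paths are placeholders:

    # Sketch: the new spacy.cli.train.train entry point. The .spacy paths are
    # placeholders for your own serialized corpora.
    from spacy.cli.train import train

    train(
        "config.cfg",
        output_path="./output",
        use_gpu=-1,  # CPU
        overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
    )
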
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index a727e380e..a918e9a39 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -99,7 +99,7 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
warnings.filterwarnings("ignore", message="\\[W09[45]")
model_meta = get_model_meta(model_path)
spacy_version = model_meta.get("spacy_version", "n/a")
- is_compat = is_compatible_version(about.__version__, spacy_version)
+ is_compat = is_compatible_version(about.__version__, spacy_version) # type: ignore[assignment]
pkgs[pkg_name] = {
"name": package,
"version": version,
diff --git a/spacy/compat.py b/spacy/compat.py
index 6eca18b80..89132735d 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -5,12 +5,12 @@ from thinc.util import copy_array
try:
import cPickle as pickle
except ImportError:
- import pickle
+ import pickle # type: ignore[no-redef]
try:
import copy_reg
except ImportError:
- import copyreg as copy_reg
+ import copyreg as copy_reg # type: ignore[no-redef]
try:
from cupy.cuda.stream import Stream as CudaStream
@@ -22,10 +22,18 @@ try:
except ImportError:
cupy = None
+if sys.version_info[:2] >= (3, 8): # Python 3.8+
+ from typing import Literal, Protocol, runtime_checkable
+else:
+ from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
+
+# Important note: The importlib_metadata "backport" includes functionality
+# that's not part of the built-in importlib.metadata. We should treat this
+# import like the built-in and only use what's available there.
try: # Python 3.8+
- from typing import Literal
+ import importlib.metadata as importlib_metadata
except ImportError:
- from typing_extensions import Literal # noqa: F401
+ from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
from thinc.api import Optimizer # noqa: F401
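
The shim above means other modules can import importlib_metadata from spacy.compat on any supported Python version, as long as they stick to the built-in importlib.metadata API. For example:

    # Uses only functionality that exists in the built-in importlib.metadata.
    from spacy.compat import importlib_metadata

    print(importlib_metadata.version("spacy"))
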
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index ceb7357fc..86a72926e 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -68,12 +68,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
-# Controls early-stopping. 0 disables early stopping.
+# Controls early-stopping, i.e., the number of steps to continue without
+# improvement before stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
# memory and shuffled within the training loop. -1 means stream train corpus
# rather than loading in memory with no shuffling within the training loop.
max_epochs = 0
+# Maximum number of update steps to train for. 0 means an unlimited number of steps.
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index 16f767772..d70ecf04c 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -5,6 +5,7 @@ raw_text = null
max_epochs = 1000
dropout = 0.2
n_save_every = null
+n_save_epoch = null
component = "tok2vec"
layer = ""
corpus = "corpora.pretrain"
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index 78b83f2e5..25d530c83 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -18,7 +18,7 @@ RENDER_WRAPPER = None
def render(
- docs: Union[Iterable[Union[Doc, Span]], Doc, Span],
+ docs: Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict],
style: str = "dep",
page: bool = False,
minify: bool = False,
@@ -28,7 +28,8 @@ def render(
) -> str:
"""Render displaCy visualisation.
- docs (Union[Iterable[Doc], Doc]): Document(s) to visualise.
+ docs (Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]): Document(s) to visualise.
+ A 'dict' is only allowed here when 'manual' is set to True.
style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
@@ -53,8 +54,8 @@ def render(
raise ValueError(Errors.E096)
renderer_func, converter = factories[style]
renderer = renderer_func(options=options)
- parsed = [converter(doc, options) for doc in docs] if not manual else docs
- _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
+ parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore
+ _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore
html = _html["parsed"]
if RENDER_WRAPPER is not None:
html = RENDER_WRAPPER(html)
@@ -133,7 +134,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"lemma": np.root.lemma_,
"ent_type": np.root.ent_type_,
}
- retokenizer.merge(np, attrs=attrs)
+ retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type]
if options.get("collapse_punct", True):
spans = []
for word in doc[:-1]:
@@ -148,7 +149,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
with doc.retokenize() as retokenizer:
for span, tag, lemma, ent_type in spans:
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
- retokenizer.merge(span, attrs=attrs)
+ retokenizer.merge(span, attrs=attrs) # type: ignore[arg-type]
fine_grained = options.get("fine_grained")
add_lemma = options.get("add_lemma")
words = [
@@ -180,11 +181,19 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate named entities in [{start: i, end: i, label: 'label'}] format.
- doc (Doc): Document do parse.
+ doc (Doc): Document to parse.
+ options (Dict[str, Any]): NER-specific visualisation options.
RETURNS (dict): Generated entities keyed by text (original text) and ents.
"""
+ kb_url_template = options.get("kb_url_template", None)
ents = [
- {"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
+ {
+ "start": ent.start_char,
+ "end": ent.end_char,
+ "label": ent.label_,
+ "kb_id": ent.kb_id_ if ent.kb_id_ else "",
+ "kb_url": kb_url_template.format(ent.kb_id_) if kb_url_template else "#",
+ }
for ent in doc.ents
]
if not ents:
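
parse_ents() now attaches kb_id and kb_url to each entity, driven by the new "kb_url_template" option, so entity visualisations can link out to a knowledge base. A short example; the Wikidata template is just one possible URL pattern:

    # Sketch: entity rendering with KB links; {} is filled with each ent's kb_id_.
    import spacy
    from spacy import displacy

    nlp = spacy.blank("en")
    doc = nlp("Douglas Adams wrote The Hitchhiker's Guide to the Galaxy.")
    doc.ents = [doc.char_span(0, 13, label="PERSON", kb_id="Q42")]
    html = displacy.render(
        doc,
        style="ent",
        options={"kb_url_template": "https://www.wikidata.org/wiki/{}"},
    )
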
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index ba56beca3..a032d843b 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -3,7 +3,7 @@ import uuid
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
-from .templates import TPL_ENTS
+from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors
@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
"LOC": "#ff9561",
"PERSON": "#aa9cfc",
"NORP": "#c887fb",
- "FACILITY": "#9cc9cc",
+ "FAC": "#9cc9cc",
"EVENT": "#ffeb80",
"LAW": "#ff8197",
"LANGUAGE": "#ff8197",
@@ -305,7 +305,7 @@ class EntityRenderer:
"""Render entities in text.
text (str): Original text.
- spans (list): Individual entity spans and their start, end and label.
+ spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
title (str / None): Document title set in Doc.user_data['title'].
"""
markup = ""
@@ -314,6 +314,9 @@ class EntityRenderer:
label = span["label"]
start = span["start"]
end = span["end"]
+ kb_id = span.get("kb_id", "")
+ kb_url = span.get("kb_url", "#")
+ kb_link = TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
additional_params = span.get("params", {})
entity = escape_html(text[start:end])
fragments = text[offset:start].split("\n")
@@ -323,7 +326,12 @@ class EntityRenderer:
markup += ""
if self.ents is None or label.upper() in self.ents:
color = self.colors.get(label.upper(), self.default_color)
- ent_settings = {"label": label, "text": entity, "bg": color}
+ ent_settings = {
+ "label": label,
+ "text": entity,
+ "bg": color,
+ "kb_link": kb_link,
+ }
ent_settings.update(additional_params)
markup += self.ent_template.format(**ent_settings)
else:
diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py
index b9cbf717b..e7d3d4266 100644
--- a/spacy/displacy/templates.py
+++ b/spacy/displacy/templates.py
@@ -51,17 +51,22 @@ TPL_ENTS = """
TPL_ENT = """
{text}
- {label}
+ {label}{kb_link}
"""
TPL_ENT_RTL = """
{text}
- {label}
+ {label}{kb_link}
"""
+# Important: this needs to start with a space!
+TPL_KB_LINK = """
+ {kb_id}
+"""
+
TPL_PAGE = """
diff --git a/spacy/errors.py b/spacy/errors.py
index 2173dd58a..390612123 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,18 +1,13 @@
import warnings
-def add_codes(err_cls):
- """Add error codes to string messages via class attribute names."""
-
- class ErrorsWithCodes(err_cls):
- def __getattribute__(self, code):
- msg = super(ErrorsWithCodes, self).__getattribute__(code)
- if code.startswith("__"): # python system attributes like __class__
- return msg
- else:
- return "[{code}] {msg}".format(code=code, msg=msg)
-
- return ErrorsWithCodes()
+class ErrorsWithCodes(type):
+ def __getattribute__(self, code):
+ msg = super().__getattribute__(code)
+ if code.startswith("__"): # python system attributes like __class__
+ return msg
+ else:
+ return "[{code}] {msg}".format(code=code, msg=msg)
def setup_default_warnings():
@@ -25,7 +20,10 @@ def setup_default_warnings():
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
- filter_warning("once", error_msg="[W108]")
+ filter_warning("once", error_msg=Warnings.W108)
+
+ # floret vector table cannot be modified
+ filter_warning("once", error_msg="[W115]")
def filter_warning(action: str, error_msg: str):
@@ -44,8 +42,7 @@ def _escape_warning_msg(msg):
# fmt: off
-@add_codes
-class Warnings:
+class Warnings(metaclass=ErrorsWithCodes):
W005 = ("Doc object not parsed. This means displaCy won't be able to "
"generate a dependency visualization for it. Make sure the Doc "
"was processed with a model that supports dependency parsing, and "
@@ -116,13 +113,11 @@ class Warnings:
# New warnings added in v3.x
W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
- "'{name}' which is frozen. You can either freeze both, or neither "
- "of the two. If you're sourcing the component from "
- "an existing pipeline, you can use the `replace_listeners` setting in "
- "the config block to replace its token-to-vector listener with a copy "
- "and make it independent. For example, `replace_listeners = "
- "[\"model.tok2vec\"]` See the documentation for details: "
- "https://spacy.io/usage/training#config-components-listeners")
+ "'{name}' which is frozen. If you want to prevent retraining '{name}' "
+ "but want to train '{listener}' on top of it, you should add '{name}' to the "
+ "list of 'annotating_components' in the 'training' block in the config. "
+ "See the documentation for details: "
+ "https://spacy.io/usage/training#annotating-components")
W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' "
"depends on it via a listener and is frozen. This means that the "
"performance of '{listener}' will be degraded. You can either freeze "
@@ -172,8 +167,8 @@ class Warnings:
"call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.")
- W108 = ("The rule-based lemmatizer did not find POS annotation for the "
- "token '{text}'. Check that your pipeline includes components that "
+ W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
+ "more tokens. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.")
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
@@ -192,10 +187,14 @@ class Warnings:
"vectors. This is almost certainly a mistake.")
W113 = ("Sourced component '{name}' may not work as expected: source "
"vectors are not identical to current pipeline vectors.")
+ W114 = ("Using multiprocessing with GPU models is not recommended and may "
+ "lead to errors.")
+ W115 = ("Skipping {method}: the floret vector table cannot be modified. "
+ "Vectors are calculated from character ngrams.")
+ W116 = ("Unable to clean attribute '{attr}'.")
-@add_codes
-class Errors:
+class Errors(metaclass=ErrorsWithCodes):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
"This usually happens when spaCy calls `nlp.{method}` with a custom "
@@ -284,7 +283,7 @@ class Errors:
"you forget to call the `set_extension` method?")
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
- E048 = ("Can't import language {lang} from spacy.lang: {err}")
+ E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
@@ -356,8 +355,8 @@ class Errors:
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
- E100 = ("Nodes other than the anchor node should all contain LEFT_ID, "
- "REL_OP and RIGHT_ID.")
+ E100 = ("Nodes other than the anchor node should all contain {required}, "
+ "but these are missing: {missing}")
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
"have been declared in previous edges.")
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
@@ -518,9 +517,24 @@ class Errors:
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
- E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
+ E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
+ E858 = ("The {mode} vector table does not support this operation. "
+ "{alternative}")
+ E859 = ("The floret vector table cannot be modified.")
+ E860 = ("Can't truncate fasttext-bloom vectors.")
+ E861 = ("No 'keys' should be provided when initializing floret vectors "
+ "with 'minn' and 'maxn'.")
+ E862 = ("'hash_count' must be between 1-4 for floret vectors.")
+ E863 = ("'maxn' must be greater than or equal to 'minn'.")
+ E864 = ("The complete vector table 'data' is required to initialize floret "
+ "vectors.")
+ E865 = ("A SpanGroup is not functional after the corresponding Doc has "
+ "been garbage collected. To keep using the spans, make sure that "
+ "the corresponding Doc object is still available in the scope of "
+ "your function.")
+ E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
@@ -628,7 +642,7 @@ class Errors:
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
- "config.cfg or override it on the CLI?")
+ ".cfg file or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")
@@ -655,7 +669,9 @@ class Errors:
"{nO} - cannot add any more labels.")
E923 = ("It looks like there is no proper sample data to initialize the "
"Model of component '{name}'. To check your input data paths and "
- "annotation, run: python -m spacy debug data config.cfg")
+ "annotation, run: python -m spacy debug data config.cfg "
+ "and include the same config override values you would specify "
+ "for the 'spacy train' command.")
E924 = ("The '{name}' component does not seem to be initialized properly. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
@@ -790,7 +806,7 @@ class Errors:
"to token boundaries.")
E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
"into {values}, but found {value}.")
- E983 = ("Invalid key for '{dict}': {key}. Available keys: "
+ E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
"{keys}")
E984 = ("Invalid component config for '{name}': component block needs either "
"a key `factory` specifying the registered function used to "
@@ -864,7 +880,21 @@ class Errors:
E1018 = ("Knowledge base for component '{name}' is not set. "
"Make sure either `nel.initialize` or `nel.set_kb` "
"is called with a `kb_loader` function.")
-
+ E1019 = ("`noun_chunks` requires part-of-speech tagging, which requires a "
+ "statistical model to be installed and loaded. For more info, see "
+ "the documentation:\nhttps://spacy.io/usage/models")
+ E1020 = ("No `epoch_resume` value specified and could not infer one from "
+ "filename. Specify an epoch to resume from.")
+ E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
+ "Non-UD tags should use the `tag` property.")
+ E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
+ E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
+ "exist.")
+ E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
+ "patterns.")
+ E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
+ "supported values are: 'I', 'O', 'B' and ''")
+
# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
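
The add_codes decorator is replaced by an ErrorsWithCodes metaclass, so the "[code] message" prefixing happens on plain class-attribute access and the attributes stay visible to type checkers. A minimal sketch of the same pattern with made-up messages:

    class WithCodes(type):
        def __getattribute__(cls, code):
            msg = super().__getattribute__(code)
            if code.startswith("__"):  # leave dunder attributes untouched
                return msg
            return f"[{code}] {msg}"

    class Messages(metaclass=WithCodes):
        E001 = "Something went wrong: {detail}"

    print(Messages.E001.format(detail="oops"))  # [E001] Something went wrong: oops
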
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 0dc075ca7..e45704fc5 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -95,6 +95,7 @@ GLOSSARY = {
"XX": "unknown",
"BES": 'auxiliary "be"',
"HVS": 'forms of "have"',
+ "_SP": "whitespace",
# POS Tags (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
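
With "_SP" added to the glossary, spacy.explain can resolve the fine-grained whitespace tag:

    import spacy

    print(spacy.explain("_SP"))  # "whitespace"
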
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index d8514b54c..9a765c8e4 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True
-from typing import Iterator, Iterable
+from typing import Iterator, Iterable, Callable, Dict, Any
import srsly
from cymem.cymem cimport Pool
@@ -96,6 +96,8 @@ cdef class KnowledgeBase:
def initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1)
+
+ def initialize_vectors(self, int64_t nr_entities):
self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases):
@@ -122,7 +124,7 @@ cdef class KnowledgeBase:
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
- def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
+ def add_entity(self, str entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
@@ -154,6 +156,7 @@ cdef class KnowledgeBase:
nr_entities = len(set(entity_list))
self.initialize_entities(nr_entities)
+ self.initialize_vectors(nr_entities)
i = 0
cdef KBEntryC entry
@@ -172,8 +175,8 @@ cdef class KnowledgeBase:
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
- vector_index = self.c_add_vector(entity_vector=vector_list[i])
- entry.vector_index = vector_index
+ self._vectors_table[i] = entity_vector
+ entry.vector_index = i
entry.feats_row = -1 # Features table currently not implemented
@@ -182,15 +185,15 @@ cdef class KnowledgeBase:
i += 1
- def contains_entity(self, unicode entity):
+ def contains_entity(self, str entity):
cdef hash_t entity_hash = self.vocab.strings.add(entity)
return entity_hash in self._entry_index
- def contains_alias(self, unicode alias):
+ def contains_alias(self, str alias):
cdef hash_t alias_hash = self.vocab.strings.add(alias)
return alias_hash in self._alias_index
- def add_alias(self, unicode alias, entities, probabilities):
+ def add_alias(self, str alias, entities, probabilities):
"""
For a given alias, add its potential entities and prior probabilities to the KB.
Return the alias_hash at the end
@@ -236,7 +239,7 @@ cdef class KnowledgeBase:
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
- def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+ def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
"""
For an alias already existing in the KB, extend its potential entities with one more.
Throw a warning if either the alias or the entity is unknown,
@@ -283,7 +286,7 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
- def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
+ def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@@ -304,7 +307,7 @@ cdef class KnowledgeBase:
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0]
- def get_vector(self, unicode entity):
+ def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity]
# Return an empty list if this entity is unknown in this KB
@@ -314,7 +317,7 @@ cdef class KnowledgeBase:
return self._vectors_table[self._entries[entry_index].vector_index]
- def get_prior_prob(self, unicode entity, unicode alias):
+ def get_prior_prob(self, str entity, str alias):
""" Return the prior probability of a given alias being linked to a given entity,
or return 0.0 when this combination is not known in the knowledge base"""
cdef hash_t alias_hash = self.vocab.strings[alias]
@@ -386,6 +389,7 @@ cdef class KnowledgeBase:
nr_aliases = header[1]
entity_vector_length = header[2]
self.initialize_entities(nr_entities)
+ self.initialize_vectors(nr_entities)
self.initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length
@@ -446,7 +450,7 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
- deserialize = {}
+ deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["contents"] = lambda p: self.read_contents(p)
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
util.from_disk(path, deserialize, exclude)
@@ -509,6 +513,7 @@ cdef class KnowledgeBase:
reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities)
+ self.initialize_vectors(nr_entities)
self.entity_vector_length = entity_vector_length
# STEP 1: load entity vectors
@@ -582,7 +587,7 @@ cdef class Writer:
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
- cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+ cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=path))
@@ -624,7 +629,7 @@ cdef class Writer:
cdef class Reader:
def __init__(self, path):
content = bytes(path)
- cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+ cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
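
Entity and vector allocation are now split into initialize_entities() and initialize_vectors(), but the public KnowledgeBase API is unchanged. A hedged sketch of filling a small KB (the entity ID, alias and vector values are made up):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])
    print(kb.contains_entity("Q42"))           # True
    print(kb.get_alias_candidates("Douglas"))  # one Candidate resolving to Q42
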
diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py
index 91917daee..553fcbf4c 100644
--- a/spacy/lang/af/__init__.py
+++ b/spacy/lang/af/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class AfrikaansDefaults(Language.Defaults):
+class AfrikaansDefaults(BaseDefaults):
stop_words = STOP_WORDS
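
All language defaults now subclass BaseDefaults instead of Language.Defaults. The same pattern applies to custom languages; a hedged sketch with a hypothetical language code:

    # "zzz" is a hypothetical code; registering the class via
    # @spacy.registry.languages would additionally be needed for config-based loading.
    from spacy.language import Language, BaseDefaults

    class CustomDefaults(BaseDefaults):
        stop_words = {"foo", "bar"}

    class CustomLanguage(Language):
        lang = "zzz"
        Defaults = CustomDefaults

    nlp = CustomLanguage()
    print(nlp.Defaults.stop_words)
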
diff --git a/spacy/lang/am/__init__.py b/spacy/lang/am/__init__.py
index ed21b55ee..ddae556d6 100644
--- a/spacy/lang/am/__init__.py
+++ b/spacy/lang/am/__init__.py
@@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
-class AmharicDefaults(Language.Defaults):
+class AmharicDefaults(BaseDefaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "am"
diff --git a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py
index 70af12039..555a179fa 100644
--- a/spacy/lang/am/punctuation.py
+++ b/spacy/lang/am/punctuation.py
@@ -1,7 +1,7 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
-_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
+_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
_suffixes = (
_list_punct
diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py
index 6abb65efb..18c1f90ed 100644
--- a/spacy/lang/ar/__init__.py
+++ b/spacy/lang/ar/__init__.py
@@ -2,10 +2,10 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class ArabicDefaults(Language.Defaults):
+class ArabicDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py
index 2937e2ecf..476898364 100644
--- a/spacy/lang/az/__init__.py
+++ b/spacy/lang/az/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class AzerbaijaniDefaults(Language.Defaults):
+class AzerbaijaniDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py
index 6fa539a28..559cc34c4 100644
--- a/spacy/lang/bg/__init__.py
+++ b/spacy/lang/bg/__init__.py
@@ -3,12 +3,12 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
-class BulgarianDefaults(Language.Defaults):
+class BulgarianDefaults(BaseDefaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "bg"
diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py
index aae7692a2..df708b65e 100644
--- a/spacy/lang/bg/stop_words.py
+++ b/spacy/lang/bg/stop_words.py
@@ -1,265 +1,79 @@
-# Source: https://github.com/Alir3z4/stop-words
-
+"""
+References:
+ https://github.com/Alir3z4/stop-words - Original list, serves as a base.
+ https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
+"""
STOP_WORDS = set(
"""
-а
-автентичен
-аз
-ако
-ала
-бе
-без
-беше
-би
-бивш
-бивша
-бившо
-бил
-била
-били
-било
-благодаря
-близо
-бъдат
-бъде
-бяха
-в
-вас
-ваш
-ваша
-вероятно
-вече
-взема
-ви
-вие
-винаги
-внимава
-време
-все
-всеки
-всички
-всичко
-всяка
-във
-въпреки
-върху
-г
-ги
-главен
-главна
-главно
-глас
-го
-година
-години
-годишен
-д
-да
-дали
-два
-двама
-двамата
-две
-двете
-ден
-днес
-дни
-до
-добра
-добре
-добро
-добър
-докато
-докога
-дори
-досега
-доста
-друг
-друга
-други
-е
-евтин
-едва
-един
-една
-еднаква
-еднакви
-еднакъв
-едно
-екип
-ето
-живот
-за
-забавям
-зад
-заедно
-заради
-засега
-заспал
-затова
-защо
-защото
-и
-из
-или
-им
-има
-имат
-иска
-й
-каза
-как
-каква
-какво
-както
-какъв
-като
-кога
-когато
-което
-които
-кой
-който
-колко
-която
-къде
-където
-към
-лесен
-лесно
-ли
-лош
-м
-май
-малко
-ме
-между
-мек
-мен
-месец
-ми
-много
-мнозина
-мога
-могат
-може
-мокър
-моля
-момента
-му
-н
-на
-над
-назад
-най
-направи
-напред
-например
-нас
-не
-него
-нещо
-нея
-ни
-ние
-никой
-нито
-нищо
-но
-нов
-нова
-нови
-новина
-някои
-някой
-няколко
-няма
-обаче
-около
-освен
-особено
-от
-отгоре
-отново
-още
-пак
-по
-повече
-повечето
-под
-поне
-поради
-после
-почти
-прави
-пред
-преди
-през
-при
-пък
-първата
-първи
-първо
-пъти
-равен
-равна
-с
-са
-сам
-само
-се
-сега
-си
-син
-скоро
-след
-следващ
-сме
-смях
-според
-сред
-срещу
-сте
-съм
-със
-също
-т
-тази
-така
-такива
-такъв
-там
-твой
-те
-тези
-ти
-т.н.
-то
-това
-тогава
-този
-той
-толкова
-точно
-три
-трябва
-тук
-тъй
-тя
-тях
-у
-утре
-харесва
-хиляди
-ч
-часа
-че
-често
-чрез
-ще
-щом
+а автентичен аз ако ала
+
+бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
+бъде бъда бяха
+
+в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
+всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
+вътре веднъж
+
+г ги главен главна главно глас го годно година години годишен
+
+д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
+добро добър достатъчно докато докога дори досега доста друг друга другаде други
+
+е евтин едва един една еднаква еднакви еднакъв едно екип ето
+
+живот жив
+
+за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
+затова запазва започвам защо защото завинаги
+
+и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
+извън имайки
+
+й йо
+
+каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
+когато когото което които кой който колко която къде където към край кратък
+кръгъл
+
+лесен лесно ли летя летиш летим лош
+
+м май малко макар малцина междувременно минус ме между мек мен месец ми мис
+мисля много мнозина мога могат може мой можем мокър моля момента му
+
+н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
+например наопаки наполовина напоследък нека независимо нас насам наскоро
+настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
+нови новина някои някой някога някъде няколко няма
+
+о обаче около описан опитах опитва опитвайки опитвам определен определено освен
+обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
+отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
+относно още
+
+п пак по повече повечето под поне просто пряко поради после последен последно
+посочен почти прави прав прави правя пред преди през при пък първата първи първо
+път пъти плюс
+
+равен равна различен различни разумен разумно
+
+с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
+следващ следващия следва следното следователно случва сме смях собствен
+сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
+съответен съответно сте съм със също
+
+т така техен техни такива такъв твърде там трета твой те тези ти то това
+тогава този той търси толкова точно три трябва тук тъй тя тях
+
+у утре ужасно употреба успоредно уточнен уточняване
+
+харесва харесали хиляди
+
+ч часа ценя цяло цялостен че често чрез чудя
+
+ще щеше щом щяха
+
юмрук
-я
-як
+
+я як
""".split()
)
diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py
index 0b7487c64..0f484b778 100644
--- a/spacy/lang/bg/tokenizer_exceptions.py
+++ b/spacy/lang/bg/tokenizer_exceptions.py
@@ -1,10 +1,16 @@
+"""
+References:
+ https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
+ (countries, occupations, fields of study and more).
+"""
+
from ...symbols import ORTH, NORM
_exc = {}
-
-_abbr_exc = [
+# measurements
+for abbr in [
{ORTH: "м", NORM: "метър"},
{ORTH: "мм", NORM: "милиметър"},
{ORTH: "см", NORM: "сантиметър"},
@@ -17,51 +23,191 @@ _abbr_exc = [
{ORTH: "хл", NORM: "хектолитър"},
{ORTH: "дкл", NORM: "декалитър"},
{ORTH: "л", NORM: "литър"},
-]
-for abbr in _abbr_exc:
+]:
_exc[abbr[ORTH]] = [abbr]
-_abbr_line_exc = [
+# hyphenated abbreviations
+for abbr in [
{ORTH: "г-жа", NORM: "госпожа"},
{ORTH: "г-н", NORM: "господин"},
{ORTH: "г-ца", NORM: "госпожица"},
{ORTH: "д-р", NORM: "доктор"},
{ORTH: "о-в", NORM: "остров"},
{ORTH: "п-в", NORM: "полуостров"},
-]
-
-for abbr in _abbr_line_exc:
+ {ORTH: "с-у", NORM: "срещу"},
+ {ORTH: "в-у", NORM: "върху"},
+ {ORTH: "м-у", NORM: "между"},
+]:
_exc[abbr[ORTH]] = [abbr]
-_abbr_dot_exc = [
+# foreign language related abbreviations
+for abbr in [
+ {ORTH: "англ.", NORM: "английски"},
+ {ORTH: "ан.", NORM: "английски термин"},
+ {ORTH: "араб.", NORM: "арабски"},
+ {ORTH: "афр.", NORM: "африкански"},
+ {ORTH: "гр.", NORM: "гръцки"},
+ {ORTH: "лат.", NORM: "латински"},
+ {ORTH: "рим.", NORM: "римски"},
+ {ORTH: "старогр.", NORM: "старогръцки"},
+ {ORTH: "староевр.", NORM: "староеврейски"},
+ {ORTH: "фр.", NORM: "френски"},
+ {ORTH: "хол.", NORM: "холандски"},
+ {ORTH: "швед.", NORM: "шведски"},
+ {ORTH: "шотл.", NORM: "шотландски"},
+ {ORTH: "яп.", NORM: "японски"},
+]:
+ _exc[abbr[ORTH]] = [abbr]
+
+# profession and academic title abbreviations
+for abbr in [
{ORTH: "акад.", NORM: "академик"},
- {ORTH: "ал.", NORM: "алинея"},
{ORTH: "арх.", NORM: "архитект"},
+ {ORTH: "инж.", NORM: "инженер"},
+ {ORTH: "канц.", NORM: "канцлер"},
+ {ORTH: "проф.", NORM: "професор"},
+ {ORTH: "св.", NORM: "свети"},
+]:
+ _exc[abbr[ORTH]] = [abbr]
+
+# fields of study
+for abbr in [
+ {ORTH: "агр.", NORM: "агрономия"},
+ {ORTH: "ав.", NORM: "авиация"},
+ {ORTH: "археол.", NORM: "археология"},
+ {ORTH: "астр.", NORM: "астрономия"},
+ {ORTH: "геод.", NORM: "геодезия"},
+ {ORTH: "геол.", NORM: "геология"},
+ {ORTH: "геом.", NORM: "геометрия"},
+ {ORTH: "гимн.", NORM: "гимнастика"},
+ {ORTH: "грам.", NORM: "граматика"},
+ {ORTH: "жур.", NORM: "журналистика"},
+ {ORTH: "журн.", NORM: "журналистика"},
+ {ORTH: "зем.", NORM: "земеделие"},
+ {ORTH: "икон.", NORM: "икономика"},
+ {ORTH: "лит.", NORM: "литература"},
+ {ORTH: "мат.", NORM: "математика"},
+ {ORTH: "мед.", NORM: "медицина"},
+ {ORTH: "муз.", NORM: "музика"},
+ {ORTH: "печ.", NORM: "печатарство"},
+ {ORTH: "пол.", NORM: "политика"},
+ {ORTH: "псих.", NORM: "психология"},
+ {ORTH: "соц.", NORM: "социология"},
+ {ORTH: "стат.", NORM: "статистика"},
+ {ORTH: "стил.", NORM: "стилистика"},
+ {ORTH: "топогр.", NORM: "топография"},
+ {ORTH: "търг.", NORM: "търговия"},
+ {ORTH: "фарм.", NORM: "фармацевтика"},
+ {ORTH: "фехт.", NORM: "фехтовка"},
+ {ORTH: "физиол.", NORM: "физиология"},
+ {ORTH: "физ.", NORM: "физика"},
+ {ORTH: "фил.", NORM: "философия"},
+ {ORTH: "фин.", NORM: "финанси"},
+ {ORTH: "фолкл.", NORM: "фолклор"},
+ {ORTH: "фон.", NORM: "фонетика"},
+ {ORTH: "фот.", NORM: "фотография"},
+ {ORTH: "футб.", NORM: "футбол"},
+ {ORTH: "хим.", NORM: "химия"},
+ {ORTH: "хир.", NORM: "хирургия"},
+ {ORTH: "ел.", NORM: "електротехника"},
+]:
+ _exc[abbr[ORTH]] = [abbr]
+
+for abbr in [
+ {ORTH: "ал.", NORM: "алинея"},
+ {ORTH: "авт.", NORM: "автоматично"},
+ {ORTH: "адм.", NORM: "администрация"},
+ {ORTH: "арт.", NORM: "артилерия"},
{ORTH: "бл.", NORM: "блок"},
{ORTH: "бр.", NORM: "брой"},
{ORTH: "бул.", NORM: "булевард"},
+ {ORTH: "букв.", NORM: "буквално"},
{ORTH: "в.", NORM: "век"},
+ {ORTH: "вр.", NORM: "време"},
+ {ORTH: "вм.", NORM: "вместо"},
+ {ORTH: "воен.", NORM: "военен термин"},
{ORTH: "г.", NORM: "година"},
{ORTH: "гр.", NORM: "град"},
+ {ORTH: "гл.", NORM: "глагол"},
+ {ORTH: "др.", NORM: "други"},
+ {ORTH: "ез.", NORM: "езеро"},
{ORTH: "ж.р.", NORM: "женски род"},
- {ORTH: "инж.", NORM: "инженер"},
+ {ORTH: "жп.", NORM: "железопът"},
+ {ORTH: "застр.", NORM: "застрахователно дело"},
+ {ORTH: "знач.", NORM: "значение"},
+ {ORTH: "и др.", NORM: "и други"},
+ {ORTH: "и под.", NORM: "и подобни"},
+ {ORTH: "и пр.", NORM: "и прочие"},
+ {ORTH: "изр.", NORM: "изречение"},
+ {ORTH: "изт.", NORM: "източен"},
+ {ORTH: "конкр.", NORM: "конкретно"},
{ORTH: "лв.", NORM: "лев"},
+ {ORTH: "л.", NORM: "лице"},
{ORTH: "м.р.", NORM: "мъжки род"},
- {ORTH: "мат.", NORM: "математика"},
- {ORTH: "мед.", NORM: "медицина"},
+ {ORTH: "мин.вр.", NORM: "минало време"},
+ {ORTH: "мн.ч.", NORM: "множествено число"},
+ {ORTH: "напр.", NORM: "например"},
+ {ORTH: "нар.", NORM: "наречие"},
+ {ORTH: "науч.", NORM: "научен термин"},
+ {ORTH: "непр.", NORM: "неправилно"},
+ {ORTH: "обик.", NORM: "обикновено"},
+ {ORTH: "опред.", NORM: "определение"},
+ {ORTH: "особ.", NORM: "особено"},
+ {ORTH: "ост.", NORM: "остаряло"},
+ {ORTH: "относ.", NORM: "относително"},
+ {ORTH: "отр.", NORM: "отрицателно"},
{ORTH: "пл.", NORM: "площад"},
- {ORTH: "проф.", NORM: "професор"},
+ {ORTH: "пад.", NORM: "падеж"},
+ {ORTH: "парл.", NORM: "парламентарен"},
+ {ORTH: "погов.", NORM: "поговорка"},
+ {ORTH: "пон.", NORM: "понякога"},
+ {ORTH: "правосл.", NORM: "православен"},
+ {ORTH: "прибл.", NORM: "приблизително"},
+ {ORTH: "прил.", NORM: "прилагателно име"},
+ {ORTH: "пр.", NORM: "прочие"},
{ORTH: "с.", NORM: "село"},
{ORTH: "с.р.", NORM: "среден род"},
- {ORTH: "св.", NORM: "свети"},
{ORTH: "сп.", NORM: "списание"},
{ORTH: "стр.", NORM: "страница"},
+ {ORTH: "сз.", NORM: "съюз"},
+ {ORTH: "сег.", NORM: "сегашно"},
+ {ORTH: "сп.", NORM: "спорт"},
+ {ORTH: "срв.", NORM: "сравни"},
+ {ORTH: "с.ст.", NORM: "селскостопанска техника"},
+ {ORTH: "счет.", NORM: "счетоводство"},
+ {ORTH: "съкр.", NORM: "съкратено"},
+ {ORTH: "съобщ.", NORM: "съобщение"},
+ {ORTH: "същ.", NORM: "съществително"},
+ {ORTH: "текст.", NORM: "текстилен"},
+ {ORTH: "телев.", NORM: "телевизия"},
+ {ORTH: "тел.", NORM: "телефон"},
+ {ORTH: "т.е.", NORM: "тоест"},
+ {ORTH: "т.н.", NORM: "така нататък"},
+ {ORTH: "т.нар.", NORM: "така наречен"},
+ {ORTH: "търж.", NORM: "тържествено"},
{ORTH: "ул.", NORM: "улица"},
+ {ORTH: "уч.", NORM: "училище"},
+ {ORTH: "унив.", NORM: "университет"},
+ {ORTH: "харт.", NORM: "хартия"},
+ {ORTH: "хидр.", NORM: "хидравлика"},
+ {ORTH: "хран.", NORM: "хранителна"},
+ {ORTH: "църк.", NORM: "църковен термин"},
+ {ORTH: "числ.", NORM: "числително"},
{ORTH: "чл.", NORM: "член"},
-]
-
-for abbr in _abbr_dot_exc:
+ {ORTH: "ч.", NORM: "число"},
+ {ORTH: "шахм.", NORM: "шахмат"},
+ {ORTH: "шах.", NORM: "шахмат"},
+ {ORTH: "юр.", NORM: "юридически"},
+]:
_exc[abbr[ORTH]] = [abbr]
+# slash abbreviations
+for abbr in [
+ {ORTH: "м/у", NORM: "между"},
+ {ORTH: "с/у", NORM: "срещу"},
+]:
+ _exc[abbr[ORTH]] = [abbr]
TOKENIZER_EXCEPTIONS = _exc
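
Each exception maps the abbreviation to a single-token analysis with a NORM, so the tokenizer should keep e.g. "д-р" as one token and expose the expansion via norm_. A quick check on a blank Bulgarian pipeline:

    from spacy.lang.bg import Bulgarian

    nlp = Bulgarian()
    doc = nlp("д-р Иванов живее на бул. Витоша.")
    print([(t.text, t.norm_) for t in doc])
    # expected to include ("д-р", "доктор") and ("бул.", "булевард")
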
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 23c3ff485..6d0331e00 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,13 +1,13 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer
-class BengaliDefaults(Language.Defaults):
+class BengaliDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
@Bengali.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return Lemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Bengali"]
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
old mode 100644
new mode 100755
index 81f39b13c..a3def660d
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -1,20 +1,21 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
from .lemmatizer import CatalanLemmatizer
-class CatalanDefaults(Language.Defaults):
+class CatalanDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
+ prefixes = TOKENIZER_PREFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
@@ -28,13 +29,25 @@ class Catalan(Language):
@Catalan.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return CatalanLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Catalan"]
diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py
index 2518eb720..2fd012912 100644
--- a/spacy/lang/ca/lemmatizer.py
+++ b/spacy/lang/ca/lemmatizer.py
@@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
- forms = list(set(forms))
+ forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
old mode 100644
new mode 100755
index 39db08f17..8e2f09828
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,4 +1,5 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import LIST_CURRENCY
from ..char_classes import CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+_prefixes = (
+ ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
+ + LIST_PUNCT
+ + LIST_ELLIPSES
+ + LIST_QUOTES
+ + LIST_CURRENCY
+ + LIST_ICONS
+)
_infixes = (
LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+ r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
]
)
@@ -44,3 +54,4 @@ _suffixes = (
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_PREFIXES = _prefixes
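
The new infix pattern targets Catalan enclitic pronouns (-los, -ho, -m', ...), and the prefix list now covers currency and icon characters like the other Romance languages. A rough way to eyeball the effect on a blank pipeline, with no claim about the exact rule keys explain() reports:

    import spacy

    nlp = spacy.blank("ca")
    print([t.text for t in nlp("donar-los-ho")])   # expected roughly: ['donar', '-los', '-ho']
    print(nlp.tokenizer.explain("donar-los-ho"))   # shows which prefix/infix/suffix rule fired
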
diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py
index c70d53e80..917e07c93 100644
--- a/spacy/lang/ca/syntax_iterators.py
+++ b/spacy/lang/ca/syntax_iterators.py
@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN
from ...errors import Errors
-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
old mode 100644
new mode 100755
index 5f9a50f5e..b261b3498
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -18,12 +18,21 @@ for exc_data in [
{ORTH: "nov.", NORM: "novembre"},
{ORTH: "dec.", NORM: "desembre"},
{ORTH: "Dr.", NORM: "doctor"},
+ {ORTH: "Dra.", NORM: "doctora"},
{ORTH: "Sr.", NORM: "senyor"},
{ORTH: "Sra.", NORM: "senyora"},
{ORTH: "Srta.", NORM: "senyoreta"},
{ORTH: "núm", NORM: "número"},
{ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", NORM: "santa"},
+ {ORTH: "pl.", NORM: "plaça"},
+ {ORTH: "à."},
+ {ORTH: "è."},
+ {ORTH: "é."},
+ {ORTH: "í."},
+ {ORTH: "ò."},
+ {ORTH: "ó."},
+ {ORTH: "ú."},
{ORTH: "'l"},
{ORTH: "'ls"},
{ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
]:
_exc[exc_data[ORTH]] = [exc_data]
+_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
+_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
+
+_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
+_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
+
+_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
+_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
+
+_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
+
+
# Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py
index 26f5845cc..3e70e4078 100644
--- a/spacy/lang/cs/__init__.py
+++ b/spacy/lang/cs/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class CzechDefaults(Language.Defaults):
+class CzechDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index c5260ccdd..e148a7b4f 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class DanishDefaults(Language.Defaults):
+class DanishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py
index 39181d753..a0b70f004 100644
--- a/spacy/lang/da/syntax_iterators.py
+++ b/spacy/lang/da/syntax_iterators.py
@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors
-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
def is_verb_token(tok):
return tok.pos in [VERB, AUX]
@@ -32,7 +34,7 @@ def noun_chunks(doclike):
def get_bounds(doc, root):
return get_left_bound(doc, root), get_right_bound(doc, root)
- doc = doclike.doc
+    doc = doclike.doc  # Ensure it works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index b645d3480..65863c098 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class GermanDefaults(Language.Defaults):
+class GermanDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index aba0e8024..e80504998 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself for close apposition and
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index be59a3500..53dd9be8e 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lemmatizer import GreekLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults
-class GreekDefaults(Language.Defaults):
+class GreekDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
@@ -28,13 +28,25 @@ class Greek(Language):
@Greek.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return GreekLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Greek"]
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 89cfd8b72..18fa46695 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# It follows the logic of the noun chunks finder of English language,
# adjusted to some Greek language special characteristics.
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index eea522908..876186979 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults
-class EnglishDefaults(Language.Defaults):
+class EnglishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
@@ -26,13 +26,25 @@ class English(Language):
@English.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return EnglishLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["English"]
diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py
index 2cb0f9a53..c88b69bcc 100644
--- a/spacy/lang/en/lemmatizer.py
+++ b/spacy/lang/en/lemmatizer.py
@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
- univ_pos (unicode / int): The token's universal part-of-speech tag.
+ univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py
index b630a317d..ab9353919 100644
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@@ -19,7 +19,7 @@ _ordinal_words = [
# fmt: on
-def like_num(text: str) -> bool:
+def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 00a1bac42..7904e5621 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index d69508470..55b544e42 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -1,9 +1,10 @@
+from typing import Dict, List
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
-_exc = {}
+_exc: Dict[str, List[Dict]] = {}
_exclude = [
"Ill",
"ill",
@@ -294,9 +295,9 @@ for verb_data in [
{ORTH: "has", NORM: "has"},
{ORTH: "dare", NORM: "dare"},
]:
- verb_data_tc = dict(verb_data)
+ verb_data_tc = dict(verb_data) # type: ignore[call-overload]
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
- for data in [verb_data, verb_data_tc]:
+ for data in [verb_data, verb_data_tc]: # type: ignore[assignment]
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", NORM: "not"},
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 4b329b6f7..e75955202 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
@@ -6,10 +6,10 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import SpanishLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SpanishDefaults(Language.Defaults):
+class SpanishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
@@ -26,13 +26,25 @@ class Spanish(Language):
@Spanish.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return SpanishLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Spanish"]
diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
index 56f74068d..ca5fc08c8 100644
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@@ -52,7 +52,7 @@ class SpanishLemmatizer(Lemmatizer):
rule_pos = "verb"
else:
rule_pos = pos
- rule = self.select_rule(rule_pos, features)
+ rule = self.select_rule(rule_pos, list(features))
index = self.lookups.get_table("lemma_index").get(rule_pos, [])
lemmas = getattr(self, "lemmatize_" + rule_pos)(
string, features, rule, index
@@ -191,6 +191,8 @@ class SpanishLemmatizer(Lemmatizer):
return selected_lemmas
else:
return possible_lemmas
+ else:
+ return []
def lemmatize_noun(
self, word: str, features: List[str], rule: str, index: List[str]
@@ -268,7 +270,7 @@ class SpanishLemmatizer(Lemmatizer):
return [word]
def lemmatize_pron(
- self, word: str, features: List[str], rule: str, index: List[str]
+ self, word: str, features: List[str], rule: Optional[str], index: List[str]
) -> List[str]:
"""
Lemmatize a pronoun.
@@ -319,9 +321,11 @@ class SpanishLemmatizer(Lemmatizer):
return selected_lemmas
else:
return possible_lemmas
+ else:
+ return []
def lemmatize_verb(
- self, word: str, features: List[str], rule: str, index: List[str]
+ self, word: str, features: List[str], rule: Optional[str], index: List[str]
) -> List[str]:
"""
Lemmatize a verb.
@@ -342,6 +346,7 @@ class SpanishLemmatizer(Lemmatizer):
selected_lemmas = []
# Apply lemmatization rules
+ rule = str(rule or "")
for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
possible_lemma = re.sub(old + "$", new, word)
if possible_lemma != word:
@@ -389,11 +394,11 @@ class SpanishLemmatizer(Lemmatizer):
return [word]
def lemmatize_verb_pron(
- self, word: str, features: List[str], rule: str, index: List[str]
+ self, word: str, features: List[str], rule: Optional[str], index: List[str]
) -> List[str]:
# Strip and collect pronouns
pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
- prons = []
+ prons: List[str] = []
verb = word
m = re.search(pron_patt, verb)
while m is not None and len(prons) <= 3:
@@ -410,7 +415,7 @@ class SpanishLemmatizer(Lemmatizer):
else:
rule = self.select_rule("verb", features)
verb_lemma = self.lemmatize_verb(
- verb, features - {"PronType=Prs"}, rule, index
+ verb, features - {"PronType=Prs"}, rule, index # type: ignore[operator]
)[0]
pron_lemmas = []
for pron in prons:
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index e753a3f98..f2ca2a678 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -1,58 +1,76 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-from ...tokens import Doc, Span, Token
+from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
- """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
- doc = doclike.doc
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ labels = [
+ "nsubj",
+ "nsubj:pass",
+ "obj",
+ "obl",
+ "nmod",
+ "pcomp",
+ "appos",
+ "ROOT",
+ ]
+ post_modifiers = ["flat", "fixed", "compound"]
+    doc = doclike.doc  # Ensure it works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
- if not len(doc):
- return
+ np_deps = {doc.vocab.strings.add(label) for label in labels}
+ np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
- left_labels = ["det", "fixed", "neg"] # ['nunmod', 'det', 'appos', 'fixed']
- right_labels = ["flat", "fixed", "compound", "neg"]
- stop_labels = ["punct"]
- np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
- np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
- stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+ adj_label = doc.vocab.strings.add("amod")
+ adp_label = doc.vocab.strings.add("ADP")
+ conj = doc.vocab.strings.add("conj")
+ conj_pos = doc.vocab.strings.add("CCONJ")
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.left_edge.i <= prev_end:
+ continue
+ if word.dep in np_deps:
+ right_childs = list(word.rights)
+ right_child = right_childs[0] if right_childs else None
- prev_right = -1
- for token in doclike:
- if token.pos in [PROPN, NOUN, PRON]:
- left, right = noun_bounds(
- doc, token, np_left_deps, np_right_deps, stop_deps
- )
- if left.i <= prev_right:
- continue
- yield left.i, right.i + 1, np_label
- prev_right = right.i
-
-
-def is_verb_token(token: Token) -> bool:
- return token.pos in [VERB, AUX]
-
-
-def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
- left_bound = root
- for token in reversed(list(root.lefts)):
- if token.dep in np_left_deps:
- left_bound = token
- right_bound = root
- for token in root.rights:
- if token.dep in np_right_deps:
- left, right = noun_bounds(
- doc, token, np_left_deps, np_right_deps, stop_deps
- )
- filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
- if list(filter(filter_func, doc[left_bound.i : right.i])):
- break
+ if right_child:
+ if right_child.dep == adj_label:
+ right_end = right_child.right_edge
+ elif right_child.dep in np_modifs: # Check if we can expand to right
+ right_end = word.right_edge
+ else:
+ right_end = word
else:
- right_bound = right
- return left_bound, right_bound
+ right_end = word
+ prev_end = right_end.i
+
+ left_index = word.left_edge.i
+ left_index = (
+ left_index + 1 if word.left_edge.pos == adp_label else left_index
+ ) # Eliminate left attached de, del
+
+ yield left_index, right_end.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ prev_end = word.i
+
+ left_index = word.left_edge.i # eliminate left attached conjunction
+ left_index = (
+ left_index + 1 if word.left_edge.pos == conj_pos else left_index
+ )
+ yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
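
The Spanish noun_chunks rewrite drops the recursive noun_bounds walk in favour of the same left-to-right scheme used for other languages: take nominal heads whose dep is in np_deps, extend to the right over amod/flat/fixed/compound, trim a leading ADP, and pick up coordinated nominals via conj. A small hand-annotated example of the intended behaviour (annotations invented for illustration, not taken from a treebank):

    from spacy.lang.es import Spanish
    from spacy.tokens import Doc

    nlp = Spanish()
    doc = Doc(nlp.vocab, words=["El", "perro", "negro", "duerme"],
              pos=["DET", "NOUN", "ADJ", "VERB"],
              heads=[1, 3, 1, 3], deps=["det", "nsubj", "amod", "ROOT"])
    print([chunk.text for chunk in doc.noun_chunks])  # should give ['El perro negro']
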
diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py
index 9f71882d2..274bc1309 100644
--- a/spacy/lang/et/__init__.py
+++ b/spacy/lang/et/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class EstonianDefaults(Language.Defaults):
+class EstonianDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py
index 89550be96..3346468bd 100644
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@@ -1,10 +1,10 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class BasqueDefaults(Language.Defaults):
+class BasqueDefaults(BaseDefaults):
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 77a0a28b9..914e4c27d 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,15 +1,15 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer
-class PersianDefaults(Language.Defaults):
+class PersianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
@Persian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return Lemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Persian"]
diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py
index 62094c6de..a6d79a386 100644
--- a/spacy/lang/fa/generate_verbs_exc.py
+++ b/spacy/lang/fa/generate_verbs_exc.py
@@ -639,10 +639,12 @@ for verb_root in verb_roots:
)
if past.startswith("آ"):
- conjugations = set(
- map(
- lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"),
- conjugations,
+ conjugations = list(
+ set(
+ map(
+ lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"),
+ conjugations,
+ )
)
)
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 0be06e73c..8207884b0 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index 9233c6547..86a834170 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class FinnishDefaults(Language.Defaults):
+class FinnishDefaults(BaseDefaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index d69a5a718..27d2a915e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
@@ -9,10 +9,10 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults
-class FrenchDefaults(Language.Defaults):
+class FrenchDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
@@ -31,13 +31,25 @@ class French(Language):
@French.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return FrenchLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["French"]
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index bb5a270ab..c6422cf96 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
- forms = list(set(forms))
+ forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 68117a54d..d86662693 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 6f429eecc..2e88b58cf 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -82,7 +82,8 @@ for orth in [
for verb in [
"a",
- "est" "semble",
+ "est",
+ "semble",
"indique",
"moque",
"passe",
@@ -114,7 +115,7 @@ for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
]
-_infixes_exc = []
+_infixes_exc = [] # type: ignore[var-annotated]
orig_elision = "'"
orig_hyphen = "-"
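
The "est" "semble" fix above is worth spelling out: adjacent string literals in Python are concatenated implicitly, so the old list silently contained "estsemble" and neither verb got its exception. A two-line illustration:

    verbs = ["a", "est" "semble", "indique"]   # missing comma -> implicit concatenation
    assert verbs == ["a", "estsemble", "indique"]
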
diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 80131368b..3be53bc7a 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -1,9 +1,14 @@
+from typing import Optional
+
+from thinc.api import Model
+
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
+from .lemmatizer import IrishLemmatizer
-class IrishDefaults(Language.Defaults):
+class IrishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
@@ -13,4 +18,16 @@ class Irish(Language):
Defaults = IrishDefaults
+@Irish.factory(
+ "lemmatizer",
+ assigns=["token.lemma"],
+ default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+ default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+ nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+ return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
__all__ = ["Irish"]
diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py
deleted file mode 100644
index d606da975..000000000
--- a/spacy/lang/ga/irish_morphology_helpers.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# fmt: off
-consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
-broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
-slender_vowels = ["e", "é", "i", "í"]
-vowels = broad_vowels + slender_vowels
-# fmt: on
-
-
-def ends_dentals(word):
- if word != "" and word[-1] in ["d", "n", "t", "s"]:
- return True
- else:
- return False
-
-
-def devoice(word):
- if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
- return word[:-1] + "t"
- else:
- return word
-
-
-def ends_with_vowel(word):
- return word != "" and word[-1] in vowels
-
-
-def starts_with_vowel(word):
- return word != "" and word[0] in vowels
-
-
-def deduplicate(word):
- if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
- return word[:-1]
- else:
- return word
diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py
new file mode 100644
index 000000000..47aec8fd4
--- /dev/null
+++ b/spacy/lang/ga/lemmatizer.py
@@ -0,0 +1,162 @@
+from typing import List, Dict, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class IrishLemmatizer(Lemmatizer):
+ # This is a lookup-based lemmatiser using data extracted from
+ # BuNaMo (https://github.com/michmech/BuNaMo)
+
+ @classmethod
+ def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+ if mode == "pos_lookup":
+ # fmt: off
+ required = [
+ "lemma_lookup_adj", "lemma_lookup_adp",
+ "lemma_lookup_noun", "lemma_lookup_verb"
+ ]
+ # fmt: on
+ return (required, [])
+ else:
+ return super().get_lookups_config(mode)
+
+ def pos_lookup_lemmatize(self, token: Token) -> List[str]:
+ univ_pos = token.pos_
+ string = unponc(token.text)
+ if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
+ return [string.lower()]
+ demutated = demutate(string)
+ secondary = ""
+ if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
+ secondary = string[1:]
+ lookup_pos = univ_pos.lower()
+ if univ_pos == "PROPN":
+ lookup_pos = "noun"
+ if token.has_morph():
+ # TODO: lookup is actually required for the genitive forms, but
+ # this is not in BuNaMo, and would not be of use with IDT.
+ if univ_pos == "NOUN" and (
+ "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
+ ):
+ hpref = "Form=HPref" in token.morph
+ return [demutate(string, hpref).lower()]
+ elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
+ return [demutate(string).lower()]
+ lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
+
+ def to_list(value):
+ if value is None:
+ value = []
+ elif not isinstance(value, list):
+ value = [value]
+ return value
+
+ if univ_pos == "ADP":
+ return to_list(lookup_table.get(string, string.lower()))
+ ret = []
+ if univ_pos == "PROPN":
+ ret.extend(to_list(lookup_table.get(demutated)))
+ ret.extend(to_list(lookup_table.get(secondary)))
+ else:
+ ret.extend(to_list(lookup_table.get(demutated.lower())))
+ ret.extend(to_list(lookup_table.get(secondary.lower())))
+ if len(ret) == 0:
+ ret = [string.lower()]
+ return ret
+
+
+def demutate(word: str, is_hpref: bool = False) -> str:
+ UVOWELS = "AÁEÉIÍOÓUÚ"
+ LVOWELS = "aáeéiíoóuú"
+ lc = word.lower()
+ # remove eclipsis
+ if lc.startswith("bhf"):
+ word = word[2:]
+ elif lc.startswith("mb"):
+ word = word[1:]
+ elif lc.startswith("gc"):
+ word = word[1:]
+ elif lc.startswith("nd"):
+ word = word[1:]
+ elif lc.startswith("ng"):
+ word = word[1:]
+ elif lc.startswith("bp"):
+ word = word[1:]
+ elif lc.startswith("dt"):
+ word = word[1:]
+ elif word[0:1] == "n" and word[1:2] in UVOWELS:
+ word = word[1:]
+ elif lc.startswith("n-") and word[2:3] in LVOWELS:
+ word = word[2:]
+ # non-standard eclipsis
+ elif lc.startswith("bh-f"):
+ word = word[3:]
+ elif lc.startswith("m-b"):
+ word = word[2:]
+ elif lc.startswith("g-c"):
+ word = word[2:]
+ elif lc.startswith("n-d"):
+ word = word[2:]
+ elif lc.startswith("n-g"):
+ word = word[2:]
+ elif lc.startswith("b-p"):
+ word = word[2:]
+ elif lc.startswith("d-t"):
+ word = word[2:]
+
+ # t-prothesis
+ elif lc.startswith("ts"):
+ word = word[1:]
+ elif lc.startswith("t-s"):
+ word = word[2:]
+
+ # h-prothesis, if known to be present
+ elif is_hpref and word[0:1] == "h":
+ word = word[1:]
+ # h-prothesis, simple case
+ # words can also begin with 'h', but unlike eclipsis,
+ # a hyphen is not used, so that needs to be handled
+ # elsewhere
+ elif word[0:1] == "h" and word[1:2] in UVOWELS:
+ word = word[1:]
+
+ # lenition
+ # this breaks the previous if, to handle super-non-standard
+ # text where both eclipsis and lenition were used.
+ if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
+ word = word[0:1] + word[2:]
+
+ return word
+
+
+def unponc(word: str) -> str:
+ # fmt: off
+ PONC = {
+ "ḃ": "bh",
+ "ċ": "ch",
+ "ḋ": "dh",
+ "ḟ": "fh",
+ "ġ": "gh",
+ "ṁ": "mh",
+ "ṗ": "ph",
+ "ṡ": "sh",
+ "ṫ": "th",
+ "Ḃ": "BH",
+ "Ċ": "CH",
+ "Ḋ": "DH",
+ "Ḟ": "FH",
+ "Ġ": "GH",
+ "Ṁ": "MH",
+ "Ṗ": "PH",
+ "Ṡ": "SH",
+ "Ṫ": "TH"
+ }
+ # fmt: on
+ buf = []
+ for ch in word:
+ if ch in PONC:
+ buf.append(PONC[ch])
+ else:
+ buf.append(ch)
+ return "".join(buf)
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index abf49c511..63af65fe9 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -9,6 +9,8 @@ _exc = {
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
+ "théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
+ "tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
}
for exc_data in [
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
new file mode 100644
index 000000000..e83f0c5a5
--- /dev/null
+++ b/spacy/lang/grc/__init__.py
@@ -0,0 +1,18 @@
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from ...language import Language, BaseDefaults
+
+
+class AncientGreekDefaults(BaseDefaults):
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ lex_attr_getters = LEX_ATTRS
+ stop_words = STOP_WORDS
+
+
+class AncientGreek(Language):
+ lang = "grc"
+ Defaults = AncientGreekDefaults
+
+
+__all__ = ["AncientGreek"]
diff --git a/spacy/lang/grc/examples.py b/spacy/lang/grc/examples.py
new file mode 100644
index 000000000..9c0bcb265
--- /dev/null
+++ b/spacy/lang/grc/examples.py
@@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.grc.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·",
+ "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.",
+ "ὃ μὲν δὴ ἀπόστολος ἐς τὴν Μίλητον ἦν.",
+ "Θρασύβουλος δὲ σαφέως προπεπυσμένος πάντα λόγον καὶ εἰδὼς τὰ Ἀλυάττης μέλλοι ποιήσειν μηχανᾶται τοιάδε.",
+ "φιλόπαις δ' ἦν ἐκμανῶς καὶ Ἀλέξανδρος ὁ βασιλεύς.",
+ "Ἀντίγονος ὁ βασιλεὺς ἐπεκώμαζε τῷ Ζήνωνι",
+ "αὐτὰρ ὃ δεύτατος ἦλθεν ἄναξ ἀνδρῶν Ἀγαμέμνων ἕλκος ἔχων",
+]
diff --git a/spacy/lang/grc/lex_attrs.py b/spacy/lang/grc/lex_attrs.py
new file mode 100644
index 000000000..0ab15e6fd
--- /dev/null
+++ b/spacy/lang/grc/lex_attrs.py
@@ -0,0 +1,314 @@
+from ...attrs import LIKE_NUM
+
+
+_num_words = [
+ # CARDINALS
+ "εἷς",
+ "ἑνός",
+ "ἑνί",
+ "ἕνα",
+ "μία",
+ "μιᾶς",
+ "μιᾷ",
+ "μίαν",
+ "ἕν",
+ "δύο",
+ "δυοῖν",
+ "τρεῖς",
+ "τριῶν",
+ "τρισί",
+ "τρία",
+ "τέτταρες",
+ "τεττάρων",
+ "τέτταρσι",
+ "τέτταρα",
+ "τέτταρας",
+ "πέντε",
+ "ἕξ",
+ "ἑπτά",
+ "ὀκτώ",
+ "ἐννέα",
+ "δέκα",
+ "ἕνδεκα",
+ "δώδεκα",
+ "πεντεκαίδεκα",
+ "ἑκκαίδεκα",
+ "ἑπτακαίδεκα",
+ "ὀκτωκαίδεκα",
+ "ἐννεακαίδεκα",
+ "εἴκοσι",
+ "τριάκοντα",
+ "τετταράκοντα",
+ "πεντήκοντα",
+ "ἑξήκοντα",
+ "ἑβδομήκοντα",
+ "ὀγδοήκοντα",
+ "ἐνενήκοντα",
+ "ἑκατόν",
+ "διακόσιοι",
+ "διακοσίων",
+ "διακοσιᾶν",
+ "διακοσίους",
+ "διακοσίοις",
+ "διακόσια",
+ "διακόσιαι",
+ "διακοσίαις",
+ "διακοσίαισι",
+ "διηκόσιοι",
+ "διηκοσίων",
+ "διηκοσιέων",
+ "διακοσίας",
+ "διηκόσια",
+ "διηκόσιαι",
+ "διηκοσίας",
+ "τριακόσιοι",
+ "τριακοσίων",
+ "τριακοσιᾶν",
+ "τριακοσίους",
+ "τριακοσίοις",
+ "τριακόσια",
+ "τριακόσιαι",
+ "τριακοσίαις",
+ "τριακοσίαισι",
+ "τριακοσιέων",
+ "τριακοσίας",
+ "τριηκόσια",
+ "τριηκοσίας",
+ "τριηκόσιοι",
+ "τριηκοσίοισιν",
+ "τριηκοσίους",
+ "τριηκοσίων",
+ "τετρακόσιοι",
+ "τετρακοσίων",
+ "τετρακοσιᾶν",
+ "τετρακοσίους",
+ "τετρακοσίοις",
+ "τετρακόσια",
+ "τετρακόσιαι",
+ "τετρακοσίαις",
+ "τετρακοσίαισι",
+ "τετρακοσιέων",
+ "τετρακοσίας",
+ "πεντακόσιοι",
+ "πεντακοσίων",
+ "πεντακοσιᾶν",
+ "πεντακοσίους",
+ "πεντακοσίοις",
+ "πεντακόσια",
+ "πεντακόσιαι",
+ "πεντακοσίαις",
+ "πεντακοσίαισι",
+ "πεντακοσιέων",
+ "πεντακοσίας",
+ "ἑξακόσιοι",
+ "ἑξακοσίων",
+ "ἑξακοσιᾶν",
+ "ἑξακοσίους",
+ "ἑξακοσίοις",
+ "ἑξακόσια",
+ "ἑξακόσιαι",
+ "ἑξακοσίαις",
+ "ἑξακοσίαισι",
+ "ἑξακοσιέων",
+ "ἑξακοσίας",
+ "ἑπτακόσιοι",
+ "ἑπτακοσίων",
+ "ἑπτακοσιᾶν",
+ "ἑπτακοσίους",
+ "ἑπτακοσίοις",
+ "ἑπτακόσια",
+ "ἑπτακόσιαι",
+ "ἑπτακοσίαις",
+ "ἑπτακοσίαισι",
+ "ἑπτακοσιέων",
+ "ἑπτακοσίας",
+ "ὀκτακόσιοι",
+ "ὀκτακοσίων",
+ "ὀκτακοσιᾶν",
+ "ὀκτακοσίους",
+ "ὀκτακοσίοις",
+ "ὀκτακόσια",
+ "ὀκτακόσιαι",
+ "ὀκτακοσίαις",
+ "ὀκτακοσίαισι",
+ "ὀκτακοσιέων",
+ "ὀκτακοσίας",
+ "ἐνακόσιοι",
+ "ἐνακοσίων",
+ "ἐνακοσιᾶν",
+ "ἐνακοσίους",
+ "ἐνακοσίοις",
+ "ἐνακόσια",
+ "ἐνακόσιαι",
+ "ἐνακοσίαις",
+ "ἐνακοσίαισι",
+ "ἐνακοσιέων",
+ "ἐνακοσίας",
+ "χίλιοι",
+ "χιλίων",
+ "χιλιῶν",
+ "χιλίους",
+ "χιλίοις",
+ "χίλιαι",
+ "χιλίας",
+ "χιλίαις",
+ "χίλια",
+ "χίλι",
+ "δισχίλιοι",
+ "δισχιλίων",
+ "δισχιλιῶν",
+ "δισχιλίους",
+ "δισχιλίοις",
+ "δισχίλιαι",
+ "δισχιλίας",
+ "δισχιλίαις",
+ "δισχίλια",
+ "δισχίλι",
+ "τρισχίλιοι",
+ "τρισχιλίων",
+ "τρισχιλιῶν",
+ "τρισχιλίους",
+ "τρισχιλίοις",
+ "τρισχίλιαι",
+ "τρισχιλίας",
+ "τρισχιλίαις",
+ "τρισχίλια",
+ "τρισχίλι",
+ "μύριοι",
+ "μύριοί",
+ "μυρίων",
+ "μυρίοις",
+ "μυρίους",
+ "μύριαι",
+ "μυρίαις",
+ "μυρίας",
+ "μύρια",
+ "δισμύριοι",
+ "δισμύριοί",
+ "δισμυρίων",
+ "δισμυρίοις",
+ "δισμυρίους",
+ "δισμύριαι",
+ "δισμυρίαις",
+ "δισμυρίας",
+ "δισμύρια",
+ "δεκακισμύριοι",
+ "δεκακισμύριοί",
+ "δεκακισμυρίων",
+ "δεκακισμυρίοις",
+ "δεκακισμυρίους",
+ "δεκακισμύριαι",
+ "δεκακισμυρίαις",
+ "δεκακισμυρίας",
+ "δεκακισμύρια",
+ # ANCIENT GREEK NUMBERS (1-100)
+ "α",
+ "β",
+ "γ",
+ "δ",
+ "ε",
+ "ϛ",
+ "ζ",
+ "η",
+ "θ",
+ "ι",
+ "ια",
+ "ιβ",
+ "ιγ",
+ "ιδ",
+ "ιε",
+ "ιϛ",
+ "ιζ",
+ "ιη",
+ "ιθ",
+ "κ",
+ "κα",
+ "κβ",
+ "κγ",
+ "κδ",
+ "κε",
+ "κϛ",
+ "κζ",
+ "κη",
+ "κθ",
+ "λ",
+ "λα",
+ "λβ",
+ "λγ",
+ "λδ",
+ "λε",
+ "λϛ",
+ "λζ",
+ "λη",
+ "λθ",
+ "μ",
+ "μα",
+ "μβ",
+ "μγ",
+ "μδ",
+ "με",
+ "μϛ",
+ "μζ",
+ "μη",
+ "μθ",
+ "ν",
+ "να",
+ "νβ",
+ "νγ",
+ "νδ",
+ "νε",
+ "νϛ",
+ "νζ",
+ "νη",
+ "νθ",
+ "ξ",
+ "ξα",
+ "ξβ",
+ "ξγ",
+ "ξδ",
+ "ξε",
+ "ξϛ",
+ "ξζ",
+ "ξη",
+ "ξθ",
+ "ο",
+ "οα",
+ "οβ",
+ "ογ",
+ "οδ",
+ "οε",
+ "οϛ",
+ "οζ",
+ "οη",
+ "οθ",
+ "π",
+ "πα",
+ "πβ",
+ "πγ",
+ "πδ",
+ "πε",
+ "πϛ",
+ "πζ",
+ "πη",
+ "πθ",
+ "ϟ",
+ "ϟα",
+ "ϟβ",
+ "ϟγ",
+ "ϟδ",
+ "ϟε",
+ "ϟϛ",
+ "ϟζ",
+ "ϟη",
+ "ϟθ",
+ "ρ",
+]
+
+
+def like_num(text):
+ if text.lower() in _num_words:
+ return True
+ return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/grc/stop_words.py b/spacy/lang/grc/stop_words.py
new file mode 100644
index 000000000..cbb766a8c
--- /dev/null
+++ b/spacy/lang/grc/stop_words.py
@@ -0,0 +1,61 @@
+STOP_WORDS = set(
+ """
+αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ
+αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς
+αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν
+
+γε γ' γέ γὰρ γάρ δαῖτα δαιτὸς δαιτὶ δαὶ δαιτί δαῖτ' δαΐδας δαΐδων δἰ διὰ διά δὲ δ' δέ δὴ δή εἰ εἴ κεἰ κεἴ αἴ αἲ εἲ αἰ
+
+ἐστί ἐστιν ὢν ἦν ἐστὶν ὦσιν εἶναι ὄντι εἰσιν ἐστι ὄντα οὖσαν ἦσαν ἔστι ὄντας ἐστὲ εἰσὶ εἶ ὤν ἦ οὖσαι ἔσται ἐσμὲν ἐστ' ἐστίν ἔστ' ὦ ἔσει ἦμεν εἰμι εἰσὶν ἦσθ'
+ἐστὶ ᾖ οὖσ' ἔστιν εἰμὶ εἴμ' ἐσθ' ᾖς στί εἴην εἶναί οὖσα κἄστ' εἴη ἦσθα εἰμ' ἔστω ὄντ' ἔσθ' ἔμμεναι ἔω ἐὼν ἐσσι ἔσσεται ἐστὸν ἔσαν ἔστων ἐόντα ἦεν ἐοῦσαν ἔην
+ἔσσομαι εἰσί ἐστόν ἔσκεν ἐόντ' ἐών ἔσσεσθ' εἰσ' ἐόντες ἐόντε ἐσσεῖται εἰμεν ἔασιν ἔσκε ἔμεναι ἔσεσθαι ἔῃ εἰμὲν εἰσι ἐόντας ἔστε εἰς ἦτε εἰμί ἔσσεαι ἔμμεν
+ἐοῦσα ἔμεν ᾖσιν ἐστε ἐόντι εἶεν ἔσσονται ἔησθα ἔσεσθε ἐσσί ἐοῦσ' ἔασι ἔα ἦα ἐόν ἔσσεσθαι ἔσομαι ἔσκον εἴης ἔωσιν εἴησαν ἐὸν ἐουσέων ἔσσῃ ἐούσης ἔσονται
+ἐούσας ἐόντων ἐόντος ἐσομένην ἔστωσαν ἔωσι ἔας ἐοῦσαι ἣν εἰσίν ἤστην ὄντες ὄντων οὔσας οὔσαις ὄντος οὖσι οὔσης ἔσῃ ὂν ἐσμεν ἐσμέν οὖσιν ἐσομένους ἐσσόμεσθα
+
+ἒς ἐς ἔς ἐν κεἰς εἲς κἀν ἔν κατὰ κατ' καθ' κατά κάτα κὰπ κὰκ κὰδ κὰρ κάρ κὰγ κὰμ καὶ καί μετὰ μεθ' μετ' μέτα μετά μέθ' μέτ' μὲν μέν μὴ
+
+μή μη οὐκ οὒ οὐ οὐχ οὐχὶ κοὐ κοὐχ οὔ κοὐκ οὐχί οὐκὶ οὐδὲν οὐδεὶς οὐδέν κοὐδεὶς κοὐδὲν οὐδένα οὐδενὸς οὐδέν' οὐδενός οὐδενὶ
+οὐδεμία οὐδείς οὐδεμίαν οὐδὲ οὐδ' κοὐδ' οὐδέ οὔτε οὔθ' οὔτέ τε οὔτ' οὕτως οὕτω οὕτῶ χοὔτως οὖν ὦν ὧν τοῦτο τοῦθ' τοῦτον τούτῳ
+τούτοις ταύτας αὕτη ταῦτα οὗτος ταύτης ταύτην τούτων ταῦτ' τοῦτ' τούτου αὗται τούτους τοῦτό ταῦτά τούτοισι χαὔτη ταῦθ' χοὖτοι
+τούτοισιν οὗτός οὗτοι τούτω τουτέων τοῦτὸν οὗτοί τοῦτου οὗτοὶ ταύτῃσι ταύταις ταυτὶ παρὰ παρ' πάρα παρά πὰρ παραὶ πάρ' περὶ
+πέρι περί πρὸς πρός ποτ' ποτὶ προτὶ προτί πότι
+
+σὸς σήν σὴν σὸν σόν σὰ σῶν σοῖσιν σός σῆς σῷ σαῖς σῇ σοῖς σοῦ σ' σὰν σά σὴ σὰς
+σᾷ σοὺς σούς σοῖσι σῇς σῇσι σή σῇσιν σοὶ σου ὑμεῖς σὲ σύ σοι ὑμᾶς ὑμῶν ὑμῖν σε
+σέ σὺ σέθεν σοί ὑμὶν σφῷν ὑμίν τοι τοὶ σφὼ ὔμμ' σφῶϊ σεῖο τ' σφῶϊν ὔμμιν σέο σευ σεῦ
+ὔμμι ὑμέων τύνη ὑμείων τοί ὔμμες σεο τέ τεοῖο ὑμέας σὺν ξὺν σύν
+
+θ' τί τι τις τινες τινα τινος τινὸς τινὶ τινῶν τίς τίνες τινὰς τιν' τῳ του τίνα τοῦ τῷ τινί τινά τίνος τινι τινας τινὰ τινων
+τίν' τευ τέο τινές τεο τινὲς τεῷ τέῳ τινός τεῳ τισὶ
+
+τοιαῦτα τοιοῦτον τοιοῦθ' τοιοῦτος τοιαύτην τοιαῦτ' τοιούτου τοιαῦθ' τοιαύτῃ τοιούτοις τοιαῦται τοιαῦτά τοιαύτη τοιοῦτοι τοιούτων τοιούτοισι
+τοιοῦτο τοιούτους τοιούτῳ τοιαύτης τοιαύταις τοιαύτας τοιοῦτός τίνι τοῖσι τίνων τέων τέοισί τὰ τῇ τώ τὼ
+
+ἀλλὰ ἀλλ' ἀλλά ἀπ' ἀπὸ κἀπ' ἀφ' τἀπὸ κἀφ' ἄπο ἀπό τὠπὸ τἀπ' ἄλλων ἄλλῳ ἄλλη ἄλλης ἄλλους ἄλλοις ἄλλον ἄλλο ἄλλου τἄλλα ἄλλα
+ἄλλᾳ ἄλλοισιν τἄλλ' ἄλλ' ἄλλος ἄλλοισι κἄλλ' ἄλλοι ἄλλῃσι ἄλλόν ἄλλην ἄλλά ἄλλαι ἄλλοισίν ὧλλοι ἄλλῃ ἄλλας ἀλλέων τἆλλα ἄλλως
+ἀλλάων ἄλλαις τἆλλ'
+
+ἂν ἄν κἂν τἂν ἃν κεν κ' κέν κέ κε χ' ἄρα τἄρα ἄρ' τἄρ' ἄρ ῥα ῥά ῥ τὰρ ἄρά ἂρ
+
+ἡμᾶς με ἐγὼ ἐμὲ μοι κἀγὼ ἡμῶν ἡμεῖς ἐμοὶ ἔγωγ' ἁμοὶ ἡμῖν μ' ἔγωγέ ἐγώ ἐμοί ἐμοῦ κἀμοῦ ἔμ' κἀμὲ ἡμὶν μου ἐμέ ἔγωγε νῷν νὼ χἠμεῖς ἁμὲ κἀγώ κἀμοὶ χἠμᾶς
+ἁγὼ ἡμίν κἄμ' ἔμοιγ' μοί τοὐμὲ ἄμμε ἐγὼν ἐμεῦ ἐμεῖο μευ ἔμοιγε ἄμμι μέ ἡμέας νῶϊ ἄμμιν ἧμιν ἐγών νῶΐ ἐμέθεν ἥμιν ἄμμες νῶι ἡμείων ἄμμ' ἡμέων ἐμέο
+ἐκ ἔκ ἐξ κἀκ κ ἃκ κἀξ ἔξ εξ Ἐκ τἀμὰ ἐμοῖς τοὐμόν ἐμᾶς τοὐμὸν ἐμῶν ἐμὸς ἐμῆς ἐμῷ τὠμῷ ἐμὸν τἄμ' ἐμὴ ἐμὰς ἐμαῖς ἐμὴν ἐμόν ἐμὰ ἐμός ἐμοὺς ἐμῇ ἐμᾷ
+οὑμὸς ἐμοῖν οὑμός κἀμὸν ἐμαὶ ἐμή ἐμάς ἐμοῖσι ἐμοῖσιν ἐμῇσιν ἐμῇσι ἐμῇς ἐμήν
+
+ἔνι ἐνὶ εἰνὶ εἰν ἐμ ἐπὶ ἐπ' ἔπι ἐφ' κἀπὶ τἀπὶ ἐπί ἔφ' ἔπ' ἐὰν ἢν ἐάν ἤν ἄνπερ
+
+αὑτοῖς αὑτὸν αὑτῷ ἑαυτοῦ αὑτόν αὑτῆς αὑτῶν αὑτοῦ αὑτὴν αὑτοῖν χαὐτοῦ αὑταῖς ἑωυτοῦ ἑωυτῇ ἑωυτὸν ἐωυτῷ ἑωυτῆς ἑωυτόν ἑωυτῷ
+ἑωυτάς ἑωυτῶν ἑωυτοὺς ἑωυτοῖσι ἑαυτῇ ἑαυτούς αὑτοὺς ἑαυτῶν ἑαυτοὺς ἑαυτὸν ἑαυτῷ ἑαυτοῖς ἑαυτὴν ἑαυτῆς
+
+ἔτι ἔτ' ἔθ' κἄτι ἢ ἤ ἠέ ἠὲ ἦε ἦέ ἡ τοὺς τὴν τὸ τῶν τὸν ὁ ἁ οἱ τοῖς ταῖς τῆς τὰς αἱ τό τὰν τᾶς τοῖσιν αἳ χὠ τήν τά τοῖν τάς ὅ
+χοἰ ἣ ἥ χἠ τάν τᾶν ὃ οἳ οἵ τοῖο τόν τοῖιν τούς τάων ταὶ τῇς τῇσι τῇσιν αἵ τοῖό τοῖσίν ὅττί ταί Τὴν τῆ τῶ τάδε ὅδε τοῦδε τόδε τόνδ'
+τάδ' τῆσδε τῷδε ὅδ' τῶνδ' τῇδ' τοῦδέ τῶνδε τόνδε τόδ' τοῦδ' τάσδε τήνδε τάσδ' τήνδ' ταῖσδέ τῇδε τῆσδ' τάνδ' τῷδ' τάνδε ἅδε τοῖσδ' ἥδ'
+τᾷδέ τοῖσδε τούσδ' ἥδε τούσδε τώδ' ἅδ' οἵδ' τῶνδέ οἵδε τᾷδε τοῖσδεσσι τώδε τῇδέ τοῖσιδε αἵδε τοῦδὲ τῆδ' αἵδ' τοῖσδεσι ὃν ἃ ὃς ᾧ οὗ ἅπερ
+οὓς ἧς οἷς ἅσπερ ᾗ ἅ χὦνπερ ὣ αἷς ᾇ ὅς ἥπερ ἃς ὅσπερ ὅνπερ ὧνπερ ᾧπερ ὅν αἷν οἷσι ἇς ἅς ὥ οὕς ἥν οἷσιν ἕης ὅου ᾗς οἷσί οἷσίν τοῖσί ᾗσιν οἵπερ αἷσπερ
+ὅστις ἥτις ὅτου ὅτοισι ἥντιν' ὅτῳ ὅντιν' ὅττι ἅσσά ὅτεῳ ὅτις ὅτιν' ὅτευ ἥντινα αἵτινές ὅντινα ἅσσα ᾧτινι οἵτινες ὅτι ἅτις ὅτ' ὑμὴ
+ὑμήν ὑμὸν ὑπὲρ ὕπερ ὑπέρτερον ὑπεὶρ ὑπέρτατος ὑπὸ ὑπ' ὑφ' ὕπο ὑπαὶ ὑπό ὕπ' ὕφ'
+
+ ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ
+
+ """.split()
+)
diff --git a/spacy/lang/grc/tokenizer_exceptions.py b/spacy/lang/grc/tokenizer_exceptions.py
new file mode 100644
index 000000000..bcee70f32
--- /dev/null
+++ b/spacy/lang/grc/tokenizer_exceptions.py
@@ -0,0 +1,111 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
+
+_exc = {}
+
+for token in ["᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ"]:
+ _exc[token] = [{ORTH: token, NORM: "από"}]
+
+for token in ["᾽Αλλ'", "ἀλλ'", "ἀλλὰ"]:
+ _exc[token] = [{ORTH: token, NORM: "ἀλλά"}]
+
+for token in ["παρ'", "Παρ'", "παρὰ", "παρ"]:
+ _exc[token] = [{ORTH: token, NORM: "παρά"}]
+
+for token in ["καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ"]:
+ _exc[token] = [{ORTH: token, NORM: "κατά"}]
+
+for token in ["Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'"]:
+ _exc[token] = [{ORTH: token, NORM: "επί"}]
+
+for token in ["Δι'", "δι'", "διὰ"]:
+ _exc[token] = [{ORTH: token, NORM: "διά"}]
+
+for token in ["Ὑπ'", "ὑπ'", "ὑφ'"]:
+ _exc[token] = [{ORTH: token, NORM: "ὑπό"}]
+
+for token in ["Μετ'", "μετ'", "μεθ'", "μετὰ"]:
+ _exc[token] = [{ORTH: token, NORM: "μετά"}]
+
+for token in ["Μ'", "μ'", "μέ", "μὲ"]:
+ _exc[token] = [{ORTH: token, NORM: "με"}]
+
+for token in ["Σ'", "σ'", "σέ", "σὲ"]:
+ _exc[token] = [{ORTH: token, NORM: "σε"}]
+
+for token in ["Τ'", "τ'", "τέ", "τὲ"]:
+ _exc[token] = [{ORTH: token, NORM: "τε"}]
+
+for token in ["Δ'", "δ'", "δὲ"]:
+ _exc[token] = [{ORTH: token, NORM: "δέ"}]
+
+
+_other_exc = {
+ "μὲν": [{ORTH: "μὲν", NORM: "μέν"}],
+ "μὴν": [{ORTH: "μὴν", NORM: "μήν"}],
+ "τὴν": [{ORTH: "τὴν", NORM: "τήν"}],
+ "τὸν": [{ORTH: "τὸν", NORM: "τόν"}],
+ "καὶ": [{ORTH: "καὶ", NORM: "καί"}],
+ "καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}],
+ "καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}],
+ "κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}],
+ "χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἡ"}],
+ "χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}],
+ "χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}],
+ "κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}],
+ "κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
+ "κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}],
+ "ἁγώ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γώ", NORM: "ἐγώ"}],
+ "ἁγὼ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
+ "ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}],
+ "ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}],
+ "κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}],
+ "κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}],
+ "κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}],
+ "κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}],
+ "χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}],
+ "χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}],
+ "χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}],
+ "χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}],
+ "οὑμός": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μός", NORM: "ἐμός"}],
+ "οὑμὸς": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μὸς", NORM: "ἐμός"}],
+ "οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμoί"}],
+ "οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμoί"}],
+ "σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}],
+ "σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}],
+ "σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}],
+ "μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
+ "μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
+ "τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}],
+ "οὑν": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "ν", NORM: "ἐν"}],
+ "ὦνερ": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νερ", NORM: "ἄνερ"}],
+ "ὦνδρες": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
+ "προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}],
+ "προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}],
+ "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
+ "θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}],
+ "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
+ "τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}],
+ "θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}],
+ "ἅνθρωπος": [{ORTH: "ἅ", NORM: "ὁ"}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}],
+ "τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: "ἄλλα"}],
+ "τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}],
+ "ἁνήρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νήρ", NORM: "ἀνήρ"}],
+ "ἁνὴρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νὴρ", NORM: "ἀνήρ"}],
+ "ἅνδρες": [{ORTH: "ἅ", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
+ "ἁγαθαί": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}],
+ "ἁγαθαὶ": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}],
+ "ἁλήθεια": [{ORTH: "ἁ", NORM: "ἡ"}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}],
+ "τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}],
+ "τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}],
+ "τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}],
+ "τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}],
+ "αὑτός": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τός", NORM: "αὐτός"}],
+ "αὑτὸς": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τὸς", NORM: "αὐτός"}],
+ "ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}],
+}
+
+_exc.update(_other_exc)
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
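
Most of these entries either attach a normalized form to an accented variant or split crasis forms (καί fused with a following word) back into two tokens with their underlying norms. A small sketch of what the table above should give, assuming the new grc language is registered as in this patch (example text invented):

    import spacy

    nlp = spacy.blank("grc")
    doc = nlp("κἀγὼ καὶ σὺ")
    print([(t.text, t.norm_) for t in doc])
    # κἀγὼ should split into κἀ/γὼ with norms καί/ἐγώ,
    # and καὶ should stay one token with norm καί.
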
diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py
index 67228ac40..e6fbc9d18 100644
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class GujaratiDefaults(Language.Defaults):
+class GujaratiDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index e0adc3293..dd2ee478d 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class HebrewDefaults(Language.Defaults):
+class HebrewDefaults(BaseDefaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
index 384f040c8..4c8ae446d 100644
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class HindiDefaults(Language.Defaults):
+class HindiDefaults(BaseDefaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index a18c2e513..ee845e8b1 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -90,7 +90,7 @@ _eleven_to_beyond = [
"अड़सठ",
"उनहत्तर",
"सत्तर",
- "इकहत्तर"
+ "इकहत्तर",
"बहत्तर",
"तिहत्तर",
"चौहत्तर",
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 118e0946a..30870b522 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class CroatianDefaults(Language.Defaults):
+class CroatianDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 8962603a6..9426bacea 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -1,10 +1,10 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class HungarianDefaults(Language.Defaults):
+class HungarianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index 4a64a1d2c..ffaa74f50 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
)
+for u in "cfkCFK":
+ _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+ _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
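A quick sketch (assuming spaCy is installed; the loop mirrors the Hungarian exception loop added above) of what the new degree-sign entries expand to — the unit stays glued to the degree sign and a trailing period is split off:

    from spacy.symbols import ORTH

    _exc = {}
    for u in "cfkCFK":
        _exc[f"°{u}"] = [{ORTH: f"°{u}"}]                # "°C" stays one token
        _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]  # "°C." -> "°C" + "."

    print(sorted(_exc))
    # ['°C', '°C.', '°F', '°F.', '°K', '°K.', '°c', '°c.', '°f', '°f.', '°k', '°k.']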
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 4577ab641..481eaae0a 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class ArmenianDefaults(Language.Defaults):
+class ArmenianDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 87373551c..0d72cfa9d 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIX
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class IndonesianDefaults(Language.Defaults):
+class IndonesianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 0f29bfe16..fa984d411 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py
index be5de5981..318363beb 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/is/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class IcelandicDefaults(Language.Defaults):
+class IcelandicDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 672a8698e..1edebc837 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -1,14 +1,14 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
from .lemmatizer import ItalianLemmatizer
-class ItalianDefaults(Language.Defaults):
+class ItalianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
@@ -23,13 +23,25 @@ class Italian(Language):
@Italian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "pos_lookup",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return ItalianLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Italian"]
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 4e6bf9d3c..bf86305fb 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -1,21 +1,25 @@
-from typing import Optional, Union, Dict, Any
+from typing import Optional, Union, Dict, Any, Callable
from pathlib import Path
import srsly
from collections import namedtuple
+from thinc.api import Model
+import re
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
from ...errors import Errors
-from ...language import Language
+from ...language import Language, BaseDefaults
+from ...pipeline import Morphologizer
+from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer
from ...symbols import POS
-from ...tokens import Doc
+from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
from ... import util
@@ -31,16 +35,21 @@ split_mode = null
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
- return JapaneseTokenizer(nlp, split_mode=split_mode)
+ return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
return japanese_tokenizer_factory
class JapaneseTokenizer(DummyTokenizer):
- def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
- self.vocab = nlp.vocab
+ def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
+ self.vocab = vocab
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)
+ # if we're using split mode A we don't need subtokens
+ self.need_subtokens = not (split_mode is None or split_mode == "A")
+
+ def __reduce__(self):
+ return JapaneseTokenizer, (self.vocab, self.split_mode)
def __call__(self, text: str) -> Doc:
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
@@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer):
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
- words, tags, inflections, lemmas, readings, sub_tokens_list = (
- zip(*dtokens) if dtokens else [[]] * 6
+ words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
+ zip(*dtokens) if dtokens else [[]] * 7
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
@@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer):
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
- doc.user_data["inflections"] = inflections
- doc.user_data["reading_forms"] = readings
- doc.user_data["sub_tokens"] = sub_tokens_list
+ morph = {}
+ if dtoken.inf:
+ # it's normal for this to be empty for non-inflecting types
+ morph["Inflection"] = dtoken.inf
+ token.norm_ = dtoken.norm
+ if dtoken.reading:
+ # punctuation is its own reading, but we don't want values like
+ # "=" here
+ morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
+ token.morph = MorphAnalysis(self.vocab, morph)
+ if self.need_subtokens:
+ doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
@@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer):
DetailedToken(
token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
- ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
+ ";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
- token.reading_form(), # user_data['reading_forms']
+ token.normalized_form(),
+ token.reading_form(),
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
@@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer):
]
def _get_sub_tokens(self, sudachipy_tokens):
- if (
- self.split_mode is None or self.split_mode == "A"
- ): # do nothing for default split mode
+ # do nothing for default split mode
+ if not self.need_subtokens:
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
@@ -154,7 +172,7 @@ class JapaneseTokenizer(DummyTokenizer):
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
path = util.ensure_path(path)
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
- return util.to_disk(path, serializers, [])
+ util.to_disk(path, serializers, [])
def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
path = util.ensure_path(path)
@@ -164,7 +182,7 @@ class JapaneseTokenizer(DummyTokenizer):
return self
-class JapaneseDefaults(Language.Defaults):
+class JapaneseDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
@@ -176,9 +194,37 @@ class Japanese(Language):
Defaults = JapaneseDefaults
+@Japanese.factory(
+ "morphologizer",
+ assigns=["token.morph", "token.pos"],
+ default_config={
+ "model": DEFAULT_MORPH_MODEL,
+ "overwrite": True,
+ "extend": True,
+ "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+ },
+ default_score_weights={
+ "pos_acc": 0.5,
+ "morph_micro_f": 0.5,
+ "morph_per_feat": None,
+ },
+)
+def make_morphologizer(
+ nlp: Language,
+ model: Model,
+ name: str,
+ overwrite: bool,
+ extend: bool,
+ scorer: Optional[Callable],
+):
+ return Morphologizer(
+ nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer
+ )
+
+
# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
- "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
+ "DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
)
@@ -254,7 +300,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
- text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)]
+ text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
text_spaces = [False]
return text_dtokens, text_spaces
@@ -271,7 +317,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# space token
if word_start > 0:
w = text[text_pos : text_pos + word_start]
- text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
+ text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False)
text_pos += word_start
@@ -287,16 +333,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# trailing space token
if text_pos < len(text):
w = text[text_pos:]
- text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
+ text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False)
return text_dtokens, text_spaces
-def pickle_japanese(instance):
- return Japanese, tuple()
-
-
-copy_reg.pickle(Japanese, pickle_japanese)
-
__all__ = ["Japanese"]
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index cca4902ab..588a9ba03 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -1,4 +1,4 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple, Set
from ...symbols import NOUN, PROPN, PRON, VERB
from ...tokens import Doc, Span
@@ -10,13 +10,13 @@ labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclik
# fmt: on
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels]
doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ seen: Set[int] = set()
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py
index 8e53989e6..ccd46a394 100644
--- a/spacy/lang/kn/__init__.py
+++ b/spacy/lang/kn/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class KannadaDefaults(Language.Defaults):
+class KannadaDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 83c9f4962..05fc67e79 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,15 +1,15 @@
-from typing import Optional, Any, Dict
+from typing import Iterator, Any, Dict
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...tokens import Doc
-from ...compat import copy_reg
from ...scorer import Scorer
from ...symbols import POS
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
DEFAULT_CONFIG = """
@@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def korean_tokenizer_factory(nlp):
- return KoreanTokenizer(nlp)
+ return KoreanTokenizer(nlp.vocab)
return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer):
- def __init__(self, nlp: Optional[Language] = None):
- self.vocab = nlp.vocab
- MeCab = try_mecab_import()
+ def __init__(self, vocab: Vocab):
+ self.vocab = vocab
+ MeCab = try_mecab_import() # type: ignore[func-returns-value]
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
+ def __reduce__(self):
+ return KoreanTokenizer, (self.vocab,)
+
def __del__(self):
self.mecab_tokenizer.__del__()
@@ -49,7 +52,7 @@ class KoreanTokenizer(DummyTokenizer):
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc
- def detailed_tokens(self, text: str) -> Dict[str, Any]:
+ def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
@@ -68,7 +71,7 @@ class KoreanTokenizer(DummyTokenizer):
return Scorer.score_tokenization(examples)
-class KoreanDefaults(Language.Defaults):
+class KoreanDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
@@ -106,10 +109,4 @@ def check_spaces(text, tokens):
yield False
-def pickle_korean(instance):
- return Korean, tuple()
-
-
-copy_reg.pickle(Korean, pickle_korean)
-
__all__ = ["Korean"]
diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
index a333db035..ccca384bd 100644
--- a/spacy/lang/ky/__init__.py
+++ b/spacy/lang/ky/__init__.py
@@ -2,10 +2,10 @@ from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class KyrgyzDefaults(Language.Defaults):
+class KyrgyzDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index da6fe55d7..7827e7762 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class LuxembourgishDefaults(Language.Defaults):
+class LuxembourgishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 12016c273..6ed981a06 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -3,6 +3,7 @@ import unicodedata
import re
from .. import attrs
+from .tokenizer_exceptions import URL_MATCH
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
@@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
return True
if tld.isalpha() and tld in _tlds:
return True
+ if URL_MATCH(text):
+ return True
return False
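The extra branch above makes like_url fall back to the shared URL_MATCH tokenizer pattern when the simpler prefix and TLD heuristics don't fire. A quick sketch, assuming spaCy is installed:

    from spacy.lang.lex_attrs import like_url

    for text in ["https://spacy.io", "www.example.com", "not a url"]:
        print(text, like_url(text))
    # the first two are caught by the existing prefix checks; strings without a
    # dot are still rejected, and full URLs that only URL_MATCH recognizes now
    # return True as well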
diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index 5ae280324..b7e11f77e 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,10 +1,10 @@
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class LigurianDefaults(Language.Defaults):
+class LigurianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index e395a8f62..3ae000e5f 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -2,10 +2,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class LithuanianDefaults(Language.Defaults):
+class LithuanianDefaults(BaseDefaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py
index 142bc706e..a05e5b939 100644
--- a/spacy/lang/lv/__init__.py
+++ b/spacy/lang/lv/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class LatvianDefaults(Language.Defaults):
+class LatvianDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py
index 2f6097f16..fa07cfef9 100644
--- a/spacy/lang/mk/__init__.py
+++ b/spacy/lang/mk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .lemmatizer import MacedonianLemmatizer
from .stop_words import STOP_WORDS
@@ -6,13 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
from ...lookups import Lookups
-class MacedonianDefaults(Language.Defaults):
+class MacedonianDefaults(BaseDefaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "mk"
@@ -38,13 +38,25 @@ class Macedonian(Language):
@Macedonian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return MacedonianLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Macedonian"]
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index cfad52261..9f90605f0 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class MalayalamDefaults(Language.Defaults):
+class MalayalamDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py
index af0c49878..3e172fa60 100644
--- a/spacy/lang/mr/__init__.py
+++ b/spacy/lang/mr/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class MarathiDefaults(Language.Defaults):
+class MarathiDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 0bfde7d28..e079236fd 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,15 +1,15 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer
-class NorwegianDefaults(Language.Defaults):
+class NorwegianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
@@ -26,13 +26,25 @@ class Norwegian(Language):
@Norwegian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return Lemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Norwegian"]
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 68117a54d..d86662693 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py
index 68632e9ad..0028d1b0b 100644
--- a/spacy/lang/ne/__init__.py
+++ b/spacy/lang/ne/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class NepaliDefaults(Language.Defaults):
+class NepaliDefaults(BaseDefaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 7fff3c3d2..ad2205a0b 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,21 +1,24 @@
-from typing import Optional
+from typing import Optional, Callable
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
+from .lemmatizer import DutchLemmatizer
from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
-from .lemmatizer import DutchLemmatizer
-from ...language import Language
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import Language, BaseDefaults
-class DutchDefaults(Language.Defaults):
+class DutchDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
+ syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
@@ -27,13 +30,25 @@ class Dutch(Language):
@Dutch.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return DutchLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Dutch"]
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py
index 6c025dcf6..4f6b2ef30 100644
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
return forms
else:
oov_forms.append(form)
- forms = list(set(oov_forms))
+ forms = list(dict.fromkeys(oov_forms))
# Back-off through remaining return value candidates.
if forms:
for form in forms:
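The switch from set() to dict.fromkeys above deduplicates while preserving the order in which candidate forms were generated, so the back-off stays deterministic across runs. For example:

    oov_forms = ["lopen", "loop", "lopen", "loper"]
    print(list(set(oov_forms)))            # order depends on string hashing
    print(list(dict.fromkeys(oov_forms)))  # always ['lopen', 'loop', 'loper']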
diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py
new file mode 100644
index 000000000..1ab5e7cff
--- /dev/null
+++ b/spacy/lang/nl/syntax_iterators.py
@@ -0,0 +1,72 @@
+from typing import Union, Iterator, Tuple
+
+from ...symbols import NOUN, PRON
+from ...errors import Errors
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on Doc and Span.
+ The definition is inspired by https://www.nltk.org/book/ch07.html
+ Considers [Noun + determiner / adjective] constructions and also [Pronoun]
+ """
+ # fmt: off
+ # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+ # fmt: on
+ doc = doclike.doc # Ensure works on both Doc and Span.
+
+ # Check for required annotation: POS, DEP
+ if not doc.has_annotation("POS"):
+ raise ValueError(Errors.E1019)
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+
+ # See UD tags: https://universaldependencies.org/u/dep/index.html
+ # amod = adjectival modifier
+ # nmod:poss = possessive nominal modifier
+ # nummod = numeric modifier
+ # det = determiner
+ # det:poss = possessive determiner
+ noun_deps = [
+ doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
+ ]
+
+ # nsubj = nominal subject
+ # nsubj:pass = passive nominal subject
+ pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]
+
+ # Label NP for the Span to identify it as Noun-Phrase
+ span_label = doc.vocab.strings.add("NP")
+
+ # Only NOUNS and PRONOUNS matter
+ for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
+ # For NOUNS
+ # Pick children from syntactic parse (only those with certain dependencies)
+ if word.pos == NOUN:
+ # Guard against mis-tagged tokens: verbs are sometimes POS-tagged as NOUN.
+ # If the word has an "nsubj" child, treat it as a verb and skip it.
+ nsubjs = filter(
+ lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
+ )
+ next_word = next(nsubjs, None)
+ if next_word is not None:
+ # Found an "nsubj" child, so skip this word; otherwise treat it as a normal NOUN
+ continue
+
+ children = filter(lambda x: x.dep in noun_deps, word.children)
+ children_i = [c.i for c in children] + [word.i]
+
+ start_span = min(children_i)
+ end_span = max(children_i) + 1
+ yield start_span, end_span, span_label
+
+ # PRONOUNS: only keep those that are the subject of a verb
+ elif word.pos == PRON:
+ if word.dep in pronoun_deps:
+ start_span = word.i
+ end_span = word.i + 1
+ yield start_span, end_span, span_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
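With SYNTAX_ITERATORS registered in DutchDefaults above, doc.noun_chunks becomes available for parsed Dutch text. An illustrative example, assuming the nl_core_news_sm pipeline is installed:

    import spacy

    nlp = spacy.load("nl_core_news_sm")
    doc = nlp("De snelle bruine vos springt over de luie hond.")
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.text, chunk.label_)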
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 585e08c60..02c96799b 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
@@ -8,7 +8,7 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
TOKENIZER_EXCEPTIONS = {
@@ -16,7 +16,7 @@ TOKENIZER_EXCEPTIONS = {
}
-class PolishDefaults(Language.Defaults):
+class PolishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
@@ -33,13 +33,25 @@ class Polish(Language):
@Polish.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "pos_lookup",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return PolishLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Polish"]
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 0447099f0..454002491 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -1,15 +1,17 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class PortugueseDefaults(Language.Defaults):
+class PortugueseDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES
lex_attr_getters = LEX_ATTRS
+ syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
diff --git a/spacy/lang/pt/syntax_iterators.py b/spacy/lang/pt/syntax_iterators.py
new file mode 100644
index 000000000..62661f5e4
--- /dev/null
+++ b/spacy/lang/pt/syntax_iterators.py
@@ -0,0 +1,85 @@
+from typing import Union, Iterator, Tuple
+
+from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ labels = [
+ "nsubj",
+ "nsubj:pass",
+ "obj",
+ "obl",
+ "obl:agent",
+ "nmod",
+ "pcomp",
+ "appos",
+ "ROOT",
+ ]
+ post_modifiers = ["flat", "flat:name", "fixed", "compound"]
+ doc = doclike.doc # Ensure works on both Doc and Span.
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+ np_deps = {doc.vocab.strings.add(label) for label in labels}
+ np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
+ np_label = doc.vocab.strings.add("NP")
+ adj_label = doc.vocab.strings.add("amod")
+ det_label = doc.vocab.strings.add("det")
+ det_pos = doc.vocab.strings.add("DET")
+ adp_label = doc.vocab.strings.add("ADP")
+ conj = doc.vocab.strings.add("conj")
+ conj_pos = doc.vocab.strings.add("CCONJ")
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.left_edge.i <= prev_end:
+ continue
+ if word.dep in np_deps:
+ right_childs = list(word.rights)
+ right_child = right_childs[0] if right_childs else None
+
+ if right_child:
+ if (
+ right_child.dep == adj_label
+ ): # allow chain of adjectives by expanding to right
+ right_end = right_child.right_edge
+ elif (
+ right_child.dep == det_label and right_child.pos == det_pos
+ ): # cut relative pronouns here
+ right_end = right_child
+ elif right_child.dep in np_modifs: # Check if we can expand to right
+ right_end = word.right_edge
+ else:
+ right_end = word
+ else:
+ right_end = word
+ prev_end = right_end.i
+
+ left_index = word.left_edge.i
+ left_index = (
+ left_index + 1 if word.left_edge.pos == adp_label else left_index
+ )
+
+ yield left_index, right_end.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ prev_end = word.i
+
+ left_index = word.left_edge.i # eliminate left attached conjunction
+ left_index = (
+ left_index + 1 if word.left_edge.pos == conj_pos else left_index
+ )
+ yield left_index, word.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index f0d8d8d31..50027ffd2 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -3,14 +3,14 @@ from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)
-class RomanianDefaults(Language.Defaults):
+class RomanianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 4287cc288..5d31d8ea2 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,14 +1,14 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults
-class RussianDefaults(Language.Defaults):
+class RussianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
@@ -22,7 +22,12 @@ class Russian(Language):
@Russian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "pymorphy2",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
@@ -31,8 +36,11 @@ def make_lemmatizer(
name: str,
mode: str,
overwrite: bool,
+ scorer: Optional[Callable],
):
- return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return RussianLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Russian"]
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 5a49a4e00..85180b1e4 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -1,8 +1,9 @@
-from typing import Optional, List, Dict, Tuple
+from typing import Optional, List, Dict, Tuple, Callable
from thinc.api import Model
from ...pipeline import Lemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
@@ -12,7 +13,6 @@ PUNCT_RULES = {"«": '"', "»": '"'}
class RussianLemmatizer(Lemmatizer):
-
def __init__(
self,
vocab: Vocab,
@@ -21,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
*,
mode: str = "pymorphy2",
overwrite: bool = False,
+ scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
try:
@@ -32,7 +33,9 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
- super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+ super().__init__(
+ vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text
@@ -57,7 +60,9 @@ class RussianLemmatizer(Lemmatizer):
if not len(filtered_analyses):
return [string.lower()]
if morphology is None or (len(morphology) == 1 and POS in morphology):
- return list(set([analysis.normal_form for analysis in filtered_analyses]))
+ return list(
+ dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
+ )
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
features_to_compare = ["Case", "Number", "Gender"]
elif univ_pos == "NUM":
@@ -88,7 +93,9 @@ class RussianLemmatizer(Lemmatizer):
filtered_analyses.append(analysis)
if not len(filtered_analyses):
return [string.lower()]
- return list(set([analysis.normal_form for analysis in filtered_analyses]))
+ return list(
+ dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
+ )
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text
diff --git a/spacy/lang/sa/__init__.py b/spacy/lang/sa/__init__.py
index 345137817..61398af6c 100644
--- a/spacy/lang/sa/__init__.py
+++ b/spacy/lang/sa/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SanskritDefaults(Language.Defaults):
+class SanskritDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py
index d77e3bb8b..971cee3c6 100644
--- a/spacy/lang/si/__init__.py
+++ b/spacy/lang/si/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SinhalaDefaults(Language.Defaults):
+class SinhalaDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py
index bde662bf7..7d29bc1b4 100644
--- a/spacy/lang/si/stop_words.py
+++ b/spacy/lang/si/stop_words.py
@@ -1,47 +1,195 @@
STOP_WORDS = set(
"""
-අතර
-එච්චර
-එපමණ
-එලෙස
-එවිට
-ඒ
-කට
-කදී
-කින්
-ක්
-ට
-තුර
-ත්
-ද
-නමුත්
-නොහොත්
-පමණ
-පමණි
-ම
-මෙච්චර
-මෙපමණ
-මෙලෙස
-මෙවිට
-මේ
-ය
-යි
-ලදී
+සහ
+සමග
+සමඟ
+අහා
+ආහ්
+ආ
+ඕහෝ
+අනේ
+අඳෝ
+අපොයි
+අපෝ
+අයියෝ
+ආයි
+ඌයි
+චී
+චිහ්
+චික්
+හෝ
+දෝ
+දෝහෝ
+මෙන්
+සේ
+වැනි
+බඳු
+වන්
+අයුරු
+අයුරින්
ලෙස
-වගේ
+වැඩි
+ශ්රී
+හා
+ය
+නිසා
+නිසාවෙන්
+බවට
+බව
+බවෙන්
+නම්
+වැඩි
+සිට
+දී
+මහා
+මහ
+පමණ
+පමණින්
+පමන
වන
විට
-විටෙක
-විතර
-විය
-වුව
-වුවත්
-වුවද
-වූ
-සමඟ
+විටින්
+මේ
+මෙලෙස
+මෙයින්
+ඇති
+ලෙස
+සිදු
+වශයෙන්
+යන
+සඳහා
+මගින්
+හෝ
+ඉතා
+ඒ
+එම
+ද
+අතර
+විසින්
+සමග
+පිළිබඳව
+පිළිබඳ
+තුළ
+බව
+වැනි
+මහ
+මෙම
+මෙහි
+මේ
+වෙත
+වෙතින්
+වෙතට
+වෙනුවෙන්
+වෙනුවට
+වෙන
+ගැන
+නෑ
+අනුව
+නව
+පිළිබඳ
+විශේෂ
+දැනට
+එහෙන්
+මෙහෙන්
+එහේ
+මෙහේ
+ම
+තවත්
+තව
සහ
-හා
+දක්වා
+ට
+ගේ
+එ
+ක
+ක්
+බවත්
+බවද
+මත
+ඇතුලු
+ඇතුළු
+මෙසේ
+වඩා
+වඩාත්ම
+නිති
+නිතිත්
+නිතොර
+නිතර
+ඉක්බිති
+දැන්
+යලි
+පුන
+ඉතින්
+සිට
+සිටන්
+පටන්
+තෙක්
+දක්වා
+සා
+තාක්
+තුවක්
+පවා
+ද
+හෝ
+වත්
+විනා
+හැර
+මිස
+මුත්
+කිම
+කිම්
+ඇයි
+මන්ද
හෙවත්
-හෝ
+නොහොත්
+පතා
+පාසා
+ගානෙ
+තව
+ඉතා
+බොහෝ
+වහා
+සෙද
+සැනින්
+හනික
+එම්බා
+එම්බල
+බොල
+නම්
+වනාහි
+කලී
+ඉඳුරා
+අන්න
+ඔන්න
+මෙන්න
+උදෙසා
+පිණිස
+සඳහා
+අරබයා
+නිසා
+එනිසා
+එබැවින්
+බැවින්
+හෙයින්
+සේක්
+සේක
+ගැන
+අනුව
+පරිදි
+විට
+තෙක්
+මෙතෙක්
+මේතාක්
+තුරු
+තුරා
+තුරාවට
+තුලින්
+නමුත්
+එනමුත්
+වස්
+මෙන්
+ලෙස
+පරිදි
+එහෙත්
""".split()
)
diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py
index 4003c7340..da6e3048e 100644
--- a/spacy/lang/sk/__init__.py
+++ b/spacy/lang/sk/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SlovakDefaults(Language.Defaults):
+class SlovakDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 0330cc4d0..9ddd676bf 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SlovenianDefaults(Language.Defaults):
+class SlovenianDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py
index a4bacfa49..5e32a0cbe 100644
--- a/spacy/lang/sq/__init__.py
+++ b/spacy/lang/sq/__init__.py
@@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class AlbanianDefaults(Language.Defaults):
+class AlbanianDefaults(BaseDefaults):
stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 165e54975..fd0c8c832 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,10 +1,10 @@
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SerbianDefaults(Language.Defaults):
+class SerbianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 1b1b69fac..6963e8b79 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,10 +1,10 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer
@@ -12,7 +12,7 @@ from ...pipeline import Lemmatizer
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-class SwedishDefaults(Language.Defaults):
+class SwedishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
@@ -29,13 +29,25 @@ class Swedish(Language):
@Swedish.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "rule", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return Lemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Swedish"]
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index d5ae47853..06ad016ac 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index ac5fc7124..4929a4b97 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class TamilDefaults(Language.Defaults):
+class TamilDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py
index e6dc80e28..77cc2fe9b 100644
--- a/spacy/lang/te/__init__.py
+++ b/spacy/lang/te/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class TeluguDefaults(Language.Defaults):
+class TeluguDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 219c50c1a..12b1527e0 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,8 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
DEFAULT_CONFIG = """
@@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer():
def thai_tokenizer_factory(nlp):
- return ThaiTokenizer(nlp)
+ return ThaiTokenizer(nlp.vocab)
return thai_tokenizer_factory
class ThaiTokenizer(DummyTokenizer):
- def __init__(self, nlp: Language) -> None:
+ def __init__(self, vocab: Vocab) -> None:
try:
from pythainlp.tokenize import word_tokenize
except ImportError:
@@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
"https://github.com/PyThaiNLP/pythainlp"
) from None
self.word_tokenize = word_tokenize
- self.vocab = nlp.vocab
+ self.vocab = vocab
def __call__(self, text: str) -> Doc:
words = list(self.word_tokenize(text))
@@ -39,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces)
-class ThaiDefaults(Language.Defaults):
+class ThaiDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/ti/__init__.py b/spacy/lang/ti/__init__.py
index 709fb21cb..c74c081b5 100644
--- a/spacy/lang/ti/__init__.py
+++ b/spacy/lang/ti/__init__.py
@@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
-class TigrinyaDefaults(Language.Defaults):
+class TigrinyaDefaults(BaseDefaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ti"
diff --git a/spacy/lang/ti/lex_attrs.py b/spacy/lang/ti/lex_attrs.py
index ed094de3b..da56af6c0 100644
--- a/spacy/lang/ti/lex_attrs.py
+++ b/spacy/lang/ti/lex_attrs.py
@@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM
_num_words = [
"ዜሮ",
- "ሐደ",
+ "ሓደ",
"ክልተ",
"ሰለስተ",
"ኣርባዕተ",
@@ -11,66 +11,37 @@ _num_words = [
"ሸውዓተ",
"ሽሞንተ",
"ትሽዓተ",
- "ኣሰርተ",
- "ኣሰርተ ሐደ",
- "ኣሰርተ ክልተ",
- "ኣሰርተ ሰለስተ",
- "ኣሰርተ ኣርባዕተ",
- "ኣሰርተ ሓሙሽተ",
- "ኣሰርተ ሽድሽተ",
- "ኣሰርተ ሸውዓተ",
- "ኣሰርተ ሽሞንተ",
- "ኣሰርተ ትሽዓተ",
+ "ዓሰርተ",
"ዕስራ",
"ሰላሳ",
"ኣርብዓ",
- "ሃምሳ",
- "ስልሳ",
+ "ሓምሳ",
+ "ሱሳ",
"ሰብዓ",
"ሰማንያ",
- "ተስዓ",
+ "ቴስዓ",
"ሚእቲ",
"ሺሕ",
"ሚልዮን",
"ቢልዮን",
"ትሪልዮን",
"ኳድሪልዮን",
- "ገጅልዮን",
- "ባዝልዮን",
+ "ጋዚልዮን",
+ "ባዚልዮን",
]
+# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
_ordinal_words = [
"ቀዳማይ",
"ካልኣይ",
"ሳልሳይ",
- "ራብኣይ",
+ "ራብዓይ",
"ሓምሻይ",
"ሻድሻይ",
"ሻውዓይ",
"ሻምናይ",
- "ዘጠነኛ",
- "አስረኛ",
- "ኣሰርተ አንደኛ",
- "ኣሰርተ ሁለተኛ",
- "ኣሰርተ ሶስተኛ",
- "ኣሰርተ አራተኛ",
- "ኣሰርተ አምስተኛ",
- "ኣሰርተ ስድስተኛ",
- "ኣሰርተ ሰባተኛ",
- "ኣሰርተ ስምንተኛ",
- "ኣሰርተ ዘጠነኛ",
- "ሃያኛ",
- "ሰላሳኛ" "አርባኛ",
- "አምሳኛ",
- "ስድሳኛ",
- "ሰባኛ",
- "ሰማንያኛ",
- "ዘጠናኛ",
- "መቶኛ",
- "ሺኛ",
- "ሚሊዮንኛ",
- "ቢሊዮንኛ",
- "ትሪሊዮንኛ",
+ "ታሽዓይ",
+ "ዓስራይ",
]
@@ -92,7 +63,7 @@ def like_num(text):
# Check ordinal number
if text_lower in _ordinal_words:
return True
- if text_lower.endswith("ኛ"):
+ if text_lower.endswith("ይ"):
if text_lower[:-2].isdigit():
return True
diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py
index 772b009bf..aa884c2ba 100644
--- a/spacy/lang/ti/punctuation.py
+++ b/spacy/lang/ti/punctuation.py
@@ -1,7 +1,7 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
-_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
+_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
_suffixes = (
_list_punct
diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py
index c4f8f20fa..9bd712200 100644
--- a/spacy/lang/ti/stop_words.py
+++ b/spacy/lang/ti/stop_words.py
@@ -1,6 +1,27 @@
+# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
+
# Stop words
STOP_WORDS = set(
"""
-ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም
+'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን
+ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
+ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
+ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ
+ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም
+ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን
+ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ
+ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ
+ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን
+እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ
+ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም
+እየ እየን እዩ እያ እዮም
+ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ
+ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል
+ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ
+ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ
+ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ
+ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
+የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
+ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
""".split()
)
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index 61530dc30..30838890a 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -1,10 +1,10 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class TagalogDefaults(Language.Defaults):
+class TagalogDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
index 99907c28a..28e887eea 100644
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@@ -1,10 +1,10 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class SetswanaDefaults(Language.Defaults):
+class SetswanaDefaults(BaseDefaults):
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index e41db911f..d76fe4262 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -250,3 +250,9 @@ o.0
for orth in emoticons:
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+
+# Moved from a suffix setting due to #9155 removing prefixes from consideration
+# for lookbehinds
+for u in "cfkCFK":
+ BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 679411acf..02b5c7bf4 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class TurkishDefaults(Language.Defaults):
+class TurkishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py
index 3fd726fb5..769af1223 100644
--- a/spacy/lang/tr/syntax_iterators.py
+++ b/spacy/lang/tr/syntax_iterators.py
@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py
index c8e293f29..d5e1e87ef 100644
--- a/spacy/lang/tt/__init__.py
+++ b/spacy/lang/tt/__init__.py
@@ -2,10 +2,10 @@ from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class TatarDefaults(Language.Defaults):
+class TatarDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 677281ec6..21f9649f2 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
@@ -6,10 +6,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import UkrainianLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults
-class UkrainianDefaults(Language.Defaults):
+class UkrainianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
@@ -23,13 +23,25 @@ class Ukrainian(Language):
@Ukrainian.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "pymorphy2",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return UkrainianLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
__all__ = ["Ukrainian"]
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 1fb030e06..a8bc56057 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -1,8 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable
from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab
@@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
*,
mode: str = "pymorphy2",
overwrite: bool = False,
+ scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
try:
@@ -27,4 +29,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
- super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+ super().__init__(
+ vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index e3dee5805..266c5a73d 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -1,10 +1,10 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
-class UrduDefaults(Language.Defaults):
+class UrduDefaults(BaseDefaults):
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index b6d873a13..822dc348c 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -6,9 +6,10 @@ import string
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
from ... import util
@@ -24,14 +25,14 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp):
- return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+ return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
return vietnamese_tokenizer_factory
class VietnameseTokenizer(DummyTokenizer):
- def __init__(self, nlp: Language, use_pyvi: bool = False):
- self.vocab = nlp.vocab
+ def __init__(self, vocab: Vocab, use_pyvi: bool = False):
+ self.vocab = vocab
self.use_pyvi = use_pyvi
if self.use_pyvi:
try:
@@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
)
raise ImportError(msg) from None
+ def __reduce__(self):
+ return VietnameseTokenizer, (self.vocab, self.use_pyvi)
+
def __call__(self, text: str) -> Doc:
if self.use_pyvi:
words = self.pyvi_tokenize(text)
@@ -141,7 +145,7 @@ class VietnameseTokenizer(DummyTokenizer):
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
path = util.ensure_path(path)
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
- return util.to_disk(path, serializers, [])
+ util.to_disk(path, serializers, [])
def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
path = util.ensure_path(path)
@@ -150,7 +154,7 @@ class VietnameseTokenizer(DummyTokenizer):
return self
-class VietnameseDefaults(Language.Defaults):
+class VietnameseDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
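
VietnameseTokenizer now takes a Vocab instead of the nlp object and gains a
__reduce__ method, so the tokenizer pickles cleanly. A small sketch; use_pyvi
is disabled here so the optional pyvi package isn't needed:

    import pickle
    import spacy

    nlp = spacy.blank("vi", config={"nlp": {"tokenizer": {"use_pyvi": False}}})
    tokenizer = pickle.loads(pickle.dumps(nlp.tokenizer))  # round-trips via __reduce__
    assert tokenizer.use_pyvi is False
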
diff --git a/spacy/lang/vi/examples.py b/spacy/lang/vi/examples.py
new file mode 100644
index 000000000..36575f67c
--- /dev/null
+++ b/spacy/lang/vi/examples.py
@@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.vi.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "Đây là đâu, tôi là ai?",
+ "Căn phòng có nhiều cửa sổ nên nó khá sáng",
+ "Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.",
+ "Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.",
+ "Ông bạn đang ở đâu thế?",
+ "Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?",
+ "Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?",
+ "Làm việc nhiều chán quá, đi chơi đâu đi?",
+]
diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py
index b3dbf2192..33a3745cc 100644
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@@ -9,11 +9,14 @@ _num_words = [
"bốn",
"năm",
"sáu",
+ "bảy",
"bẩy",
"tám",
"chín",
"mười",
+ "chục",
"trăm",
+ "nghìn",
"tỷ",
]
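
The added entries extend the Vietnamese number words used by the LIKE_NUM
getter. A tiny check, assuming the module follows spaCy's usual lex_attrs
layout with a LIKE_NUM entry:

    from spacy.attrs import LIKE_NUM
    from spacy.lang.vi.lex_attrs import LEX_ATTRS

    like_num = LEX_ATTRS[LIKE_NUM]
    print(like_num("nghìn"), like_num("bảy"))  # both should now count as number words
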
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py
index 8d63c3c20..34570d747 100644
--- a/spacy/lang/xx/examples.py
+++ b/spacy/lang/xx/examples.py
@@ -59,7 +59,7 @@ sentences = [
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
- "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
+ "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
"Londres é a maior cidade do Reino Unido.",
# Translations from English:
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py
index df6bb7d4a..6c38ec8af 100644
--- a/spacy/lang/yo/__init__.py
+++ b/spacy/lang/yo/__init__.py
@@ -1,9 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
-class YorubaDefaults(Language.Defaults):
+class YorubaDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
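
These language modules now subclass BaseDefaults directly instead of going
through Language.Defaults. The same pattern applies to custom languages; a
minimal sketch with made-up names:

    from spacy.language import Language, BaseDefaults

    class CustomDefaults(BaseDefaults):
        stop_words = {"foo", "bar"}

    class CustomLanguage(Language):
        lang = "zzz"          # hypothetical language code
        Defaults = CustomDefaults

    nlp = CustomLanguage()
    doc = nlp("foo and bar")
    print([(t.text, t.is_stop) for t in doc])
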
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 9a8a21a63..fdf6776e2 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -6,11 +6,12 @@ import warnings
from pathlib import Path
from ...errors import Warnings, Errors
-from ...language import Language
+from ...language import Language, BaseDefaults
from ...scorer import Scorer
from ...tokens import Doc
from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ... import util
@@ -48,29 +49,29 @@ class Segmenter(str, Enum):
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
def chinese_tokenizer_factory(nlp):
- return ChineseTokenizer(nlp, segmenter=segmenter)
+ return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer):
- def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
- self.vocab = nlp.vocab
- if isinstance(segmenter, Segmenter):
- segmenter = segmenter.value
- self.segmenter = segmenter
+ def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
+ self.vocab = vocab
+ self.segmenter = (
+ segmenter.value if isinstance(segmenter, Segmenter) else segmenter
+ )
self.pkuseg_seg = None
self.jieba_seg = None
- if segmenter not in Segmenter.values():
+ if self.segmenter not in Segmenter.values():
warn_msg = Warnings.W103.format(
lang="Chinese",
- segmenter=segmenter,
+ segmenter=self.segmenter,
supported=", ".join(Segmenter.values()),
default="'char' (character segmentation)",
)
warnings.warn(warn_msg)
self.segmenter = Segmenter.char
- if segmenter == Segmenter.jieba:
+ if self.segmenter == Segmenter.jieba:
self.jieba_seg = try_jieba_import()
def initialize(
@@ -90,7 +91,7 @@ class ChineseTokenizer(DummyTokenizer):
def __call__(self, text: str) -> Doc:
if self.segmenter == Segmenter.jieba:
- words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
+ words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) # type: ignore[union-attr]
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
elif self.segmenter == Segmenter.pkuseg:
@@ -121,7 +122,7 @@ class ChineseTokenizer(DummyTokenizer):
try:
import spacy_pkuseg
- self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
+ self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None) # type: ignore[attr-defined]
except ImportError:
msg = (
"spacy_pkuseg not installed: unable to reset pkuseg "
@@ -129,7 +130,7 @@ class ChineseTokenizer(DummyTokenizer):
)
raise ImportError(msg) from None
for word in words:
- self.pkuseg_seg.preprocesser.insert(word.strip(), "")
+ self.pkuseg_seg.preprocesser.insert(word.strip(), "") # type: ignore[attr-defined]
else:
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
warnings.warn(warn_msg)
@@ -282,7 +283,7 @@ class ChineseTokenizer(DummyTokenizer):
util.from_disk(path, serializers, [])
-class ChineseDefaults(Language.Defaults):
+class ChineseDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
@@ -294,7 +295,7 @@ class Chinese(Language):
Defaults = ChineseDefaults
-def try_jieba_import() -> None:
+def try_jieba_import():
try:
import jieba
@@ -310,7 +311,7 @@ def try_jieba_import() -> None:
raise ImportError(msg) from None
-def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
+def try_pkuseg_import(pkuseg_model: Optional[str], pkuseg_user_dict: Optional[str]):
try:
import spacy_pkuseg
@@ -318,9 +319,9 @@ def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg) from None
try:
- return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+ return spacy_pkuseg.pkuseg(pkuseg_model, user_dict=pkuseg_user_dict)
except FileNotFoundError:
- msg = "Unable to load pkuseg model from: " + pkuseg_model
+ msg = "Unable to load pkuseg model from: " + str(pkuseg_model or "")
raise FileNotFoundError(msg) from None
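
ChineseTokenizer is now constructed from a Vocab rather than the full nlp
object, and the default character segmenter needs no extra packages. A short
sketch of the new signature:

    from spacy.lang.zh import ChineseTokenizer, Segmenter
    from spacy.vocab import Vocab

    tokenizer = ChineseTokenizer(Vocab(), segmenter=Segmenter.char)
    doc = tokenizer("这是一个句子")
    print([t.text for t in doc])
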
diff --git a/spacy/language.py b/spacy/language.py
index b60c92158..217356b4c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,6 +1,7 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable, TypeVar
-from typing import Union, List, Pattern, overload
-from typing import Tuple
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable
+from typing import Union, Tuple, List, Set, Pattern, Sequence
+from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
+
from dataclasses import dataclass
import random
import itertools
@@ -9,13 +10,14 @@ from contextlib import contextmanager
from copy import deepcopy
from pathlib import Path
import warnings
-from thinc.api import get_current_ops, Config, Optimizer
+from thinc.api import get_current_ops, Config, CupyOps, Optimizer
import srsly
import multiprocessing as mp
from itertools import chain, cycle
from timeit import default_timer as timer
import traceback
+from . import ty
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
@@ -37,6 +39,11 @@ from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups
+from .compat import Literal
+
+
+if TYPE_CHECKING:
+ from .pipeline import Pipe # noqa: F401
# This is the base config will all settings (training etc.)
@@ -46,6 +53,9 @@ DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
+# Type variable for contexts piped with documents
+_AnyContext = TypeVar("_AnyContext")
+
class BaseDefaults:
"""Language data defaults, available via Language.Defaults. Can be
@@ -55,14 +65,14 @@ class BaseDefaults:
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
- prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
- suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
- infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
- token_match: Optional[Pattern] = None
- url_match: Optional[Pattern] = URL_MATCH
+ prefixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_PREFIXES
+ suffixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
+ infixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_INFIXES
+ token_match: Optional[Callable] = None
+ url_match: Optional[Callable] = URL_MATCH
syntax_iterators: Dict[str, Callable] = {}
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
- stop_words = set()
+ stop_words: Set[str] = set()
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@@ -105,13 +115,13 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
- lang (str): Two-letter language ID, i.e. ISO code.
+ lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language
"""
Defaults = BaseDefaults
- lang: str = None
+ lang: Optional[str] = None
default_config = DEFAULT_CONFIG
factories = SimpleFrozenDict(error=Errors.E957)
@@ -154,7 +164,7 @@ class Language:
self._config = DEFAULT_CONFIG.merge(self.default_config)
self._meta = dict(meta)
self._path = None
- self._optimizer = None
+ self._optimizer: Optional[Optimizer] = None
# Component meta and configs are only needed on the instance
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
self._pipe_configs: Dict[str, Config] = {} # config by component
@@ -170,8 +180,8 @@ class Language:
self.vocab: Vocab = vocab
if self.lang is None:
self.lang = self.vocab.lang
- self._components = []
- self._disabled = set()
+ self._components: List[Tuple[str, "Pipe"]] = []
+ self._disabled: Set[str] = set()
self.max_length = max_length
# Create the default tokenizer from the default config
if not create_tokenizer:
@@ -199,7 +209,7 @@ class Language:
DOCS: https://spacy.io/api/language#meta
"""
- spacy_version = util.get_model_version_range(about.__version__)
+ spacy_version = util.get_minor_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:
@@ -218,6 +228,7 @@ class Language:
"vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
+ "mode": self.vocab.vectors.mode,
}
self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but
@@ -291,7 +302,7 @@ class Language:
return SimpleFrozenList(names)
@property
- def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
+ def components(self) -> List[Tuple[str, "Pipe"]]:
"""Get all (name, component) tuples in the pipeline, including the
currently disabled components.
"""
@@ -310,12 +321,12 @@ class Language:
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
@property
- def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
+ def pipeline(self) -> List[Tuple[str, "Pipe"]]:
"""The processing pipeline consisting of (name, component) tuples. The
components are called on the Doc in order as it passes through the
pipeline.
- RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
+ RETURNS (List[Tuple[str, Pipe]]): The pipeline.
"""
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
@@ -423,7 +434,7 @@ class Language:
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
- default_score_weights: Dict[str, float] = SimpleFrozenDict(),
+ default_score_weights: Dict[str, Optional[float]] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
"""Register a new pipeline component factory. Can be used as a decorator
@@ -440,7 +451,7 @@ class Language:
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
- default_score_weights (Dict[str, float]): The scores to report during
+ default_score_weights (Dict[str, Optional[float]]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline. If None,
@@ -505,13 +516,13 @@ class Language:
@classmethod
def component(
cls,
- name: Optional[str] = None,
+ name: str,
*,
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
- func: Optional[Callable[[Doc], Doc]] = None,
- ) -> Callable:
+ func: Optional["Pipe"] = None,
+ ) -> Callable[..., Any]:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
@@ -533,11 +544,11 @@ class Language:
raise ValueError(Errors.E963.format(decorator="component"))
component_name = name if name is not None else util.get_object_name(func)
- def add_component(component_func: Callable[[Doc], Doc]) -> Callable:
+ def add_component(component_func: "Pipe") -> Callable:
if isinstance(func, type): # function is a class
raise ValueError(Errors.E965.format(name=component_name))
- def factory_func(nlp: cls, name: str) -> Callable[[Doc], Doc]:
+ def factory_func(nlp, name: str) -> "Pipe":
return component_func
internal_name = cls.get_factory_name(name)
@@ -587,7 +598,7 @@ class Language:
print_pipe_analysis(analysis, keys=keys)
return analysis
- def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
+ def get_pipe(self, name: str) -> "Pipe":
"""Get a pipeline component for a given component name.
name (str): Name of pipeline component to get.
@@ -605,22 +616,22 @@ class Language:
factory_name: str,
name: Optional[str] = None,
*,
- config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
+ config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
- ) -> Callable[[Doc], Doc]:
+ ) -> "Pipe":
"""Create a pipeline component. Mostly used internally. To create and
add a component to the pipeline, you can use nlp.add_pipe.
factory_name (str): Name of component factory.
name (Optional[str]): Optional name to assign to component instance.
Defaults to factory name if not set.
- config (Optional[Dict[str, Any]]): Config parameters to use for this
- component. Will be merged with default config, if available.
+ config (Dict[str, Any]): Config parameters to use for this component.
+ Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
- RETURNS (Callable[[Doc], Doc]): The pipeline component.
+ RETURNS (Pipe): The pipeline component.
DOCS: https://spacy.io/api/language#create_pipe
"""
@@ -640,7 +651,6 @@ class Language:
)
raise ValueError(err)
pipe_meta = self.get_factory_meta(factory_name)
- config = config or {}
# This is unideal, but the alternative would mean you always need to
# specify the full config settings, which is not really viable.
if pipe_meta.default_config:
@@ -676,7 +686,7 @@ class Language:
def create_pipe_from_source(
self, source_name: str, source: "Language", *, name: str
- ) -> Tuple[Callable[[Doc], Doc], str]:
+ ) -> Tuple["Pipe", str]:
"""Create a pipeline component by copying it from an existing model.
source_name (str): Name of the component in the source pipeline.
@@ -691,7 +701,8 @@ class Language:
if (
self.vocab.vectors.shape != source.vocab.vectors.shape
or self.vocab.vectors.key2row != source.vocab.vectors.key2row
- or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
+ or self.vocab.vectors.to_bytes(exclude=["strings"])
+ != source.vocab.vectors.to_bytes(exclude=["strings"])
):
warnings.warn(Warnings.W113.format(name=source_name))
if source_name not in source.component_names:
@@ -708,8 +719,9 @@ class Language:
source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
- for s in source.vocab.strings:
- self.vocab.strings.add(s)
+ if self.vocab.strings != source.vocab.strings:
+ for s in source.vocab.strings:
+ self.vocab.strings.add(s)
return pipe, pipe_config["factory"]
def add_pipe(
@@ -722,10 +734,10 @@ class Language:
first: Optional[bool] = None,
last: Optional[bool] = None,
source: Optional["Language"] = None,
- config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
+ config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
- ) -> Callable[[Doc], Doc]:
+ ) -> "Pipe":
"""Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last".
@@ -743,12 +755,12 @@ class Language:
last (bool): If True, insert component last in the pipeline.
source (Language): Optional loaded nlp object to copy the pipeline
component from.
- config (Optional[Dict[str, Any]]): Config parameters to use for this
- component. Will be merged with default config, if available.
+ config (Dict[str, Any]): Config parameters to use for this component.
+ Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
- RETURNS (Callable[[Doc], Doc]): The pipeline component.
+ RETURNS (Pipe): The pipeline component.
DOCS: https://spacy.io/api/language#add_pipe
"""
@@ -859,7 +871,7 @@ class Language:
*,
config: Dict[str, Any] = SimpleFrozenDict(),
validate: bool = True,
- ) -> Callable[[Doc], Doc]:
+ ) -> "Pipe":
"""Replace a component in the pipeline.
name (str): Name of the component to replace.
@@ -868,18 +880,18 @@ class Language:
component. Will be merged with default config, if available.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
- RETURNS (Callable[[Doc], Doc]): The new pipeline component.
+ RETURNS (Pipe): The new pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe
"""
- if name not in self.pipe_names:
+ if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if hasattr(factory_name, "__call__"):
err = Errors.E968.format(component=repr(factory_name), name=name)
raise ValueError(err)
# We need to delegate to Language.add_pipe here instead of just writing
# to Language.pipeline to make sure the configs are handled correctly
- pipe_index = self.pipe_names.index(name)
+ pipe_index = self.component_names.index(name)
self.remove_pipe(name)
if not len(self._components) or pipe_index == len(self._components):
# we have no components to insert before/after, or we're replacing the last component
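
Looking the name up in component_names rather than pipe_names means a disabled
component can now be replaced as well. A small sketch of the behaviour this
enables:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")
    nlp.disable_pipe("sentencizer")
    nlp.replace_pipe("sentencizer", "sentencizer")  # no longer raises E001
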
@@ -920,7 +932,7 @@ class Language:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
- def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]:
+ def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]:
"""Remove a component from the pipeline.
name (str): Name of the component to remove.
@@ -968,7 +980,7 @@ class Language:
def __call__(
self,
- text: str,
+ text: Union[str, Doc],
*,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
@@ -977,15 +989,17 @@ class Language:
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
- text (str): The text to be processed.
- disable (list): Names of the pipeline components to disable.
+ text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
+ the doc will be passed directly to the pipeline, skipping
+ `Language.make_doc`.
+ disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components.
RETURNS (Doc): A container for accessing the annotations.
DOCS: https://spacy.io/api/language#call
"""
- doc = self.make_doc(text)
+ doc = self._ensure_doc(text)
if component_cfg is None:
component_cfg = {}
for name, proc in self.pipeline:
@@ -997,7 +1011,7 @@ class Language:
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
try:
- doc = proc(doc, **component_cfg.get(name, {}))
+ doc = proc(doc, **component_cfg.get(name, {})) # type: ignore[call-arg]
except KeyError as e:
# This typically happens if a component is not initialized
raise ValueError(Errors.E109.format(name=name)) from e
@@ -1017,7 +1031,7 @@ class Language:
"""
warnings.warn(Warnings.W096, DeprecationWarning)
if len(names) == 1 and isinstance(names[0], (list, tuple)):
- names = names[0] # support list of names instead of spread
+ names = names[0] # type: ignore[assignment] # support list of names instead of spread
return self.select_pipes(disable=names)
def select_pipes(
@@ -1052,6 +1066,7 @@ class Language:
)
)
disable = to_disable
+ assert disable is not None
# DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
# those pipes that were already disabled.
disable = [d for d in disable if d not in self._disabled]
@@ -1069,6 +1084,20 @@ class Language:
)
return self.tokenizer(text)
+ def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
+ """Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
+ if isinstance(doc_like, Doc):
+ return doc_like
+ if isinstance(doc_like, str):
+ return self.make_doc(doc_like)
+ raise ValueError(Errors.E866.format(type=type(doc_like)))
+
+ def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
+ """Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
+ doc = self._ensure_doc(doc_like)
+ doc._context = context
+ return doc
+
def update(
self,
examples: Iterable[Example],
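
With _ensure_doc in place, nlp() accepts an existing Doc and skips make_doc, so
pre-tokenized input can be fed straight into the pipeline. A minimal sketch:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")
    doc = Doc(nlp.vocab, words=["This", "is", "already", "tokenized", "."])
    doc = nlp(doc)  # the Doc bypasses make_doc and goes through the pipeline
    print([sent.text for sent in doc.sents])
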
@@ -1102,7 +1131,7 @@ class Language:
raise ValueError(Errors.E989)
if losses is None:
losses = {}
- if len(examples) == 0:
+ if isinstance(examples, list) and len(examples) == 0:
return losses
validate_examples(examples, "Language.update")
examples = _copy_examples(examples)
@@ -1119,12 +1148,13 @@ class Language:
component_cfg[name].setdefault("drop", drop)
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
for name, proc in self.pipeline:
+ # ignore statements are used here because mypy ignores hasattr
if name not in exclude and hasattr(proc, "update"):
- proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
+ proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
if sgd not in (None, False):
if (
name not in exclude
- and hasattr(proc, "is_trainable")
+ and isinstance(proc, ty.TrainableComponent)
and proc.is_trainable
and proc.model not in (True, False, None)
):
@@ -1174,8 +1204,10 @@ class Language:
DOCS: https://spacy.io/api/language#rehearse
"""
- if len(examples) == 0:
- return
+ if losses is None:
+ losses = {}
+ if isinstance(examples, list) and len(examples) == 0:
+ return losses
validate_examples(examples, "Language.rehearse")
if sgd is None:
if self._optimizer is None:
@@ -1190,18 +1222,18 @@ class Language:
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
- get_grads.learn_rate = sgd.learn_rate
- get_grads.b1 = sgd.b1
- get_grads.b2 = sgd.b2
+ get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr]
+ get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr]
+ get_grads.b2 = sgd.b2 # type: ignore[attr-defined, union-attr]
for name, proc in pipes:
if name in exclude or not hasattr(proc, "rehearse"):
continue
grads = {}
- proc.rehearse(
+ proc.rehearse( # type: ignore[attr-defined]
examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
)
for key, (W, dW) in grads.items():
- sgd(W, dW, key=key)
+ sgd(W, dW, key=key) # type: ignore[call-arg, misc]
return losses
def begin_training(
@@ -1253,19 +1285,19 @@ class Language:
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
- if self.vocab.vectors.data.shape[1] >= 1:
+ if self.vocab.vectors.shape[1] >= 1:
ops = get_current_ops()
- self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+ self.vocab.vectors.to_ops(ops)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
- self.tokenizer.initialize,
+ self.tokenizer.initialize, # type: ignore[union-attr]
I["tokenizer"],
section="tokenizer",
name="tokenizer",
)
- self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
+ self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) # type: ignore[union-attr]
for name, proc in self.pipeline:
- if hasattr(proc, "initialize"):
+ if isinstance(proc, ty.InitializableComponent):
p_settings = I["components"].get(name, {})
p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name
@@ -1300,11 +1332,11 @@ class Language:
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
- if self.vocab.vectors.data.shape[1] >= 1:
- self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+ if self.vocab.vectors.shape[1] >= 1:
+ self.vocab.vectors.to_ops(ops)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
- proc._rehearsal_model = deepcopy(proc.model)
+ proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
@@ -1313,14 +1345,12 @@ class Language:
def set_error_handler(
self,
- error_handler: Callable[
- [str, Callable[[Doc], Doc], List[Doc], Exception], None
- ],
+ error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn],
):
"""Set an error handler object for all the components in the pipeline that implement
a set_error_handler function.
- error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]):
+ error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]):
Function that deals with a failing batch of documents. This callable function should take in
the component's name, the component itself, the offending batch of documents, and the exception
that was thrown.
@@ -1339,7 +1369,7 @@ class Language:
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
- ) -> Dict[str, Union[float, dict]]:
+ ) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
examples (Iterable[Example]): `Example` objects.
@@ -1370,21 +1400,17 @@ class Language:
scorer = Scorer(**kwargs)
# reset annotation in predicted docs and time tokenization
start_time = timer()
+ # this is purely for timing
+ for eg in examples:
+ self.make_doc(eg.reference.text)
# apply all pipeline components
- for name, pipe in self.pipeline:
- kwargs = component_cfg.get(name, {})
- kwargs.setdefault("batch_size", batch_size)
- for doc, eg in zip(
- _pipe(
- (eg.predicted for eg in examples),
- proc=pipe,
- name=name,
- default_error_handler=self.default_error_handler,
- kwargs=kwargs,
- ),
- examples,
- ):
- eg.predicted = doc
+ docs = self.pipe(
+ (eg.predicted for eg in examples),
+ batch_size=batch_size,
+ component_cfg=component_cfg,
+ )
+ for eg, doc in zip(examples, docs):
+ eg.predicted = doc
end_time = timer()
results = scorer.score(examples)
n_words = sum(len(eg.predicted) for eg in examples)
@@ -1414,7 +1440,7 @@ class Language:
yield
else:
contexts = [
- pipe.use_params(params)
+ pipe.use_params(params) # type: ignore[attr-defined]
for name, pipe in self.pipeline
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
@@ -1432,14 +1458,25 @@ class Language:
except StopIteration:
pass
- _AnyContext = TypeVar("_AnyContext")
-
@overload
def pipe(
self,
- texts: Iterable[Tuple[str, _AnyContext]],
+ texts: Iterable[Union[str, Doc]],
*,
- as_tuples: bool = ...,
+ as_tuples: Literal[False] = ...,
+ batch_size: Optional[int] = ...,
+ disable: Iterable[str] = ...,
+ component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
+ n_process: int = ...,
+ ) -> Iterator[Doc]:
+ ...
+
+ @overload
+ def pipe( # noqa: F811
+ self,
+ texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+ *,
+ as_tuples: Literal[True] = ...,
batch_size: Optional[int] = ...,
disable: Iterable[str] = ...,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
@@ -1447,19 +1484,22 @@ class Language:
) -> Iterator[Tuple[Doc, _AnyContext]]:
...
- def pipe(
+ def pipe( # noqa: F811
self,
- texts: Iterable[str],
+ texts: Union[
+ Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
+ ],
*,
as_tuples: bool = False,
batch_size: Optional[int] = None,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
- ) -> Iterator[Doc]:
+ ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
"""Process texts as a stream, and yield `Doc` objects in order.
- texts (Iterable[str]): A sequence of texts to process.
+ texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
+ process.
as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
@@ -1472,22 +1512,30 @@ class Language:
DOCS: https://spacy.io/api/language#pipe
"""
- if n_process == -1:
- n_process = mp.cpu_count()
+ # Handle texts with context as tuples
if as_tuples:
- text_context1, text_context2 = itertools.tee(texts)
- texts = (tc[0] for tc in text_context1)
- contexts = (tc[1] for tc in text_context2)
+ texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
+ docs_with_contexts = (
+ self._ensure_doc_with_context(text, context) for text, context in texts
+ )
docs = self.pipe(
- texts,
+ docs_with_contexts,
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
)
- for doc, context in zip(docs, contexts):
+ for doc in docs:
+ context = doc._context
+ doc._context = None
yield (doc, context)
return
+
+ texts = cast(Iterable[Union[str, Doc]], texts)
+
+ # Set argument defaults
+ if n_process == -1:
+ n_process = mp.cpu_count()
if component_cfg is None:
component_cfg = {}
if batch_size is None:
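
as_tuples contexts now travel on doc._context instead of a teed generator,
which also lets them survive multiprocessing. Usage is unchanged; a short
sketch:

    import spacy

    nlp = spacy.blank("en")
    data = [("A first text.", {"id": 1}), ("A second text.", {"id": 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context["id"], doc.text)
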
@@ -1512,26 +1560,40 @@ class Language:
pipes.append(f)
if n_process != 1:
+ if self._has_gpu_model(disable):
+ warnings.warn(Warnings.W114)
+
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else:
# if n_process == 1, no processes are forked.
- docs = (self.make_doc(text) for text in texts)
+ docs = (self._ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
for doc in docs:
yield doc
+ def _has_gpu_model(self, disable: Iterable[str]):
+ for name, proc in self.pipeline:
+ is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
+ if name in disable or not is_trainable:
+ continue
+
+ if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore
+ return True
+
+ return False
+
def _multiprocessing_pipe(
self,
- texts: Iterable[str],
- pipes: Iterable[Callable[[Doc], Doc]],
+ texts: Iterable[Union[str, Doc]],
+ pipes: Iterable[Callable[..., Iterator[Doc]]],
n_process: int,
batch_size: int,
- ) -> None:
+ ) -> Iterator[Doc]:
# raw_texts is used later to stop iteration.
texts, raw_texts = itertools.tee(texts)
# for sending texts to worker
- texts_q = [mp.Queue() for _ in range(n_process)]
+ texts_q: List[mp.Queue] = [mp.Queue() for _ in range(n_process)]
# for receiving byte-encoded docs from worker
bytedocs_recv_ch, bytedocs_send_ch = zip(
*[mp.Pipe(False) for _ in range(n_process)]
@@ -1549,7 +1611,7 @@ class Language:
procs = [
mp.Process(
target=_apply_pipes,
- args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
+ args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
)
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
@@ -1562,11 +1624,12 @@ class Language:
recv.recv() for recv in cycle(bytedocs_recv_ch)
)
try:
- for i, (_, (byte_doc, byte_error)) in enumerate(
+ for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1
):
if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc)
+ doc._context = byte_context
yield doc
elif byte_error is not None:
error = srsly.msgpack_loads(byte_error)
@@ -1590,7 +1653,7 @@ class Language:
# components don't receive the pipeline then. So this does have to be
# here :(
for i, (name1, proc1) in enumerate(self.pipeline):
- if hasattr(proc1, "find_listeners"):
+ if isinstance(proc1, ty.ListenedToComponent):
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@@ -1698,7 +1761,7 @@ class Language:
# them here so they're only loaded once
source_nlps = {}
source_nlp_vectors_hashes = {}
- nlp.meta["_sourced_vectors_hashes"] = {}
+ vocab_b = None
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
@@ -1721,14 +1784,22 @@ class Language:
raw_config=raw_config,
)
else:
+ # We need the sourced components to reference the same
+ # vocab without modifying the current vocab state **AND**
+ # we still want to load the source model vectors to perform
+ # the vectors check. Since the source vectors clobber the
+ # current ones, we save the original vocab state and
+ # restore after this loop. Existing strings are preserved
+ # during deserialization, so they do not need any
+ # additional handling.
+ if vocab_b is None:
+ vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
model = pipe_cfg["source"]
if model not in source_nlps:
- # We only need the components here and we intentionally
- # do not load the model with the same vocab because
- # this would cause the vectors to be copied into the
- # current nlp object (all the strings will be added in
- # create_pipe_from_source)
- source_nlps[model] = util.load_model(model)
+ # Load with the same vocab, adding any strings
+ source_nlps[model] = util.load_model(
+ model, vocab=nlp.vocab, exclude=["lookups"]
+ )
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
@@ -1745,14 +1816,21 @@ class Language:
)
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
- source_nlps[model].vocab.vectors.to_bytes()
+ source_nlps[model].vocab.vectors.to_bytes(
+ exclude=["strings"]
+ )
)
+ if "_sourced_vectors_hashes" not in nlp.meta:
+ nlp.meta["_sourced_vectors_hashes"] = {}
nlp.meta["_sourced_vectors_hashes"][
pipe_name
] = source_nlp_vectors_hashes[model]
# Delete from cache if listeners were replaced
if listeners_replaced:
del source_nlps[model]
+ # Restore the original vocab after sourcing if necessary
+ if vocab_b is not None:
+ nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"]
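
Config-based sourcing now reuses the current vocab and restores its original
state once the vectors hash check is done; add_pipe(source=...) is the runtime
equivalent. A usage sketch, assuming the en_core_web_sm package is installed:

    import spacy

    source_nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")
    nlp.add_pipe("ner", source=source_nlp)  # copies the trained component
    print(nlp.pipe_names)
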
@@ -1765,25 +1843,25 @@ class Language:
)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
- # Remove listeners not in the pipeline
- listener_names = getattr(proc, "listening_components", [])
- unused_listener_names = [
- ll for ll in listener_names if ll not in nlp.pipe_names
- ]
- for listener_name in unused_listener_names:
- for listener in proc.listener_map.get(listener_name, []):
- proc.remove_listener(listener, listener_name)
+ if isinstance(proc, ty.ListenedToComponent):
+ # Remove listeners not in the pipeline
+ listener_names = proc.listening_components
+ unused_listener_names = [
+ ll for ll in listener_names if ll not in nlp.pipe_names
+ ]
+ for listener_name in unused_listener_names:
+ for listener in proc.listener_map.get(listener_name, []):
+ proc.remove_listener(listener, listener_name)
- for listener in getattr(
- proc, "listening_components", []
- ): # e.g. tok2vec/transformer
- # If it's a component sourced from another pipeline, we check if
- # the tok2vec listeners should be replaced with standalone tok2vec
- # models (e.g. so component can be frozen without its performance
- # degrading when other components/tok2vec are updated)
- paths = sourced.get(listener, {}).get("replace_listeners", [])
- if paths:
- nlp.replace_listeners(name, listener, paths)
+ for listener_name in proc.listening_components:
+ # e.g. tok2vec/transformer
+ # If it's a component sourced from another pipeline, we check if
+ # the tok2vec listeners should be replaced with standalone tok2vec
+ # models (e.g. so component can be frozen without its performance
+ # degrading when other components/tok2vec are updated)
+ paths = sourced.get(listener_name, {}).get("replace_listeners", [])
+ if paths:
+ nlp.replace_listeners(name, listener_name, paths)
return nlp
def replace_listeners(
@@ -1833,20 +1911,15 @@ class Language:
raise ValueError(err)
tok2vec = self.get_pipe(tok2vec_name)
tok2vec_cfg = self.get_pipe_config(tok2vec_name)
- tok2vec_model = tok2vec.model
- if (
- not hasattr(tok2vec, "model")
- or not hasattr(tok2vec, "listener_map")
- or not hasattr(tok2vec, "remove_listener")
- or "model" not in tok2vec_cfg
- ):
+ if not isinstance(tok2vec, ty.ListenedToComponent):
raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
+ tok2vec_model = tok2vec.model
pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
- if len(listeners) != len(pipe_listeners):
+ if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update
# the nodes and generate a matching config
@@ -1880,7 +1953,7 @@ class Language:
new_model = tok2vec_model.copy()
if "replace_listener" in tok2vec_model.attrs:
new_model = tok2vec_model.attrs["replace_listener"](new_model)
- util.replace_model_node(pipe.model, listener, new_model)
+ util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name)
def to_disk(
@@ -1891,13 +1964,13 @@ class Language:
path (str / Path): Path to a directory, which will be created if
it doesn't exist.
- exclude (list): Names of components or serialization fields to exclude.
+ exclude (Iterable[str]): Names of components or serialization fields to exclude.
DOCS: https://spacy.io/api/language#to_disk
"""
path = util.ensure_path(path)
serializers = {}
- serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
+ serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
p, exclude=["vocab"]
)
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
@@ -1907,8 +1980,8 @@ class Language:
continue
if not hasattr(proc, "to_disk"):
continue
- serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
- serializers["vocab"] = lambda p: self.vocab.to_disk(p)
+ serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) # type: ignore[misc]
+ serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
util.to_disk(path, serializers, exclude)
def from_disk(
@@ -1923,7 +1996,7 @@ class Language:
model will be loaded.
path (str / Path): A path to a directory.
- exclude (list): Names of components or serialization fields to exclude.
+ exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object.
DOCS: https://spacy.io/api/language#from_disk
@@ -1939,17 +2012,17 @@ class Language:
def deserialize_vocab(path: Path) -> None:
if path.exists():
- self.vocab.from_disk(path)
+ self.vocab.from_disk(path, exclude=exclude)
path = util.ensure_path(path)
deserializers = {}
- if Path(path / "config.cfg").exists():
+ if Path(path / "config.cfg").exists(): # type: ignore[operator]
deserializers["config.cfg"] = lambda p: self.config.from_disk(
p, interpolate=False, overrides=overrides
)
- deserializers["meta.json"] = deserialize_meta
- deserializers["vocab"] = deserialize_vocab
- deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
+ deserializers["meta.json"] = deserialize_meta # type: ignore[assignment]
+ deserializers["vocab"] = deserialize_vocab # type: ignore[assignment]
+ deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr]
p, exclude=["vocab"]
)
for name, proc in self._components:
@@ -1957,28 +2030,28 @@ class Language:
continue
if not hasattr(proc, "from_disk"):
continue
- deserializers[name] = lambda p, proc=proc: proc.from_disk(
+ deserializers[name] = lambda p, proc=proc: proc.from_disk( # type: ignore[misc]
p, exclude=["vocab"]
)
- if not (path / "vocab").exists() and "vocab" not in exclude:
+ if not (path / "vocab").exists() and "vocab" not in exclude: # type: ignore[operator]
# Convert to list here in case exclude is (default) tuple
exclude = list(exclude) + ["vocab"]
- util.from_disk(path, deserializers, exclude)
- self._path = path
+ util.from_disk(path, deserializers, exclude) # type: ignore[arg-type]
+ self._path = path # type: ignore[assignment]
self._link_components()
return self
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the current state to a binary string.
- exclude (list): Names of components or serialization fields to exclude.
+ exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Language` object.
DOCS: https://spacy.io/api/language#to_bytes
"""
- serializers = {}
- serializers["vocab"] = lambda: self.vocab.to_bytes()
- serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
+ serializers: Dict[str, Callable[[], bytes]] = {}
+ serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
+ serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self._components:
@@ -1986,7 +2059,7 @@ class Language:
continue
if not hasattr(proc, "to_bytes"):
continue
- serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
+ serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) # type: ignore[misc]
return util.to_bytes(serializers, exclude)
def from_bytes(
@@ -1995,7 +2068,7 @@ class Language:
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
- exclude (list): Names of components or serialization fields to exclude.
+ exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (Language): The `Language` object.
DOCS: https://spacy.io/api/language#from_bytes
@@ -2008,13 +2081,13 @@ class Language:
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
- deserializers = {}
+ deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
b, interpolate=False
)
deserializers["meta.json"] = deserialize_meta
- deserializers["vocab"] = self.vocab.from_bytes
- deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
+ deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
+ deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr]
b, exclude=["vocab"]
)
for name, proc in self._components:
@@ -2022,7 +2095,7 @@ class Language:
continue
if not hasattr(proc, "from_bytes"):
continue
- deserializers[name] = lambda b, proc=proc: proc.from_bytes(
+ deserializers[name] = lambda b, proc=proc: proc.from_bytes( # type: ignore[misc]
b, exclude=["vocab"]
)
util.from_bytes(bytes_data, deserializers, exclude)
@@ -2044,7 +2117,7 @@ class FactoryMeta:
requires: Iterable[str] = tuple()
retokenizes: bool = False
scores: Iterable[str] = tuple()
- default_score_weights: Optional[Dict[str, float]] = None # noqa: E704
+ default_score_weights: Optional[Dict[str, Optional[float]]] = None # noqa: E704
class DisabledPipes(list):
@@ -2083,16 +2156,17 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]:
def _apply_pipes(
- make_doc: Callable[[str], Doc],
- pipes: Iterable[Callable[[Doc], Doc]],
+ ensure_doc: Callable[[Union[str, Doc]], Doc],
+ pipes: Iterable[Callable[..., Iterator[Doc]]],
receiver,
sender,
underscore_state: Tuple[dict, dict, dict],
) -> None:
"""Worker for Language.pipe
- make_doc (Callable[[str,] Doc]): Function to create Doc from text.
- pipes (Iterable[Callable[[Doc], Doc]]): The components to apply.
+ ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
+ or raise an error if the input is neither a Doc nor a string.
+ pipes (Iterable[Pipe]): The components to apply.
receiver (multiprocessing.Connection): Pipe to receive text. Usually
created by `multiprocessing.Pipe()`
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
@@ -2104,16 +2178,16 @@ def _apply_pipes(
while True:
try:
texts = receiver.get()
- docs = (make_doc(text) for text in texts)
+ docs = (ensure_doc(text) for text in texts)
for pipe in pipes:
- docs = pipe(docs)
+ docs = pipe(docs) # type: ignore[arg-type, assignment]
# Connection does not accept unpickable objects, so send list.
- byte_docs = [(doc.to_bytes(), None) for doc in docs]
- padding = [(None, None)] * (len(texts) - len(byte_docs))
- sender.send(byte_docs + padding)
+ byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
+ padding = [(None, None, None)] * (len(texts) - len(byte_docs))
+ sender.send(byte_docs + padding) # type: ignore[operator]
except Exception:
- error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
- padding = [(None, None)] * (len(texts) - 1)
+ error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
+ padding = [(None, None, None)] * (len(texts) - 1)
sender.send(error_msg + padding)
diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
new file mode 100644
index 000000000..4fcaa82cf
--- /dev/null
+++ b/spacy/lexeme.pyi
@@ -0,0 +1,61 @@
+from typing import (
+ Union,
+ Any,
+)
+from thinc.types import Floats1d
+from .tokens import Doc, Span, Token
+from .vocab import Vocab
+
+class Lexeme:
+ def __init__(self, vocab: Vocab, orth: int) -> None: ...
+ def __richcmp__(self, other: Lexeme, op: int) -> bool: ...
+ def __hash__(self) -> int: ...
+ def set_attrs(self, **attrs: Any) -> None: ...
+ def set_flag(self, flag_id: int, value: bool) -> None: ...
+ def check_flag(self, flag_id: int) -> bool: ...
+ def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
+ @property
+ def has_vector(self) -> bool: ...
+ @property
+ def vector_norm(self) -> float: ...
+ vector: Floats1d
+ rank: int
+ sentiment: float
+ @property
+ def orth_(self) -> str: ...
+ @property
+ def text(self) -> str: ...
+ lower: str
+ norm: int
+ shape: int
+ prefix: int
+ suffix: int
+ cluster: int
+ lang: int
+ prob: float
+ lower_: str
+ norm_: str
+ shape_: str
+ prefix_: str
+ suffix_: str
+ lang_: str
+ flags: int
+ @property
+ def is_oov(self) -> bool: ...
+ is_stop: bool
+ is_alpha: bool
+ is_ascii: bool
+ is_digit: bool
+ is_lower: bool
+ is_upper: bool
+ is_title: bool
+ is_punct: bool
+ is_space: bool
+ is_bracket: bool
+ is_quote: bool
+ is_left_punct: bool
+ is_right_punct: bool
+ is_currency: bool
+ like_url: bool
+ like_num: bool
+ like_email: bool
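
The new stub exposes Lexeme's attributes to static type checkers. A trivial
usage sketch:

    import spacy

    nlp = spacy.blank("en")
    lexeme = nlp.vocab["apple"]
    print(lexeme.text, lexeme.is_alpha, lexeme.like_num, lexeme.shape_)
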
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 3564b6e42..6c66effde 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -130,8 +130,10 @@ cdef class Lexeme:
return 0.0
vector = self.vector
xp = get_array_module(vector)
- return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
-
+ result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+ # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+ return result.item()
+
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
@@ -284,7 +286,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.lower]
- def __set__(self, unicode x):
+ def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
@@ -294,7 +296,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.norm]
- def __set__(self, unicode x):
+ def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
@@ -304,7 +306,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.shape]
- def __set__(self, unicode x):
+ def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
@@ -314,7 +316,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.prefix]
- def __set__(self, unicode x):
+ def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
@@ -324,7 +326,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.suffix]
- def __set__(self, unicode x):
+ def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
@@ -332,7 +334,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.lang]
- def __set__(self, unicode x):
+ def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:
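
Calling .item() on the dot-product result returns a plain Python scalar on GPU
as well, where cupy would otherwise hand back a 0-d array. A sketch, assuming a
pipeline with vectors such as en_core_web_md is installed:

    import spacy

    nlp = spacy.load("en_core_web_md")
    score = nlp.vocab["apple"].similarity(nlp.vocab["orange"])
    print(type(score), score)  # a plain float, on CPU and GPU alike
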
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 025afa04b..b2f3dc15e 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Union, Optional
+from typing import Any, List, Union, Optional, Dict
from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
@@ -34,9 +34,9 @@ def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups"
if table not in data:
if strict:
raise ValueError(Errors.E955.format(table=table, lang=lang))
- language_data = {}
+ language_data = {} # type: ignore[var-annotated]
else:
- language_data = load_language_data(data[table])
+ language_data = load_language_data(data[table]) # type: ignore[assignment]
lookups.add_table(table, language_data)
return lookups
@@ -116,7 +116,7 @@ class Table(OrderedDict):
key = get_string_id(key)
return OrderedDict.get(self, key, default)
- def __contains__(self, key: Union[str, int]) -> bool:
+ def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
"""Check whether a key is in the table. String keys will be hashed.
key (str / int): The key to check.
@@ -172,7 +172,7 @@ class Lookups:
DOCS: https://spacy.io/api/lookups#init
"""
- self._tables = {}
+ self._tables: Dict[str, Table] = {}
def __contains__(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name. Delegates to
diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi
new file mode 100644
index 000000000..c19d3a71c
--- /dev/null
+++ b/spacy/matcher/dependencymatcher.pyi
@@ -0,0 +1,66 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from .matcher import Matcher
+from ..vocab import Vocab
+from ..tokens.doc import Doc
+from ..tokens.span import Span
+
+class DependencyMatcher:
+ """Match dependency parse tree based on pattern rules."""
+
+ _patterns: Dict[str, List[Any]]
+ _raw_patterns: Dict[str, List[Any]]
+ _tokens_to_key: Dict[str, List[Any]]
+ _root: Dict[str, List[Any]]
+ _tree: Dict[str, List[Any]]
+ _callbacks: Dict[
+ Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+ ]
+ _ops: Dict[str, Any]
+ vocab: Vocab
+ _matcher: Matcher
+ def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
+ def __reduce__(
+ self,
+ ) -> Tuple[
+ Callable[
+ [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
+ ],
+ Tuple[
+ Vocab,
+ Dict[str, List[Any]],
+ Dict[
+ str,
+ Callable[
+ [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
+ ],
+ ],
+ ],
+ None,
+ None,
+ ]: ...
+ def __len__(self) -> int: ...
+ def __contains__(self, key: Union[str, int]) -> bool: ...
+ def add(
+ self,
+ key: Union[str, int],
+ patterns: List[List[Dict[str, Any]]],
+ *,
+ on_match: Optional[
+ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+ ] = ...
+ ) -> None: ...
+ def has_key(self, key: Union[str, int]) -> bool: ...
+ def get(
+ self, key: Union[str, int], default: Optional[Any] = ...
+ ) -> Tuple[
+ Optional[
+ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+ ],
+ List[List[Dict[str, Any]]],
+ ]: ...
+ def remove(self, key: Union[str, int]) -> None: ...
+ def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
+
+def unpickle_matcher(
+ vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
+) -> DependencyMatcher: ...
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index b6e84a5da..a602ba737 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -3,7 +3,6 @@ from typing import List
from collections import defaultdict
from itertools import product
-import numpy
import warnings
from .matcher cimport Matcher
@@ -122,13 +121,15 @@ cdef class DependencyMatcher:
raise ValueError(Errors.E099.format(key=key))
visited_nodes[relation["RIGHT_ID"]] = True
else:
- if not(
- "RIGHT_ID" in relation
- and "RIGHT_ATTRS" in relation
- and "REL_OP" in relation
- and "LEFT_ID" in relation
- ):
- raise ValueError(Errors.E100.format(key=key))
+ required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"}
+ relation_keys = set(relation.keys())
+ missing = required_keys - relation_keys
+ if missing:
+ missing_txt = ", ".join(list(missing))
+ raise ValueError(Errors.E100.format(
+ required=required_keys,
+ missing=missing_txt
+ ))
if (
relation["RIGHT_ID"] in visited_nodes
or relation["LEFT_ID"] not in visited_nodes
@@ -147,9 +148,9 @@ cdef class DependencyMatcher:
Creates a token key to be used by the matcher
"""
return self._normalize_key(
- unicode(key) + DELIMITER +
- unicode(pattern_idx) + DELIMITER +
- unicode(token_idx)
+ str(key) + DELIMITER +
+ str(pattern_idx) + DELIMITER +
+ str(token_idx)
)
def add(self, key, patterns, *, on_match=None):
@@ -175,28 +176,23 @@ cdef class DependencyMatcher:
self._callbacks[key] = on_match
# Add 'RIGHT_ATTRS' to self._patterns[key]
- _patterns = []
- for pattern in patterns:
- token_patterns = []
- for i in range(len(pattern)):
- token_pattern = [pattern[i]["RIGHT_ATTRS"]]
- token_patterns.append(token_pattern)
- _patterns.append(token_patterns)
+ _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
+ pattern_offset = len(self._patterns[key])
self._patterns[key].extend(_patterns)
# Add each node pattern of all the input patterns individually to the
# matcher. This enables only a single instance of Matcher to be used.
# Multiple adds are required to track each node pattern.
tokens_to_key_list = []
- for i in range(len(_patterns)):
+ for i, current_patterns in enumerate(_patterns, start=pattern_offset):
# Preallocate list space
- tokens_to_key = [None]*len(_patterns[i])
+ tokens_to_key = [None] * len(current_patterns)
# TODO: Better ways to hash edges in pattern?
- for j in range(len(_patterns[i])):
+ for j, _pattern in enumerate(current_patterns):
k = self._get_matcher_key(key, i, j)
- self._matcher.add(k, [_patterns[i][j]])
+ self._matcher.add(k, [_pattern])
tokens_to_key[j] = k
tokens_to_key_list.append(tokens_to_key)
@@ -268,7 +264,9 @@ cdef class DependencyMatcher:
self._raw_patterns.pop(key)
self._tree.pop(key)
self._root.pop(key)
- self._tokens_to_key.pop(key)
+ for mklist in self._tokens_to_key.pop(key):
+ for mkey in mklist:
+ self._matcher.remove(mkey)
def _get_keys_to_position_maps(self, doc):
"""
@@ -333,7 +331,7 @@ cdef class DependencyMatcher:
# position of the matched tokens
for candidate_match in product(*all_positions):
- # A potential match is a valid match if all relationhips between the
+ # A potential match is a valid match if all relationships between the
# matched tokens are satisfied.
is_valid = True
for left_idx in range(len(candidate_match)):
@@ -420,21 +418,13 @@ cdef class DependencyMatcher:
return []
def _right_sib(self, doc, node):
- candidate_children = []
- for child in list(doc[node].head.children):
- if child.i > node:
- candidate_children.append(doc[child.i])
- return candidate_children
+ return [doc[child.i] for child in doc[node].head.children if child.i > node]
def _left_sib(self, doc, node):
- candidate_children = []
- for child in list(doc[node].head.children):
- if child.i < node:
- candidate_children.append(doc[child.i])
- return candidate_children
+ return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _normalize_key(self, key):
- if isinstance(key, basestring):
+ if isinstance(key, str):
return self.vocab.strings.add(key)
else:
return key
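
For reference, a minimal pattern that satisfies the stricter key validation added above: the anchor relation only needs RIGHT_ID and RIGHT_ATTRS, while every later relation must carry all four required keys, and E100 now reports exactly which ones are missing. This is a sketch; "en_core_web_sm" is only a placeholder for any pipeline with a dependency parser.

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")  # placeholder: any pipeline with a parser
    matcher = DependencyMatcher(nlp.vocab)
    pattern = [
        # anchor node: only RIGHT_ID and RIGHT_ATTRS are required
        {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
        # non-anchor nodes need LEFT_ID, REL_OP, RIGHT_ID and RIGHT_ATTRS
        {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "subject",
         "RIGHT_ATTRS": {"DEP": "nsubj"}},
    ]
    matcher.add("VERB_SUBJECT", [pattern])
    matches = matcher(nlp("She founded the company"))  # [(match_id, [verb_i, subject_i])]
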
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
new file mode 100644
index 000000000..390629ff8
--- /dev/null
+++ b/spacy/matcher/matcher.pyi
@@ -0,0 +1,54 @@
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union
+from typing import Iterator, Iterable, overload
+from ..compat import Literal
+from ..vocab import Vocab
+from ..tokens import Doc, Span
+
+class Matcher:
+ def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
+ def __reduce__(self) -> Any: ...
+ def __len__(self) -> int: ...
+ def __contains__(self, key: str) -> bool: ...
+ def add(
+ self,
+ key: Union[str, int],
+ patterns: List[List[Dict[str, Any]]],
+ *,
+ on_match: Optional[
+ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
+ ] = ...,
+ greedy: Optional[str] = ...
+ ) -> None: ...
+ def remove(self, key: str) -> None: ...
+ def has_key(self, key: Union[str, int]) -> bool: ...
+ def get(
+ self, key: Union[str, int], default: Optional[Any] = ...
+ ) -> Tuple[Optional[Callable[[Any], Any]], List[List[Dict[Any, Any]]]]: ...
+ def pipe(
+ self,
+ docs: Iterable[Tuple[Doc, Any]],
+ batch_size: int = ...,
+ return_matches: bool = ...,
+ as_tuples: bool = ...,
+ ) -> Union[
+ Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
+ ]: ...
+ @overload
+ def __call__(
+ self,
+ doclike: Union[Doc, Span],
+ *,
+ as_spans: Literal[False] = ...,
+ allow_missing: bool = ...,
+ with_alignments: bool = ...
+ ) -> List[Tuple[int, int, int]]: ...
+ @overload
+ def __call__(
+ self,
+ doclike: Union[Doc, Span],
+ *,
+ as_spans: Literal[True],
+ allow_missing: bool = ...,
+ with_alignments: bool = ...
+ ) -> List[Span]: ...
+ def _normalize_key(self, key: Any) -> Any: ...
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 7b1cfb633..6aa58f0e3 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@@ -96,12 +96,10 @@ cdef class Matcher:
by returning a non-overlapping set per key, either taking preference to
the first greedy match ("FIRST"), or the longest ("LONGEST").
- As of spaCy v2.2.2, Matcher.add supports the future API, which makes
- the patterns the second argument and a list (instead of a variable
- number of arguments). The on_match callback becomes an optional keyword
- argument.
+ Since spaCy v2.2.2, Matcher.add takes a list of patterns as the second
+ argument, and the on_match callback is an optional keyword argument.
- key (str): The match ID.
+ key (Union[str, int]): The match ID.
patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match.
greedy (str): Optional filter: "FIRST" or "LONGEST".
@@ -281,28 +279,19 @@ cdef class Matcher:
final_matches.append((key, *match))
# Mark tokens that have matched
memset(&matched[start], 1, span_len * sizeof(matched[0]))
- if with_alignments:
- final_matches_with_alignments = final_matches
- final_matches = [(key, start, end) for key, start, end, alignments in final_matches]
- # perform the callbacks on the filtered set of results
- for i, (key, start, end) in enumerate(final_matches):
- on_match = self._callbacks.get(key, None)
- if on_match is not None:
- on_match(self, doc, i, final_matches)
if as_spans:
- spans = []
- for key, start, end in final_matches:
+ final_results = []
+ for key, start, end, *_ in final_matches:
if isinstance(doclike, Span):
start += doclike.start
end += doclike.start
- spans.append(Span(doc, start, end, label=key))
- return spans
+ final_results.append(Span(doc, start, end, label=key))
elif with_alignments:
# convert alignments List[Dict[str, int]] --> List[int]
- final_matches = []
            # when multiple alignments of the same length are found, keep the
            # alignment with the largest token_idx
- for key, start, end, alignments in final_matches_with_alignments:
+ final_results = []
+ for key, start, end, alignments in final_matches:
sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False)
alignments = [0] * (end-start)
for align in sorted_alignments:
@@ -311,13 +300,19 @@ cdef class Matcher:
# Since alignments are sorted in order of (length, token_idx)
# this overwrites smaller token_idx when they have same length.
alignments[align['length']] = align['token_idx']
- final_matches.append((key, start, end, alignments))
- return final_matches
+ final_results.append((key, start, end, alignments))
+ final_matches = final_results # for callbacks
else:
- return final_matches
+ final_results = final_matches
+ # perform the callbacks on the filtered set of results
+ for i, (key, *_) in enumerate(final_matches):
+ on_match = self._callbacks.get(key, None)
+ if on_match is not None:
+ on_match(self, doc, i, final_matches)
+ return final_results
def _normalize_key(self, key):
- if isinstance(key, basestring):
+ if isinstance(key, str):
return self.vocab.strings.add(key)
else:
return key
@@ -340,7 +335,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
The "predicates" list contains functions that take a Python list and return a
boolean value. It's mostly used for regular expressions.
- The "extra_getters" list contains functions that take a Python list and return
+ The "extensions" list contains functions that take a Python list and return
an attr ID. It's mostly used for extension attributes.
"""
cdef vector[PatternStateC] states
@@ -365,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
for i, token in enumerate(doclike):
for name, index in extensions.items():
value = token._.get(name)
- if isinstance(value, basestring):
+ if isinstance(value, str):
value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value
# Main loop
@@ -791,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _get_attr_values(spec, string_store):
attr_values = []
for attr, value in spec.items():
- if isinstance(attr, basestring):
+ if isinstance(attr, str):
attr = attr.upper()
if attr == '_':
continue
@@ -802,8 +797,11 @@ def _get_attr_values(spec, string_store):
if attr == "IS_SENT_START":
attr = "SENT_START"
attr = IDS.get(attr)
- if isinstance(value, basestring):
- value = string_store.add(value)
+ if isinstance(value, str):
+ if attr == ENT_IOB and value in Token.iob_strings():
+ value = Token.iob_strings().index(value)
+ else:
+ value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
elif isinstance(value, int):
@@ -845,7 +843,7 @@ class _RegexPredicate:
class _SetPredicate:
- operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
+ operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
@@ -868,14 +866,16 @@ class _SetPredicate:
else:
value = get_token_attr_for_matcher(token.c, self.attr)
- if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+ if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
if self.attr == MORPH:
# break up MORPH into individual Feat=Val values
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
else:
- # IS_SUBSET for other attrs will be equivalent to "IN"
- # IS_SUPERSET will only match for other attrs with 0 or 1 values
- value = set([value])
+ # treat a single value as a list
+ if isinstance(value, (str, int)):
+ value = set([get_string_id(value)])
+ else:
+ value = set(get_string_id(v) for v in value)
if self.predicate == "IN":
return value in self.value
elif self.predicate == "NOT_IN":
@@ -884,6 +884,8 @@ class _SetPredicate:
return value <= self.value
elif self.predicate == "IS_SUPERSET":
return value >= self.value
+ elif self.predicate == "INTERSECTS":
+ return bool(value & self.value)
def __repr__(self):
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
@@ -928,6 +930,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
"NOT_IN": _SetPredicate,
"IS_SUBSET": _SetPredicate,
"IS_SUPERSET": _SetPredicate,
+ "INTERSECTS": _SetPredicate,
"==": _ComparisonPredicate,
"!=": _ComparisonPredicate,
">=": _ComparisonPredicate,
@@ -938,7 +941,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
for attr, value in spec.items():
- if isinstance(attr, basestring):
+ if isinstance(attr, str):
if attr == "_":
output.extend(
_get_extension_extra_predicates(
@@ -995,7 +998,7 @@ def _get_operators(spec):
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
# Fix casing
spec = {key.upper(): values for key, values in spec.items()
- if isinstance(key, basestring)}
+ if isinstance(key, str)}
if "OP" not in spec:
return (ONE,)
elif spec["OP"] in lookup:
@@ -1013,7 +1016,7 @@ def _get_extensions(spec, string_store, name2index):
if isinstance(value, dict):
            # Handle predicates (e.g. "IN") in the extra_predicates, not here.
continue
- if isinstance(value, basestring):
+ if isinstance(value, str):
value = string_store.add(value)
if name not in name2index:
name2index[name] = len(name2index)
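
The two functional changes in matcher.pyx above — the new INTERSECTS set predicate and the ENT_IOB string handling — surface in token patterns roughly as follows. This is a sketch that assumes a pipeline which fills in morphology and entities.

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load("en_core_web_sm")  # placeholder: any pipeline with morphology + NER
    matcher = Matcher(nlp.vocab)
    # INTERSECTS matches when the token's MORPH shares at least one Feat=Val
    # with the listed values (set intersection in _SetPredicate above).
    matcher.add(
        "SING_OR_PLUR_NOUN",
        [[{"POS": "NOUN", "MORPH": {"INTERSECTS": ["Number=Sing", "Number=Plur"]}}]],
    )
    # ENT_IOB string values ("B", "I", "O") are now mapped through
    # Token.iob_strings() instead of being hashed into the StringStore.
    matcher.add("ENT_START", [[{"ENT_IOB": "B"}]])
    matches = matcher(nlp("Apple opened two new stores in Berlin"))  # [(match_id, start, end)]
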
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
new file mode 100644
index 000000000..82a194835
--- /dev/null
+++ b/spacy/matcher/phrasematcher.pyi
@@ -0,0 +1,37 @@
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from ..compat import Literal
+from .matcher import Matcher
+from ..vocab import Vocab
+from ..tokens import Doc, Span
+
+class PhraseMatcher:
+ def __init__(
+ self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
+ ) -> None: ...
+ def __reduce__(self) -> Any: ...
+ def __len__(self) -> int: ...
+ def __contains__(self, key: str) -> bool: ...
+ def add(
+ self,
+ key: str,
+ docs: List[List[Dict[str, Any]]],
+ *,
+ on_match: Optional[
+ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
+ ] = ...,
+ ) -> None: ...
+ def remove(self, key: str) -> None: ...
+ @overload
+ def __call__(
+ self,
+ doclike: Union[Doc, Span],
+ *,
+ as_spans: Literal[False] = ...,
+ ) -> List[Tuple[int, int, int]]: ...
+ @overload
+ def __call__(
+ self,
+ doclike: Union[Doc, Span],
+ *,
+ as_spans: Literal[True],
+ ) -> List[Span]: ...
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index d8486b84b..2ff5105ad 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -157,9 +157,8 @@ cdef class PhraseMatcher:
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
- As of spaCy v2.2.2, PhraseMatcher.add supports the future API, which
- makes the patterns the second argument and a list (instead of a variable
- number of arguments). The on_match callback becomes an optional keyword
+ Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
+ second argument, with the on_match callback as an optional keyword
argument.
key (str): The match ID.
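
The same calling convention applies on the phrase-matcher side; a minimal sketch:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    # patterns are passed as a list of Doc objects; on_match is keyword-only
    matcher.add("PRODUCTS", [nlp.make_doc("Apple Watch"), nlp.make_doc("iPhone")])
    matches = matcher(nlp.make_doc("She bought an iPhone yesterday"))  # [(match_id, start, end)]
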
diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py
index c382d915b..fce8ae5af 100644
--- a/spacy/ml/__init__.py
+++ b/spacy/ml/__init__.py
@@ -1 +1,2 @@
+from .callbacks import create_models_with_nvtx_range # noqa: F401
from .models import * # noqa: F401, F403
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index 0ed28b859..e46735102 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -44,7 +44,7 @@ def forward(model: Model, docs: List[Doc], is_train: bool):
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
# incantation do I chant to get
# output[i, j, k] == data[j, ids[i, j], k]?
- doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
+ doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]] # type: ignore[call-overload, index]
output.append(doc_vectors.reshape((len(doc), nO)))
ids.append(doc_ids)
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
new file mode 100644
index 000000000..b0d088182
--- /dev/null
+++ b/spacy/ml/callbacks.py
@@ -0,0 +1,39 @@
+from functools import partial
+from typing import Type, Callable, TYPE_CHECKING
+
+from thinc.layers import with_nvtx_range
+from thinc.model import Model, wrap_model_recursive
+
+from ..util import registry
+
+if TYPE_CHECKING:
+ # This lets us add type hints for mypy etc. without causing circular imports
+ from ..language import Language # noqa: F401
+
+
+@registry.callbacks("spacy.models_with_nvtx_range.v1")
+def create_models_with_nvtx_range(
+ forward_color: int = -1, backprop_color: int = -1
+) -> Callable[["Language"], "Language"]:
+ def models_with_nvtx_range(nlp):
+ pipes = [
+ pipe
+ for _, pipe in nlp.components
+ if hasattr(pipe, "is_trainable") and pipe.is_trainable
+ ]
+
+        # We need to process all models jointly to avoid wrapping callbacks twice.
+ models = Model(
+ "wrap_with_nvtx_range",
+ forward=lambda model, X, is_train: ...,
+ layers=[pipe.model for pipe in pipes],
+ )
+
+ for node in models.walk():
+ with_nvtx_range(
+ node, forward_color=forward_color, backprop_color=backprop_color
+ )
+
+ return nlp
+
+ return models_with_nvtx_range
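
The registered callback above returns a Callable[[Language], Language], so it can either be resolved directly or attached to one of the nlp.* callback slots in the config. The sketch below assumes the [nlp.after_pipeline_creation] slot (which runs once all trainable pipes exist); it also assumes an NVTX-capable GPU setup for the ranges to actually be emitted.

    import spacy
    from spacy.util import registry

    # resolve the callback by name; in a config file the equivalent is an
    # [nlp.after_pipeline_creation] block with
    # @callbacks = "spacy.models_with_nvtx_range.v1"
    make_callback = registry.callbacks.get("spacy.models_with_nvtx_range.v1")
    wrap_with_nvtx = make_callback(forward_color=0, backprop_color=1)

    nlp = spacy.load("en_core_web_trf")  # placeholder: any trained pipeline
    nlp = wrap_with_nvtx(nlp)            # wraps every node of every trainable model
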
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index c1c2929fd..c9c82f369 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -6,7 +6,7 @@ from ..attrs import LOWER
@registry.layers("spacy.extract_ngrams.v1")
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
- model = Model("extract_ngrams", forward)
+ model: Model = Model("extract_ngrams", forward)
model.attrs["ngram_size"] = ngram_size
model.attrs["attr"] = attr
return model
@@ -19,7 +19,7 @@ def forward(model: Model, docs, is_train: bool):
unigrams = model.ops.asarray(doc.to_array([model.attrs["attr"]]))
ngrams = [unigrams]
for n in range(2, model.attrs["ngram_size"] + 1):
- ngrams.append(model.ops.ngrams(n, unigrams))
+ ngrams.append(model.ops.ngrams(n, unigrams)) # type: ignore[arg-type]
keys = model.ops.xp.concatenate(ngrams)
keys, vals = model.ops.xp.unique(keys, return_counts=True)
batch_keys.append(keys)
diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py
index 8afd1a3cc..edc86ff9c 100644
--- a/spacy/ml/extract_spans.py
+++ b/spacy/ml/extract_spans.py
@@ -28,13 +28,19 @@ def forward(
X, spans = source_spans
assert spans.dataXd.ndim == 2
indices = _get_span_indices(ops, spans, X.lengths)
- Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])
+ if len(indices) > 0:
+ Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index]
+ else:
+ Y = Ragged(
+ ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype),
+ ops.xp.zeros((len(X.lengths),), dtype="i"),
+ )
x_shape = X.dataXd.shape
x_lengths = X.lengths
def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]:
dX = Ragged(ops.alloc2f(*x_shape), x_lengths)
- ops.scatter_add(dX.dataXd, indices, dY.dataXd)
+ ops.scatter_add(dX.dataXd, indices, dY.dataXd) # type: ignore[arg-type]
return (dX, spans)
return Y, backprop_windows
@@ -51,9 +57,9 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
for i, length in enumerate(lengths):
spans_i = spans[i].dataXd + offset
for j in range(spans_i.shape[0]):
- indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))
+ indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index]
offset += length
- return ops.flatten(indices)
+ return ops.flatten(indices, dtype="i", ndim_if_empty=1)
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 645b67c62..831fee90f 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,16 +1,19 @@
from pathlib import Path
-from typing import Optional, Callable, Iterable
+from typing import Optional, Callable, Iterable, List
+from thinc.types import Floats2d
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear
from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...vocab import Vocab
-from ...tokens import Span
+from ...tokens import Span, Doc
@registry.architectures("spacy.EntityLinker.v1")
-def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
+def build_nel_encoder(
+ tok2vec: Model, nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
with Model.define_operators({">>": chain, "**": clone}):
token_width = tok2vec.maybe_get_dim("nO")
output_layer = Linear(nO=nO, nI=token_width)
@@ -18,7 +21,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
tok2vec
>> list2ragged()
>> reduce_mean()
- >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))
+ >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type]
>> output_layer
)
model.set_ref("output_layer", output_layer)
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index d4d2d638b..9e1face63 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -1,9 +1,11 @@
-from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
+from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
+from thinc.types import Floats2d
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance
+from thinc.loss import Loss
-from ...util import registry
+from ...util import registry, OOV_RANK
from ...errors import Errors
from ...attrs import ID
@@ -21,7 +23,7 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
- if vocab.vectors.data.shape[1] == 0:
+ if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -30,6 +32,7 @@ def create_pretrain_vectors(
return model
def create_vectors_loss() -> Callable:
+ distance: Loss
if loss == "cosine":
distance = CosineDistance(normalize=True, ignore_zeros=True)
return partial(get_vectors_loss, distance=distance)
@@ -70,6 +73,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
+ target[ids == OOV_RANK] = 0
d_target, loss = distance(prediction, target)
return loss, d_target
@@ -112,9 +116,9 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
- nO = vocab.vectors.data.shape[1]
+ nO = vocab.vectors.shape[1]
output_layer = chain(
- list2array(),
+ cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(
nO=hidden_size,
nI=tok2vec.get_dim("nO"),
@@ -135,10 +139,10 @@ def build_cloze_characters_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
) -> Model:
output_layer = chain(
- list2array(),
+ cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(nO=hidden_size, nP=maxout_pieces),
LayerNorm(nI=hidden_size),
- MultiSoftmax([256] * nr_char, nI=hidden_size),
+ MultiSoftmax([256] * nr_char, nI=hidden_size), # type: ignore[arg-type]
)
model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
model.set_ref("tok2vec", tok2vec)
@@ -170,7 +174,7 @@ def build_masked_language_model(
if wrapped.has_dim(dim):
model.set_dim(dim, wrapped.get_dim(dim))
- mlm_model = Model(
+ mlm_model: Model = Model(
"masked-language-model",
mlm_forward,
layers=[wrapped_model],
@@ -184,13 +188,19 @@ def build_masked_language_model(
class _RandomWords:
def __init__(self, vocab: "Vocab") -> None:
+ # Extract lexeme representations
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
- self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
- self.probs = self.probs[:10000]
- self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
- self.probs /= self.probs.sum()
- self._cache = []
+
+ # Compute normalized lexeme probabilities
+ probs = [lex.prob for lex in vocab if lex.prob != 0.0]
+ probs = probs[:10000]
+ probs: numpy.ndarray = numpy.exp(numpy.array(probs, dtype="f"))
+ probs /= probs.sum()
+ self.probs = probs
+
+ # Initialize cache
+ self._cache: List[int] = []
def next(self) -> str:
if not self._cache:
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 80751a695..63284e766 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, cast
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from thinc.types import Floats2d
@@ -56,7 +56,7 @@ def build_tb_parser_model(
non-linearity if use_upper=False.
use_upper (bool): Whether to use an additional hidden layer after the state
vector in order to predict the action scores. It is recommended to set
- this to False for large pretrained models such as transformers, and False
+ this to False for large pretrained models such as transformers, and True
for smaller networks. The upper layer is computed on CPU, which becomes
a bottleneck on larger GPU-based models, where it's also less necessary.
nO (int or None): The number of actions the model will predict between.
@@ -70,7 +70,11 @@ def build_tb_parser_model(
else:
raise ValueError(Errors.E917.format(value=state_type))
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
- tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
+ tok2vec = chain(
+ tok2vec,
+ cast(Model[List["Floats2d"], Floats2d], list2array()),
+ Linear(hidden_width, t2v_width),
+ )
tok2vec.set_dim("nO", hidden_width)
lower = _define_lower(
nO=hidden_width if use_upper else nO,
@@ -80,7 +84,7 @@ def build_tb_parser_model(
)
upper = None
if use_upper:
- with use_ops("numpy"):
+ with use_ops("cpu"):
# Initialize weights at zero, as it's a classification layer.
upper = _define_upper(nO=nO, nI=None)
return TransitionModel(tok2vec, lower, upper, resize_output)
@@ -110,7 +114,7 @@ def _resize_upper(model, new_nO):
smaller = upper
nI = smaller.maybe_get_dim("nI")
- with use_ops("numpy"):
+ with use_ops("cpu"):
larger = _define_upper(nO=new_nO, nI=nI)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py
index b3fd7bd98..29926c4fd 100644
--- a/spacy/ml/models/spancat.py
+++ b/spacy/ml/models/spancat.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Tuple, cast
from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
@@ -9,7 +9,7 @@ from ...tokens import Doc
from ..extract_spans import extract_spans
-@registry.layers.register("spacy.LinearLogistic.v1")
+@registry.layers("spacy.LinearLogistic.v1")
def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
"""An output layer for multi-label classification. It uses a linear layer
followed by a logistic activation.
@@ -17,18 +17,23 @@ def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
-@registry.layers.register("spacy.mean_max_reducer.v1")
+@registry.layers("spacy.mean_max_reducer.v1")
def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
"""Reduce sequences by concatenating their mean and max pooled vectors,
and then combine the concatenated vectors with a hidden layer.
"""
return chain(
- concatenate(reduce_last(), reduce_first(), reduce_mean(), reduce_max()),
+ concatenate(
+ cast(Model[Ragged, Floats2d], reduce_last()),
+ cast(Model[Ragged, Floats2d], reduce_first()),
+ reduce_mean(),
+ reduce_max(),
+ ),
Maxout(nO=hidden_size, normalize=True, dropout=0.0),
)
-@registry.architectures.register("spacy.SpanCategorizer.v1")
+@registry.architectures("spacy.SpanCategorizer.v1")
def build_spancat_model(
tok2vec: Model[List[Doc], List[Floats2d]],
reducer: Model[Ragged, Floats2d],
@@ -43,7 +48,12 @@ def build_spancat_model(
scorer (Model[Floats2d, Floats2d]): The scorer model.
"""
model = chain(
- with_getitem(0, chain(tok2vec, list2ragged())),
+ cast(
+ Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
+ with_getitem(
+ 0, chain(tok2vec, cast(Model[List[Floats2d], Ragged], list2ragged()))
+ ),
+ ),
extract_spans(),
reducer,
scorer,
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 87944e305..9c7fe042d 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -20,7 +20,7 @@ def build_tagger_model(
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
- softmax = with_array(output_layer)
+ softmax = with_array(output_layer) # type: ignore
model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec)
model.set_ref("softmax", output_layer)
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index e3f6e944a..c8c146f02 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -37,7 +37,7 @@ def build_simple_cnn_text_classifier(
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nI)
fill_defaults["b"] = NEG_VALUE
- resizable_layer = resizable(
+ resizable_layer: Model = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
@@ -59,7 +59,7 @@ def build_simple_cnn_text_classifier(
resizable_layer=resizable_layer,
)
model.set_ref("tok2vec", tok2vec)
- model.set_dim("nO", nO)
+ model.set_dim("nO", nO) # type: ignore # TODO: remove type ignore once Thinc has been updated
model.attrs["multi_label"] = not exclusive_classes
return model
@@ -85,7 +85,7 @@ def build_bow_text_classifier(
if not no_output_layer:
fill_defaults["b"] = NEG_VALUE
output_layer = softmax_activation() if exclusive_classes else Logistic()
- resizable_layer = resizable(
+ resizable_layer = resizable( # type: ignore[var-annotated]
sparse_linear,
resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
)
@@ -93,7 +93,7 @@ def build_bow_text_classifier(
model = with_cpu(model, model.ops)
if output_layer:
model = model >> with_cpu(output_layer, output_layer.ops)
- model.set_dim("nO", nO)
+ model.set_dim("nO", nO) # type: ignore[arg-type]
model.set_ref("output_layer", sparse_linear)
model.attrs["multi_label"] = not exclusive_classes
model.attrs["resize_output"] = partial(
@@ -130,14 +130,14 @@ def build_text_classifier_v2(
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:
- model.set_dim("nO", nO)
+ model.set_dim("nO", nO) # type: ignore[arg-type]
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.set_ref("attention_layer", attention_layer)
model.set_ref("maxout_layer", maxout_layer)
model.set_ref("norm_layer", norm_layer)
model.attrs["multi_label"] = not exclusive_classes
- model.init = init_ensemble_textcat
+ model.init = init_ensemble_textcat # type: ignore[assignment]
return model
@@ -164,7 +164,7 @@ def build_text_classifier_lowdata(
>> list2ragged()
>> ParametricAttention(width)
>> reduce_sum()
- >> residual(Relu(width, width)) ** 2
+ >> residual(Relu(width, width)) ** 2 # type: ignore[arg-type]
>> Linear(nO, width)
)
if dropout:
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 76ec87054..ecdf6be27 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,5 +1,5 @@
-from typing import Optional, List, Union
-from thinc.types import Floats2d
+from typing import Optional, List, Union, cast
+from thinc.types import Floats2d, Ints2d, Ragged
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
@@ -53,7 +53,7 @@ def build_hash_embed_cnn_tok2vec(
window_size (int): The number of tokens on either side to concatenate during
the convolutions. The receptive field of the CNN will be
depth * (window_size * 2 + 1), so a 4-layer network with window_size of
- 2 will be sensitive to 17 words at a time. Recommended value is 1.
+ 2 will be sensitive to 20 words at a time. Recommended value is 1.
embed_size (int): The number of rows in the hash embedding tables. This can
be surprisingly small, due to the use of the hash embeddings. Recommended
values are between 2000 and 10000.
@@ -123,7 +123,7 @@ def MultiHashEmbed(
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
account some subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in
- the representation as well, with the vectors table will be kept static
+ the representation as well, with the vectors table kept static
(i.e. it's not updated).
The `width` parameter specifies the output width of the layer and the widths
@@ -158,26 +158,30 @@ def MultiHashEmbed(
embeddings = [make_hash_embed(i) for i in range(len(attrs))]
concat_size = width * (len(embeddings) + include_static_vectors)
+ max_out: Model[Ragged, Ragged] = with_array(
+ Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) # type: ignore
+ )
if include_static_vectors:
+ feature_extractor: Model[List[Doc], Ragged] = chain(
+ FeatureExtractor(attrs),
+ cast(Model[List[Ints2d], Ragged], list2ragged()),
+ with_array(concatenate(*embeddings)),
+ )
model = chain(
concatenate(
- chain(
- FeatureExtractor(attrs),
- list2ragged(),
- with_array(concatenate(*embeddings)),
- ),
+ feature_extractor,
StaticVectors(width, dropout=0.0),
),
- with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
- ragged2list(),
+ max_out,
+ cast(Model[Ragged, List[Floats2d]], ragged2list()),
)
else:
model = chain(
FeatureExtractor(list(attrs)),
- list2ragged(),
+ cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
- with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
- ragged2list(),
+ max_out,
+ cast(Model[Ragged, List[Floats2d]], ragged2list()),
)
return model
@@ -220,37 +224,41 @@ def CharacterEmbed(
"""
feature = intify_attr(feature)
if feature is None:
- raise ValueError(Errors.E911(feat=feature))
+ raise ValueError(Errors.E911.format(feat=feature))
+ char_embed = chain(
+ _character_embed.CharacterEmbed(nM=nM, nC=nC),
+ cast(Model[List[Floats2d], Ragged], list2ragged()),
+ )
+ feature_extractor: Model[List[Doc], Ragged] = chain(
+ FeatureExtractor([feature]),
+ cast(Model[List[Ints2d], Ragged], list2ragged()),
+ with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore
+ )
+ max_out: Model[Ragged, Ragged]
if include_static_vectors:
+ max_out = with_array(
+ Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) # type: ignore
+ )
model = chain(
concatenate(
- chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
- chain(
- FeatureExtractor([feature]),
- list2ragged(),
- with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
- ),
+ char_embed,
+ feature_extractor,
StaticVectors(width, dropout=0.0),
),
- with_array(
- Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
- ),
- ragged2list(),
+ max_out,
+ cast(Model[Ragged, List[Floats2d]], ragged2list()),
)
else:
+ max_out = with_array(
+ Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) # type: ignore
+ )
model = chain(
concatenate(
- chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
- chain(
- FeatureExtractor([feature]),
- list2ragged(),
- with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
- ),
+ char_embed,
+ feature_extractor,
),
- with_array(
- Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
- ),
- ragged2list(),
+ max_out,
+ cast(Model[Ragged, List[Floats2d]], ragged2list()),
)
return model
@@ -281,10 +289,10 @@ def MaxoutWindowEncoder(
normalize=True,
),
)
- model = clone(residual(cnn), depth)
+ model = clone(residual(cnn), depth) # type: ignore[arg-type]
model.set_dim("nO", width)
receptive_field = window_size * depth
- return with_array(model, pad=receptive_field)
+ return with_array(model, pad=receptive_field) # type: ignore[arg-type]
@registry.architectures("spacy.MishWindowEncoder.v2")
@@ -305,9 +313,9 @@ def MishWindowEncoder(
expand_window(window_size=window_size),
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
)
- model = clone(residual(cnn), depth)
+ model = clone(residual(cnn), depth) # type: ignore[arg-type]
model.set_dim("nO", width)
- return with_array(model)
+ return with_array(model) # type: ignore[arg-type]
@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
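
The corrected figure in the build_hash_embed_cnn_tok2vec docstring follows directly from the formula stated there; a quick check:

    # receptive field of the CNN encoder: depth * (window_size * 2 + 1)
    depth, window_size = 4, 2
    receptive_field = depth * (window_size * 2 + 1)
    assert receptive_field == 20  # hence "sensitive to 20 words at a time"
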
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 4e7262e7d..8d9b1af9b 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,11 +1,13 @@
-from typing import List, Tuple, Callable, Optional, cast
+from typing import List, Tuple, Callable, Optional, Sequence, cast
from thinc.initializers import glorot_uniform_init
from thinc.util import partial
-from thinc.types import Ragged, Floats2d, Floats1d
+from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
from thinc.api import Model, Ops, registry
from ..tokens import Doc
from ..errors import Errors
+from ..vectors import Mode
+from ..vocab import Vocab
@registry.layers("spacy.StaticVectors.v2")
@@ -34,22 +36,34 @@ def StaticVectors(
def forward(
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
) -> Tuple[Ragged, Callable]:
- if not sum(len(doc) for doc in docs):
+ token_count = sum(len(doc) for doc in docs)
+ if not token_count:
return _handle_empty(model.ops, model.get_dim("nO"))
- key_attr = model.attrs["key_attr"]
- W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
- V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data))
- rows = model.ops.flatten(
- [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
+ key_attr: int = model.attrs["key_attr"]
+ keys: Ints1d = model.ops.flatten(
+ cast(Sequence, [doc.to_array(key_attr) for doc in docs])
)
+ vocab: Vocab = docs[0].vocab
+ W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
+ if vocab.vectors.mode == Mode.default:
+ V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
+ rows = vocab.vectors.find(keys=keys)
+ V = model.ops.as_contig(V[rows])
+ elif vocab.vectors.mode == Mode.floret:
+ V = cast(Floats2d, vocab.vectors.get_batch(keys))
+ V = model.ops.as_contig(V)
+ else:
+ raise RuntimeError(Errors.E896)
try:
- vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
+ vectors_data = model.ops.gemm(V, W, trans2=True)
except ValueError:
raise RuntimeError(Errors.E896)
- # Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
- vectors_data[rows < 0] = 0
+ if vocab.vectors.mode == Mode.default:
+ # Convert negative indices to 0-vectors
+ # TODO: more options for UNK tokens
+ vectors_data[rows < 0] = 0
output = Ragged(
- vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i")
+ vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
)
mask = None
if is_train:
@@ -62,7 +76,9 @@ def forward(
d_output.data *= mask
model.inc_grad(
"W",
- model.ops.gemm(d_output.data, model.ops.as_contig(V[rows]), trans1=True),
+ model.ops.gemm(
+ cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
+ ),
)
return []
@@ -78,7 +94,7 @@ def init(
nM = model.get_dim("nM") if model.has_dim("nM") else None
nO = model.get_dim("nO") if model.has_dim("nO") else None
if X is not None and len(X):
- nM = X[0].vocab.vectors.data.shape[1]
+ nM = X[0].vocab.vectors.shape[1]
if Y is not None:
nO = Y.data.shape[1]
@@ -97,4 +113,7 @@ def _handle_empty(ops: Ops, nO: int):
def _get_drop_mask(ops: Ops, nO: int, rate: Optional[float]) -> Optional[Floats1d]:
- return ops.get_dropout_mask((nO,), rate) if rate is not None else None
+ if rate is not None:
+ mask = ops.get_dropout_mask((nO,), rate)
+ return mask # type: ignore
+ return None
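
Outside the layer, the new two-mode lookup in forward() boils down to the sketch below, using only the Vectors calls that appear in the diff (find for the default table, get_batch for floret subword vectors). Treat it as illustrative CPU-only pseudocode rather than the layer's exact code path.

    import numpy
    from spacy.vectors import Mode

    def lookup_static_rows(vocab, keys):
        # keys: an array of lexeme hashes, e.g. doc.to_array("ORTH")
        if vocab.vectors.mode == Mode.default:
            rows = vocab.vectors.find(keys=keys)
            V = numpy.asarray(vocab.vectors.data)[rows]
            V[rows < 0] = 0   # unknown keys become zero vectors
        else:  # Mode.floret: every key gets a vector composed from subword hashes
            V = vocab.vectors.get_batch(keys)
        return V
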
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index d0362e7e1..245747061 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
+from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING
from wasabi import msg
from .tokens import Doc, Token, Span
@@ -67,7 +67,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
mapped to a list of component names.
"""
- result = {"assigns": [], "requires": []}
+ result: Dict[str, List[str]] = {"assigns": [], "requires": []}
for pipe_name in nlp.pipe_names:
meta = nlp.get_pipe_meta(pipe_name)
if attr in meta.assigns:
@@ -79,7 +79,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
def analyze_pipes(
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
-) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
+) -> Dict[str, Dict[str, Union[List[str], Dict]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and what they assign and require, as
well as any problems if available.
@@ -88,8 +88,11 @@ def analyze_pipes(
keys (List[str]): The meta keys to show in the table.
RETURNS (dict): A dict with "summary" and "problems".
"""
- result = {"summary": {}, "problems": {}}
- all_attrs = set()
+ result: Dict[str, Dict[str, Union[List[str], Dict]]] = {
+ "summary": {},
+ "problems": {},
+ }
+ all_attrs: Set[str] = set()
for i, name in enumerate(nlp.pipe_names):
meta = nlp.get_pipe_meta(name)
all_attrs.update(meta.assigns)
@@ -102,19 +105,18 @@ def analyze_pipes(
prev_meta = nlp.get_pipe_meta(prev_name)
for annot in prev_meta.assigns:
requires[annot] = True
- result["problems"][name] = []
- for annot, fulfilled in requires.items():
- if not fulfilled:
- result["problems"][name].append(annot)
+ result["problems"][name] = [
+ annot for annot, fulfilled in requires.items() if not fulfilled
+ ]
result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
return result
def print_pipe_analysis(
- analysis: Dict[str, Union[List[str], Dict[str, List[str]]]],
+ analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
*,
keys: List[str] = DEFAULT_KEYS,
-) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
+) -> None:
"""Print a formatted version of the pipe analysis produced by analyze_pipes.
analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
@@ -122,7 +124,7 @@ def print_pipe_analysis(
"""
msg.divider("Pipeline Overview")
header = ["#", "Component", *[key.capitalize() for key in keys]]
- summary = analysis["summary"].items()
+ summary: ItemsView = analysis["summary"].items()
body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
msg.table(body, header=header, divider=True, multiline=True)
n_problems = sum(len(p) for p in analysis["problems"].values())
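
From user code, this analysis is reached through Language.analyze_pipes, which returns the same summary/problems/attrs structure built above; a minimal sketch:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("tagger")
    analysis = nlp.analyze_pipes()  # {"summary": ..., "problems": ..., "attrs": ...}
    assert set(analysis) == {"summary", "problems", "attrs"}
    nlp.analyze_pipes(pretty=True)  # also prints the table via print_pipe_analysis
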
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 161f3ca48..27623e7c6 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,3 +1,4 @@
+from cython.operator cimport dereference as deref, preincrement as incr
from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t
@@ -184,16 +185,20 @@ cdef cppclass StateC:
int L(int head, int idx) nogil const:
if idx < 1 or this._left_arcs.size() == 0:
return -1
- cdef vector[int] lefts
- for i in range(this._left_arcs.size()):
- arc = this._left_arcs.at(i)
+
+ # Work backwards through left-arcs to find the arc at the
+ # requested index more quickly.
+ cdef size_t child_index = 0
+ it = this._left_arcs.const_rbegin()
+ while it != this._left_arcs.rend():
+ arc = deref(it)
if arc.head == head and arc.child != -1 and arc.child < head:
- lefts.push_back(arc.child)
- idx = (lefts.size()) - idx
- if idx < 0:
- return -1
- else:
- return lefts.at(idx)
+ child_index += 1
+ if child_index == idx:
+ return arc.child
+ incr(it)
+
+ return -1
int R(int head, int idx) nogil const:
if idx < 1 or this._right_arcs.size() == 0:
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 9ca702f9b..029e2e29e 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -17,7 +17,7 @@ from ...errors import Errors
from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000
-cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
+cdef attr_t SUBTOK_LABEL = hash_string('subtok')
DEF NON_MONOTONIC = True
@@ -585,7 +585,10 @@ cdef class ArcEager(TransitionSystem):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for example in kwargs.get('examples', []):
- heads, labels = example.get_aligned_parse(projectivize=True)
+ # use heads and labels from the reference parse (without regard to
+ # misalignments between the predicted and reference)
+ example_gold_preproc = Example(example.reference, example.reference)
+ heads, labels = example_gold_preproc.get_aligned_parse(projectivize=True)
for child, (head, label) in enumerate(zip(heads, labels)):
if head is None or label is None:
continue
@@ -601,7 +604,7 @@ cdef class ArcEager(TransitionSystem):
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
- for label, freq in list(label_freqs.items()):
+ for label, freq in label_freqs.copy().items():
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index a6efd5906..0d9494865 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -5,15 +5,15 @@ from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
-from ..training import validate_examples, Example
+from ..training import Example
from ..language import Language
from ..matcher import Matcher
from ..scorer import Scorer
-from ..symbols import IDS, TAG, POS, MORPH, LEMMA
+from ..symbols import IDS
from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab
-from ..util import SimpleFrozenList
+from ..util import SimpleFrozenList, registry
from .. import util
@@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
-@Language.factory("attribute_ruler", default_config={"validate": False})
-def make_attribute_ruler(nlp: Language, name: str, validate: bool):
- return AttributeRuler(nlp.vocab, name, validate=validate)
+@Language.factory(
+ "attribute_ruler",
+ default_config={
+ "validate": False,
+ "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
+ },
+)
+def make_attribute_ruler(
+ nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
+):
+ return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
+
+
+def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+ def morph_key_getter(token, attr):
+ return getattr(token, attr).key
+
+ results = {}
+ results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
+ results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+ results.update(
+ Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
+ )
+ results.update(
+ Scorer.score_token_attr_per_feat(
+ examples, "morph", getter=morph_key_getter, **kwargs
+ )
+ )
+ results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
+ return results
+
+
+@registry.scorers("spacy.attribute_ruler_scorer.v1")
+def make_attribute_ruler_scorer():
+ return attribute_ruler_score
class AttributeRuler(Pipe):
@@ -36,7 +68,12 @@ class AttributeRuler(Pipe):
"""
def __init__(
- self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
+ self,
+ vocab: Vocab,
+ name: str = "attribute_ruler",
+ *,
+ validate: bool = False,
+ scorer: Optional[Callable] = attribute_ruler_score,
) -> None:
"""Create the AttributeRuler. After creation, you can add patterns
with the `.initialize()` or `.add_patterns()` methods, or load patterns
@@ -45,6 +82,10 @@ class AttributeRuler(Pipe):
vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler".
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
+ "lemma" and Scorer.score_token_attr_per_feat for the attribute
+ "morph".
RETURNS (AttributeRuler): The AttributeRuler component.
@@ -54,9 +95,10 @@ class AttributeRuler(Pipe):
self.vocab = vocab
self.matcher = Matcher(self.vocab, validate=validate)
self.validate = validate
- self.attrs = []
- self._attrs_unnormed = [] # store for reference
- self.indices = []
+ self.attrs: List[Dict] = []
+ self._attrs_unnormed: List[Dict] = [] # store for reference
+ self.indices: List[int] = []
+ self.scorer = scorer
def clear(self) -> None:
"""Reset all patterns."""
@@ -102,13 +144,13 @@ class AttributeRuler(Pipe):
self.set_annotations(doc, matches)
return doc
except Exception as e:
- error_handler(self.name, self, [doc], e)
+ return error_handler(self.name, self, [doc], e)
def match(self, doc: Doc):
- matches = self.matcher(doc, allow_missing=True)
+ matches = self.matcher(doc, allow_missing=True, as_spans=False)
# Sort by the attribute ID, so that later rules have precedence
matches = [
- (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
+ (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches # type: ignore
]
matches.sort()
return matches
@@ -154,7 +196,7 @@ class AttributeRuler(Pipe):
else:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
- self.add([pattern], attrs)
+ self.add([pattern], attrs) # type: ignore[list-item]
def load_from_morph_rules(
self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@@ -178,7 +220,7 @@ class AttributeRuler(Pipe):
elif morph_attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
- self.add([pattern], attrs)
+ self.add([pattern], attrs) # type: ignore[list-item]
def add(
self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0
@@ -198,7 +240,7 @@ class AttributeRuler(Pipe):
# We need to make a string here, because otherwise the ID we pass back
# will be interpreted as the hash of a string, rather than an ordinal.
key = str(len(self.attrs))
- self.matcher.add(self.vocab.strings.add(key), patterns)
+ self.matcher.add(self.vocab.strings.add(key), patterns) # type: ignore[arg-type]
self._attrs_unnormed.append(attrs)
attrs = normalize_token_attrs(self.vocab, attrs)
self.attrs.append(attrs)
@@ -214,7 +256,7 @@ class AttributeRuler(Pipe):
DOCS: https://spacy.io/api/attributeruler#add_patterns
"""
for p in patterns:
- self.add(**p)
+ self.add(**p) # type: ignore[arg-type]
@property
def patterns(self) -> List[AttributeRulerPatternType]:
@@ -223,49 +265,10 @@ class AttributeRuler(Pipe):
for i in range(len(self.attrs)):
p = {}
p["patterns"] = self.matcher.get(str(i))[1]
- p["attrs"] = self._attrs_unnormed[i]
- p["index"] = self.indices[i]
+ p["attrs"] = self._attrs_unnormed[i] # type: ignore
+ p["index"] = self.indices[i] # type: ignore
all_patterns.append(p)
- return all_patterns
-
- def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by
- Scorer.score_token_attr for the attributes "tag", "pos", "morph"
- and "lemma" for the target token attributes.
-
- DOCS: https://spacy.io/api/tagger#score
- """
-
- def morph_key_getter(token, attr):
- return getattr(token, attr).key
-
- validate_examples(examples, "AttributeRuler.score")
- results = {}
- attrs = set()
- for token_attrs in self.attrs:
- attrs.update(token_attrs)
- for attr in attrs:
- if attr == TAG:
- results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
- elif attr == POS:
- results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
- elif attr == MORPH:
- results.update(
- Scorer.score_token_attr(
- examples, "morph", getter=morph_key_getter, **kwargs
- )
- )
- results.update(
- Scorer.score_token_attr_per_feat(
- examples, "morph", getter=morph_key_getter, **kwargs
- )
- )
- elif attr == LEMMA:
- results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
- return results
+ return all_patterns # type: ignore[return-value]
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring.
@@ -276,7 +279,7 @@ class AttributeRuler(Pipe):
DOCS: https://spacy.io/api/attributeruler#to_bytes
"""
serialize = {}
- serialize["vocab"] = self.vocab.to_bytes
+ serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
return util.to_bytes(serialize, exclude)
@@ -296,7 +299,7 @@ class AttributeRuler(Pipe):
self.add_patterns(srsly.msgpack_loads(b))
deserialize = {
- "vocab": lambda b: self.vocab.from_bytes(b),
+ "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
"patterns": load_patterns,
}
util.from_bytes(bytes_data, deserialize, exclude)
@@ -313,7 +316,7 @@ class AttributeRuler(Pipe):
DOCS: https://spacy.io/api/attributeruler#to_disk
"""
serialize = {
- "vocab": lambda p: self.vocab.to_disk(p),
+ "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
"patterns": lambda p: srsly.write_msgpack(p, self.patterns),
}
util.to_disk(path, serialize, exclude)
@@ -334,7 +337,7 @@ class AttributeRuler(Pipe):
self.add_patterns(srsly.read_msgpack(p))
deserialize = {
- "vocab": lambda p: self.vocab.from_disk(p),
+ "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
"patterns": load_patterns,
}
util.from_disk(path, deserialize, exclude)
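
With scoring moved into a registered function, the factory's new scorer setting can be swapped out without subclassing. A sketch; the registry name "my.tag_only_scorer.v1" is made up for illustration.

    from typing import Any, Dict, Iterable
    import spacy
    from spacy.scorer import Scorer
    from spacy.training import Example
    from spacy.util import registry

    @registry.scorers("my.tag_only_scorer.v1")  # hypothetical name
    def make_tag_only_scorer():
        def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
            return Scorer.score_token_attr(examples, "tag", **kwargs)
        return score

    nlp = spacy.blank("en")
    nlp.add_pipe(
        "attribute_ruler",
        config={"scorer": {"@scorers": "my.tag_only_scorer.v1"}},
    )
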
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index be23ab0dd..50c57ee5b 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
@@ -12,7 +12,7 @@ from ..language import Language
from ._parser_internals import nonproj
from ._parser_internals.nonproj import DELIMITER
from ..scorer import Scorer
-from ..training import validate_examples
+from ..util import registry
default_model_config = """
@@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
+ "scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
@@ -63,7 +64,8 @@ def make_parser(
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
- min_action_freq: int
+ min_action_freq: int,
+ scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
@@ -100,6 +102,7 @@ def make_parser(
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
+ scorer (Optional[Callable]): The scoring method.
"""
return DependencyParser(
nlp.vocab,
@@ -115,7 +118,8 @@ def make_parser(
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
- incorrect_spans_key=None
+ incorrect_spans_key=None,
+ scorer=scorer,
)
@Language.factory(
@@ -130,6 +134,7 @@ def make_parser(
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
+ "scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
@@ -151,6 +156,7 @@ def make_beam_parser(
beam_width: int,
beam_density: float,
beam_update_prob: float,
+ scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled
@@ -207,10 +213,41 @@ def make_beam_parser(
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
- incorrect_spans_key=None
+ incorrect_spans_key=None,
+ scorer=scorer,
)
+def parser_score(examples, **kwargs):
+ """Score a batch of examples.
+
+ examples (Iterable[Example]): The examples to score.
+ RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
+ and Scorer.score_deps.
+
+ DOCS: https://spacy.io/api/dependencyparser#score
+ """
+ def has_sents(doc):
+ return doc.has_annotation("SENT_START")
+
+ def dep_getter(token, attr):
+ dep = getattr(token, attr)
+ dep = token.vocab.strings.as_string(dep).lower()
+ return dep
+ results = {}
+ results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+ kwargs.setdefault("getter", dep_getter)
+ kwargs.setdefault("ignore_labels", ("p", "punct"))
+ results.update(Scorer.score_deps(examples, "dep", **kwargs))
+ del results["sents_per_type"]
+ return results
+
+
+@registry.scorers("spacy.parser_scorer.v1")
+def make_parser_scorer():
+ return parser_score
+
+
cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing.
@@ -233,6 +270,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
+ scorer=parser_score,
):
"""Create a DependencyParser.
"""
@@ -249,6 +287,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
+ scorer=scorer,
)
@property
@@ -281,31 +320,6 @@ cdef class DependencyParser(Parser):
labels.add(label)
return tuple(sorted(labels))
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
- and Scorer.score_deps.
-
- DOCS: https://spacy.io/api/dependencyparser#score
- """
- def has_sents(doc):
- return doc.has_annotation("SENT_START")
-
- validate_examples(examples, "DependencyParser.score")
- def dep_getter(token, attr):
- dep = getattr(token, attr)
- dep = token.vocab.strings.as_string(dep).lower()
- return dep
- results = {}
- results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
- kwargs.setdefault("getter", dep_getter)
- kwargs.setdefault("ignore_labels", ("p", "punct"))
- results.update(Scorer.score_deps(examples, "dep", **kwargs))
- del results["sents_per_type"]
- return results
-
def scored_parses(self, beams):
"""Return two dictionaries with scores for each beam/doc that was processed:
one containing (i, head) keys, and another containing (i, label) keys.
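Because the scoring logic now lives in a registered function, the parser's scorer can be swapped out from user code or config. A sketch using a made-up registry name, `custom_parser_scorer.v1`, for a scorer that reports attachment scores only:

    import spacy
    from spacy.scorer import Scorer
    from spacy.util import registry

    @registry.scorers("custom_parser_scorer.v1")
    def make_custom_parser_scorer():
        def custom_score(examples, **kwargs):
            # attachment scores only; labels are compared as lower-cased strings
            def dep_getter(token, attr):
                return token.vocab.strings.as_string(getattr(token, attr)).lower()
            kwargs.setdefault("getter", dep_getter)
            kwargs.setdefault("ignore_labels", ("p", "punct"))
            return Scorer.score_deps(examples, "dep", **kwargs)
        return custom_score

    nlp = spacy.blank("en")
    nlp.add_pipe("parser", config={"scorer": {"@scorers": "custom_parser_scorer.v1"}})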
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index ba7e71f15..1169e898d 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,4 +1,5 @@
-from typing import Optional, Iterable, Callable, Dict, Union, List
+from typing import Optional, Iterable, Callable, Dict, Union, List, Any
+from thinc.types import Floats2d
from pathlib import Path
from itertools import islice
import srsly
@@ -16,10 +17,12 @@ from ..language import Language
from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors, Warnings
-from ..util import SimpleFrozenList
+from ..util import SimpleFrozenList, registry
from .. import util
from ..scorer import Scorer
+# See #9050
+BACKWARD_OVERWRITE = True
default_model_config = """
[model]
@@ -50,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
+ "overwrite": True,
+ "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
},
default_score_weights={
"nel_micro_f": 1.0,
@@ -68,6 +73,8 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+ overwrite: bool,
+ scorer: Optional[Callable],
):
"""Construct an EntityLinker component.
@@ -81,6 +88,7 @@ def make_entity_linker(
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
+ scorer (Optional[Callable]): The scoring method.
"""
return EntityLinker(
nlp.vocab,
@@ -92,9 +100,20 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
+ overwrite=overwrite,
+ scorer=scorer,
)
+def entity_linker_score(examples, **kwargs):
+ return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
+
+
+@registry.scorers("spacy.entity_linker_scorer.v1")
+def make_entity_linker_scorer():
+ return entity_linker_score
+
+
class EntityLinker(TrainablePipe):
"""Pipeline component for named entity linking.
@@ -115,6 +134,8 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+ overwrite: bool = BACKWARD_OVERWRITE,
+ scorer: Optional[Callable] = entity_linker_score,
) -> None:
"""Initialize an entity linker.
@@ -129,6 +150,8 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
@@ -140,11 +163,12 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
- self.cfg = {}
+ self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
+ self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
@@ -166,7 +190,7 @@ class EntityLinker(TrainablePipe):
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
- kb_loader: Callable[[Vocab], KnowledgeBase] = None,
+ kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
@@ -261,7 +285,7 @@ class EntityLinker(TrainablePipe):
losses[self.name] += loss
return losses
- def get_loss(self, examples: Iterable[Example], sentence_encodings):
+ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker.get_loss")
entity_encodings = []
for eg in examples:
@@ -277,8 +301,9 @@ class EntityLinker(TrainablePipe):
method="get_loss", msg="gold entities do not match up"
)
raise RuntimeError(err)
- gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
- loss = self.distance.get_loss(sentence_encodings, entity_encodings)
+ # TODO: fix typing issue here
+ gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore
+ loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore
loss = loss / len(entity_encodings)
return float(loss), gradients
@@ -288,13 +313,13 @@ class EntityLinker(TrainablePipe):
no prediction.
docs (Iterable[Doc]): The documents to predict.
- RETURNS (List[int]): The models prediction for each document.
+ RETURNS (List[str]): The model's prediction for each document.
DOCS: https://spacy.io/api/entitylinker#predict
"""
self.validate_kb()
entity_count = 0
- final_kb_ids = []
+ final_kb_ids: List[str] = []
if not docs:
return final_kb_ids
if isinstance(docs, Doc):
@@ -324,7 +349,7 @@ class EntityLinker(TrainablePipe):
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
- candidates = self.get_candidates(self.kb, ent)
+ candidates = list(self.get_candidates(self.kb, ent))
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
@@ -382,23 +407,14 @@ class EntityLinker(TrainablePipe):
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
+ overwrite = self.cfg["overwrite"]
for doc in docs:
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
for token in ent:
- token.ent_kb_id_ = kb_id
-
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores.
-
- DOCS TODO: https://spacy.io/api/entity_linker#score
- """
- validate_examples(examples, "EntityLinker.score")
- return Scorer.score_links(examples, negative_labels=[self.NIL])
+ if token.ent_kb_id == 0 or overwrite:
+ token.ent_kb_id_ = kb_id
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
@@ -412,7 +428,7 @@ class EntityLinker(TrainablePipe):
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
- serialize["vocab"] = self.vocab.to_bytes
+ serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["kb"] = self.kb.to_bytes
serialize["model"] = self.model.to_bytes
return util.to_bytes(serialize, exclude)
@@ -436,7 +452,7 @@ class EntityLinker(TrainablePipe):
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
- deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+ deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
@@ -453,7 +469,7 @@ class EntityLinker(TrainablePipe):
DOCS: https://spacy.io/api/entitylinker#to_disk
"""
serialize = {}
- serialize["vocab"] = lambda p: self.vocab.to_disk(p)
+ serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["kb"] = lambda p: self.kb.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
@@ -478,8 +494,9 @@ class EntityLinker(TrainablePipe):
except AttributeError:
raise ValueError(Errors.E149) from None
- deserialize = {}
+ deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
+ deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
deserialize["kb"] = lambda p: self.kb.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
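The new `overwrite` setting controls whether KB IDs that are already set on tokens (for example by an earlier rule-based step) are replaced. A sketch of disabling it when adding the component:

    import spacy

    nlp = spacy.blank("en")
    # with overwrite disabled, tokens whose ent_kb_id is already set keep it
    nlp.add_pipe("entity_linker", config={"overwrite": False})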
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index bd1ebcb04..8f7be6994 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,5 +1,6 @@
import warnings
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
+from typing import cast
from collections import defaultdict
from pathlib import Path
import srsly
@@ -8,11 +9,10 @@ from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors, Warnings
-from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
+from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import get_ner_prf
-from ..training import validate_examples
DEFAULT_ENT_ID_SEP = "||"
@@ -27,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
+ "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
@@ -42,6 +43,7 @@ def make_entity_ruler(
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
+ scorer: Optional[Callable],
):
return EntityRuler(
nlp,
@@ -50,9 +52,19 @@ def make_entity_ruler(
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
+ scorer=scorer,
)
+def entity_ruler_score(examples, **kwargs):
+ return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+ return entity_ruler_score
+
+
class EntityRuler(Pipe):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
@@ -73,6 +85,7 @@ class EntityRuler(Pipe):
overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
patterns: Optional[List[PatternType]] = None,
+ scorer: Optional[Callable] = entity_ruler_score,
) -> None:
"""Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -93,14 +106,16 @@ class EntityRuler(Pipe):
overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ spacy.scorer.get_ner_prf.
DOCS: https://spacy.io/api/entityruler#init
"""
self.nlp = nlp
self.name = name
self.overwrite = overwrite_ents
- self.token_patterns = defaultdict(list)
- self.phrase_patterns = defaultdict(list)
+ self.token_patterns = defaultdict(list) # type: ignore
+ self.phrase_patterns = defaultdict(list) # type: ignore
self._validate = validate
self.matcher = Matcher(nlp.vocab, validate=validate)
self.phrase_matcher_attr = phrase_matcher_attr
@@ -108,9 +123,10 @@ class EntityRuler(Pipe):
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
self.ent_id_sep = ent_id_sep
- self._ent_ids = defaultdict(dict)
+ self._ent_ids = defaultdict(tuple) # type: ignore
if patterns is not None:
self.add_patterns(patterns)
+ self.scorer = scorer
def __len__(self) -> int:
"""The number of all patterns added to the entity ruler."""
@@ -136,19 +152,22 @@ class EntityRuler(Pipe):
self.set_annotations(doc, matches)
return doc
except Exception as e:
- error_handler(self.name, self, [doc], e)
+ return error_handler(self.name, self, [doc], e)
def match(self, doc: Doc):
self._require_patterns()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W036")
- matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
- matches = set(
+ matches = cast(
+ List[Tuple[int, int, int]],
+ list(self.matcher(doc)) + list(self.phrase_matcher(doc)),
+ )
+ final_matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end]
)
get_sort_key = lambda m: (m[2] - m[1], -m[1])
- matches = sorted(matches, key=get_sort_key, reverse=True)
- return matches
+ final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
+ return final_matches
def set_annotations(self, doc, matches):
"""Modify the document in place"""
@@ -213,10 +232,10 @@ class EntityRuler(Pipe):
"""
self.clear()
if patterns:
- self.add_patterns(patterns)
+ self.add_patterns(patterns) # type: ignore[arg-type]
@property
- def ent_ids(self) -> Tuple[str, ...]:
+ def ent_ids(self) -> Tuple[Optional[str], ...]:
"""All entity ids present in the match patterns `id` properties
RETURNS (set): The string entity ids.
@@ -301,17 +320,17 @@ class EntityRuler(Pipe):
if ent_id:
phrase_pattern["id"] = ent_id
phrase_patterns.append(phrase_pattern)
- for entry in token_patterns + phrase_patterns:
+ for entry in token_patterns + phrase_patterns: # type: ignore[operator]
label = entry["label"]
if "id" in entry:
ent_label = label
label = self._create_label(label, entry["id"])
key = self.matcher._normalize_key(label)
self._ent_ids[key] = (ent_label, entry["id"])
- pattern = entry["pattern"]
+ pattern = entry["pattern"] # type: ignore
if isinstance(pattern, Doc):
self.phrase_patterns[label].append(pattern)
- self.phrase_matcher.add(label, [pattern])
+ self.phrase_matcher.add(label, [pattern]) # type: ignore
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
self.matcher.add(label, [pattern])
@@ -322,18 +341,58 @@ class EntityRuler(Pipe):
"""Reset all patterns."""
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
- self._ent_ids = defaultdict(dict)
+ self._ent_ids = defaultdict(tuple)
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)
+ def remove(self, ent_id: str) -> None:
+ """Remove a pattern by its ent_id if a pattern with this ent_id was added before
+
+ ent_id (str): id of the pattern to be removed
+ RETURNS: None
+ DOCS: https://spacy.io/api/entityruler#remove
+ """
+ label_id_pairs = [
+ (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
+ ]
+ if not label_id_pairs:
+ raise ValueError(Errors.E1024.format(ent_id=ent_id))
+ created_labels = [
+ self._create_label(label, eid) for (label, eid) in label_id_pairs
+ ]
+ # remove the patterns from self.phrase_patterns
+ self.phrase_patterns = defaultdict(
+ list,
+ {
+ label: val
+ for (label, val) in self.phrase_patterns.items()
+ if label not in created_labels
+ },
+ )
+ # remove the patterns from self.token_patterns
+ self.token_patterns = defaultdict(
+ list,
+ {
+ label: val
+ for (label, val) in self.token_patterns.items()
+ if label not in created_labels
+ },
+ )
+ # remove the patterns from the token matcher and phrase matcher
+ for label in created_labels:
+ if label in self.phrase_matcher:
+ self.phrase_matcher.remove(label)
+ else:
+ self.matcher.remove(label)
+
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
if len(self) == 0:
warnings.warn(Warnings.W036.format(name=self.name))
- def _split_label(self, label: str) -> Tuple[str, str]:
+ def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
label (str): The value of label in a pattern entry
@@ -343,11 +402,12 @@ class EntityRuler(Pipe):
ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
else:
ent_label = label
- ent_id = None
+ ent_id = None # type: ignore
return ent_label, ent_id
- def _create_label(self, label: str, ent_id: str) -> str:
+ def _create_label(self, label: Any, ent_id: Any) -> str:
"""Join Entity label with ent_id if the pattern has an `id` attribute
+ If ent_id is not a string, the label is returned as is.
label (str): The label to set for ent.label_
ent_id (str): The label
@@ -357,10 +417,6 @@ class EntityRuler(Pipe):
label = f"{label}{self.ent_id_sep}{ent_id}"
return label
- def score(self, examples, **kwargs):
- validate_examples(examples, "EntityRuler.score")
- return get_ner_prf(examples)
-
def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
@@ -414,10 +470,16 @@ class EntityRuler(Pipe):
path = ensure_path(path)
self.clear()
depr_patterns_path = path.with_suffix(".jsonl")
- if depr_patterns_path.is_file():
+ if path.suffix == ".jsonl": # user provides a jsonl
+ if path.is_file():
+ patterns = srsly.read_jsonl(path)
+ self.add_patterns(patterns)
+ else:
+ raise ValueError(Errors.E1023.format(path=path))
+ elif depr_patterns_path.is_file():
patterns = srsly.read_jsonl(depr_patterns_path)
self.add_patterns(patterns)
- else:
+ elif path.is_dir(): # path is a valid directory
cfg = {}
deserializers_patterns = {
"patterns": lambda p: self.add_patterns(
@@ -434,6 +496,8 @@ class EntityRuler(Pipe):
self.nlp.vocab, attr=self.phrase_matcher_attr
)
from_disk(path, deserializers_patterns, {})
+ else: # path is not a valid directory or file
+ raise ValueError(Errors.E146.format(path=path))
return self
def to_disk(
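A short usage sketch of the new `EntityRuler.remove` method; the labels, patterns and IDs below are illustrative:

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([
        {"label": "ORG", "pattern": "Acme Corp", "id": "acme"},
        {"label": "GPE", "pattern": "Berlin", "id": "berlin"},
    ])
    ruler.remove("acme")  # removes all patterns registered under this ent_id
    doc = nlp("Acme Corp opened an office in Berlin.")
    print([(ent.text, ent.label_) for ent in doc.ents])  # only Berlin matches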
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 03c7db422..c005395bf 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,6 +1,8 @@
from typing import Dict, Any
import srsly
+import warnings
+from ..errors import Warnings
from ..language import Language
from ..matcher import Matcher
from ..tokens import Doc
@@ -25,7 +27,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:
attrs = {"tag": np.root.tag, "dep": np.root.dep}
- retokenizer.merge(np, attrs=attrs)
+ retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type]
return doc
@@ -45,7 +47,7 @@ def merge_entities(doc: Doc):
with doc.retokenize() as retokenizer:
for ent in doc.ents:
attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
- retokenizer.merge(ent, attrs=attrs)
+ retokenizer.merge(ent, attrs=attrs) # type: ignore[arg-type]
return doc
@@ -63,7 +65,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
merger = Matcher(doc.vocab)
merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]])
matches = merger(doc)
- spans = util.filter_spans([doc[start : end + 1] for _, start, end in matches])
+ spans = util.filter_spans([doc[start : end + 1] for _, start, end in matches]) # type: ignore[misc, operator]
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
@@ -93,11 +95,11 @@ class TokenSplitter:
if len(t.text) >= self.min_length:
orths = []
heads = []
- attrs = {}
+ attrs = {} # type: ignore[var-annotated]
for i in range(0, len(t.text), self.split_length):
orths.append(t.text[i : i + self.split_length])
heads.append((t, i / self.split_length))
- retokenizer.split(t, orths, heads, attrs)
+ retokenizer.split(t, orths, heads, attrs) # type: ignore[arg-type]
return doc
def _get_config(self) -> Dict[str, Any]:
@@ -136,3 +138,65 @@ class TokenSplitter:
"cfg": lambda p: self._set_config(srsly.read_json(p)),
}
util.from_disk(path, serializers, [])
+
+
+@Language.factory(
+ "doc_cleaner",
+ default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
+)
+def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
+ return DocCleaner(attrs, silent=silent)
+
+
+class DocCleaner:
+ def __init__(self, attrs: Dict[str, Any], *, silent: bool = True):
+ self.cfg: Dict[str, Any] = {"attrs": dict(attrs), "silent": silent}
+
+ def __call__(self, doc: Doc) -> Doc:
+ attrs: dict = self.cfg["attrs"]
+ silent: bool = self.cfg["silent"]
+ for attr, value in attrs.items():
+ obj = doc
+ parts = attr.split(".")
+ skip = False
+ for part in parts[:-1]:
+ if hasattr(obj, part):
+ obj = getattr(obj, part)
+ else:
+ skip = True
+ if not silent:
+ warnings.warn(Warnings.W116.format(attr=attr))
+ if not skip:
+ if hasattr(obj, parts[-1]):
+ setattr(obj, parts[-1], value)
+ else:
+ if not silent:
+ warnings.warn(Warnings.W116.format(attr=attr))
+ return doc
+
+ def to_bytes(self, **kwargs):
+ serializers = {
+ "cfg": lambda: srsly.json_dumps(self.cfg),
+ }
+ return util.to_bytes(serializers, [])
+
+ def from_bytes(self, data, **kwargs):
+ deserializers = {
+ "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+ }
+ util.from_bytes(data, deserializers, [])
+ return self
+
+ def to_disk(self, path, **kwargs):
+ path = util.ensure_path(path)
+ serializers = {
+ "cfg": lambda p: srsly.write_json(p, self.cfg),
+ }
+ return util.to_disk(path, serializers, [])
+
+ def from_disk(self, path, **kwargs):
+ path = util.ensure_path(path)
+ serializers = {
+ "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
+ }
+ util.from_disk(path, serializers, [])
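The new `doc_cleaner` component resets document attributes after the rest of the pipeline has run, typically to shrink docs before serialization. A sketch with a blank pipeline:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("doc_cleaner", config={"attrs": {"tensor": None}})
    doc = nlp("The tensor is dropped once processing is done.")
    print(doc.tensor)  # None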
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 87504fade..9c2fc2f09 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
-from ..training import validate_examples
-from ..util import logger, SimpleFrozenList
+from ..util import logger, SimpleFrozenList, registry
from .. import util
@Language.factory(
"lemmatizer",
assigns=["token.lemma"],
- default_config={"model": None, "mode": "lookup", "overwrite": False},
+ default_config={
+ "model": None,
+ "mode": "lookup",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
- nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+ return Lemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
+
+
+def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+ return Scorer.score_token_attr(examples, "lemma", **kwargs)
+
+
+@registry.scorers("spacy.lemmatizer_scorer.v1")
+def make_lemmatizer_scorer():
+ return lemmatizer_score
class Lemmatizer(Pipe):
@@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
*,
mode: str = "lookup",
overwrite: bool = False,
+ scorer: Optional[Callable] = lemmatizer_score,
) -> None:
"""Initialize a Lemmatizer.
@@ -69,6 +90,8 @@ class Lemmatizer(Pipe):
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_token_attr for the attribute "lemma".
DOCS: https://spacy.io/api/lemmatizer#init
"""
@@ -88,7 +111,8 @@ class Lemmatizer(Pipe):
if not hasattr(self, mode_attr):
raise ValueError(Errors.E1003.format(mode=mode))
self.lemmatize = getattr(self, mode_attr)
- self.cache = {}
+ self.cache = {} # type: ignore[var-annotated]
+ self.scorer = scorer
@property
def mode(self):
@@ -177,14 +201,14 @@ class Lemmatizer(Pipe):
DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
"""
- cache_key = (token.orth, token.pos, token.morph.key)
+ cache_key = (token.orth, token.pos, token.morph.key) # type: ignore[attr-defined]
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
if univ_pos == "":
- warnings.warn(Warnings.W108.format(text=string))
+ warnings.warn(Warnings.W108)
return [string.lower()]
# See Issue #435 for an example of where this logic is required.
if self.is_base_form(token):
@@ -247,17 +271,6 @@ class Lemmatizer(Pipe):
"""
return False
- def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores.
-
- DOCS: https://spacy.io/api/lemmatizer#score
- """
- validate_examples(examples, "Lemmatizer.score")
- return Scorer.score_token_attr(examples, "lemma", **kwargs)
-
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
):
@@ -269,7 +282,7 @@ class Lemmatizer(Pipe):
DOCS: https://spacy.io/api/lemmatizer#to_disk
"""
serialize = {}
- serialize["vocab"] = lambda p: self.vocab.to_disk(p)
+ serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude)
@@ -284,8 +297,8 @@ class Lemmatizer(Pipe):
DOCS: https://spacy.io/api/lemmatizer#from_disk
"""
- deserialize = {}
- deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
+ deserialize: Dict[str, Callable[[Any], Any]] = {}
+ deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude)
self._validate_tables()
@@ -300,7 +313,7 @@ class Lemmatizer(Pipe):
DOCS: https://spacy.io/api/lemmatizer#to_bytes
"""
serialize = {}
- serialize["vocab"] = self.vocab.to_bytes
+ serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude)
@@ -315,8 +328,8 @@ class Lemmatizer(Pipe):
DOCS: https://spacy.io/api/lemmatizer#from_bytes
"""
- deserialize = {}
- deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+ deserialize: Dict[str, Callable[[Any], Any]] = {}
+ deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
util.from_bytes(bytes_data, deserialize, exclude)
self._validate_tables()
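`overwrite` and `scorer` are now explicit factory settings for the lemmatizer rather than keyword defaults. A sketch of overriding `overwrite` when adding the component (the lookup tables themselves are only loaded at `nlp.initialize()`, which for English assumes `spacy-lookups-data` is installed):

    import spacy

    nlp = spacy.blank("en")
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup", "overwrite": True})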
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 3ba05e616..73d3799b1 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
-from typing import Optional, Union, Dict
+from typing import Optional, Union, Dict, Callable
import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice
@@ -17,7 +17,11 @@ from .tagger import Tagger
from .. import util
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
+from ..util import registry
+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
default_model_config = """
[model]
@@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
- default_config={"model": DEFAULT_MORPH_MODEL},
+ default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
+ overwrite: bool,
+ extend: bool,
+ scorer: Optional[Callable],
):
- return Morphologizer(nlp.vocab, model, name)
+ return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
+
+
+def morphologizer_score(examples, **kwargs):
+ def morph_key_getter(token, attr):
+ return getattr(token, attr).key
+
+ results = {}
+ results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+ results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+ results.update(Scorer.score_token_attr_per_feat(examples,
+ "morph", getter=morph_key_getter, **kwargs))
+ return results
+
+
+@registry.scorers("spacy.morphologizer_scorer.v1")
+def make_morphologizer_scorer():
+ return morphologizer_score
class Morphologizer(Tagger):
@@ -67,6 +91,10 @@ class Morphologizer(Tagger):
vocab: Vocab,
model: Model,
name: str = "morphologizer",
+ *,
+ overwrite: bool = BACKWARD_OVERWRITE,
+ extend: bool = BACKWARD_EXTEND,
+ scorer: Optional[Callable] = morphologizer_score,
):
"""Initialize a morphologizer.
@@ -74,6 +102,9 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_token_attr for the attributes "pos" and "morph" and
+ Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#init
"""
@@ -85,8 +116,14 @@ class Morphologizer(Tagger):
# store mappings from morph+POS labels to token-level annotations:
# 1) labels_morph stores a mapping from morph+POS->morph
# 2) labels_pos stores a mapping from morph+POS->POS
- cfg = {"labels_morph": {}, "labels_pos": {}}
+ cfg = {
+ "labels_morph": {},
+ "labels_pos": {},
+ "overwrite": overwrite,
+ "extend": extend,
+ }
self.cfg = dict(sorted(cfg.items()))
+ self.scorer = scorer
@property
def labels(self):
@@ -192,14 +229,35 @@ class Morphologizer(Tagger):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
+ cdef bint overwrite = self.cfg["overwrite"]
+ cdef bint extend = self.cfg["extend"]
+ labels = self.labels
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
- morph = self.labels[tag_id]
- doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
- doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
+ morph = labels[tag_id]
+ # set morph
+ if doc.c[j].morph == 0 or overwrite or extend:
+ if overwrite and extend:
+ # the predicted morph overwrites any conflicting existing
+ # features, while existing features that aren't predicted are kept
+ extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
+ extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
+ doc.c[j].morph = self.vocab.morphology.add(extended_morph)
+ elif extend:
+ # existing features are preserved and any new features
+ # are added
+ extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
+ extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
+ doc.c[j].morph = self.vocab.morphology.add(extended_morph)
+ else:
+ # clobber
+ doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+ # set POS
+ if doc.c[j].pos == 0 or overwrite:
+ doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
@@ -246,24 +304,3 @@ class Morphologizer(Tagger):
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
-
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by
- Scorer.score_token_attr for the attributes "pos" and "morph" and
- Scorer.score_token_attr_per_feat for the attribute "morph".
-
- DOCS: https://spacy.io/api/morphologizer#score
- """
- def morph_key_getter(token, attr):
- return getattr(token, attr).key
-
- validate_examples(examples, "Morphologizer.score")
- results = {}
- results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
- results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
- results.update(Scorer.score_token_attr_per_feat(examples,
- "morph", getter=morph_key_getter, **kwargs))
- return results
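A plain-Python sketch of the three `overwrite`/`extend` combinations handled in `set_annotations` above, using feature dicts in place of the interned morph analyses:

    existing = {"Number": "Sing", "Case": "Nom"}
    predicted = {"Number": "Plur", "Gender": "Fem"}

    # overwrite=True, extend=True: predicted values win, unseen existing features kept
    merged = dict(existing); merged.update(predicted)
    # -> {"Number": "Plur", "Case": "Nom", "Gender": "Fem"}

    # overwrite=False, extend=True: existing values win, new features are added
    extended = dict(predicted); extended.update(existing)
    # -> {"Number": "Sing", "Case": "Nom", "Gender": "Fem"}

    # overwrite=True, extend=False: the prediction replaces the analysis entirely
    replaced = dict(predicted)
    # -> {"Number": "Plur", "Gender": "Fem"}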
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index f4ae4b787..4835a8c4b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
@@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
+from ..util import registry
default_model_config = """
@@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
- "incorrect_spans_key": None
+ "incorrect_spans_key": None,
+ "scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@@ -52,7 +53,8 @@ def make_ner(
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
- incorrect_spans_key: Optional[str]=None
+ incorrect_spans_key: Optional[str],
+ scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
@@ -80,6 +82,7 @@ def make_ner(
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
+ scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
@@ -92,6 +95,7 @@ def make_ner(
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
+ scorer=scorer,
)
@Language.factory(
@@ -104,7 +108,8 @@ def make_ner(
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32,
- "incorrect_spans_key": None
+ "incorrect_spans_key": None,
+ "scorer": None,
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
@@ -117,7 +122,8 @@ def make_beam_ner(
beam_width: int,
beam_density: float,
beam_update_prob: float,
- incorrect_spans_key: Optional[str]=None
+ incorrect_spans_key: Optional[str],
+ scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
@@ -153,6 +159,7 @@ def make_beam_ner(
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities.
+ scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
@@ -164,10 +171,20 @@ def make_beam_ner(
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
- incorrect_spans_key=incorrect_spans_key
+ incorrect_spans_key=incorrect_spans_key,
+ scorer=scorer,
)
+def ner_score(examples, **kwargs):
+ return get_ner_prf(examples, **kwargs)
+
+
+@registry.scorers("spacy.ner_scorer.v1")
+def make_ner_scorer():
+ return ner_score
+
+
cdef class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition.
@@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
+ scorer=ner_score,
):
"""Create an EntityRecognizer.
"""
@@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
+ scorer=scorer,
)
def add_multitask_objective(self, mt_component):
@@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser):
if move[0] in ("B", "I", "L", "U"))
return tuple(sorted(labels))
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
-
- DOCS: https://spacy.io/api/entityrecognizer#score
- """
- validate_examples(examples, "EntityRecognizer.score")
- return get_ner_prf(examples)
-
def scored_ents(self, beams):
"""Return a dictionary of (start, end, label) tuples with corresponding scores
for each beam/doc that was processed.
diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi
new file mode 100644
index 000000000..c7c0568f9
--- /dev/null
+++ b/spacy/pipeline/pipe.pyi
@@ -0,0 +1,38 @@
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, Iterator, List
+from typing import NoReturn, Optional, Tuple, Union
+
+from ..tokens.doc import Doc
+
+from ..training import Example
+from ..language import Language
+
+class Pipe:
+ def __call__(self, doc: Doc) -> Doc: ...
+ def pipe(
+ self, stream: Iterable[Doc], *, batch_size: int = ...
+ ) -> Iterator[Doc]: ...
+ def initialize(
+ self,
+ get_examples: Callable[[], Iterable[Example]],
+ *,
+ nlp: Language = ...,
+ ) -> None: ...
+ def score(
+ self, examples: Iterable[Example], **kwargs: Any
+ ) -> Dict[str, Union[float, Dict[str, float]]]: ...
+ @property
+ def is_trainable(self) -> bool: ...
+ @property
+ def labels(self) -> Tuple[str, ...]: ...
+ @property
+ def label_data(self) -> Any: ...
+ def _require_labels(self) -> None: ...
+ def set_error_handler(
+ self, error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn]
+ ) -> None: ...
+ def get_error_handler(
+ self,
+ ) -> Callable[[str, "Pipe", List[Doc], Exception], NoReturn]: ...
+
+def deserialize_config(path: Path) -> Any: ...
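The new stub mainly benefits static analysis: type checkers can verify calls against the `Pipe` interface without reading the Cython source. A small illustrative function that checks against it:

    from typing import Tuple

    from spacy.pipeline import Pipe

    def describe(pipe: Pipe) -> Tuple[str, ...]:
        # the stub declares labels as Tuple[str, ...] and is_trainable as bool
        if pipe.is_trainable:
            return pipe.labels
        return ()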
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 0d298ce4f..9eddc1e3f 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -81,6 +81,17 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#score
"""
+ if hasattr(self, "scorer") and self.scorer is not None:
+ scorer_kwargs = {}
+ # use default settings from cfg (e.g., threshold)
+ if hasattr(self, "cfg") and isinstance(self.cfg, dict):
+ scorer_kwargs.update(self.cfg)
+ # override self.cfg["labels"] with self.labels
+ if hasattr(self, "labels"):
+ scorer_kwargs["labels"] = self.labels
+ # override with kwargs settings
+ scorer_kwargs.update(kwargs)
+ return self.scorer(examples, **scorer_kwargs)
return {}
@property
@@ -88,7 +99,7 @@ cdef class Pipe:
return False
@property
- def labels(self) -> Optional[Tuple[str]]:
+ def labels(self) -> Tuple[str, ...]:
return tuple()
@property
@@ -115,7 +126,7 @@ cdef class Pipe:
"""
self.error_handler = error_handler
- def get_error_handler(self) -> Optional[Callable]:
+ def get_error_handler(self) -> Callable:
"""Retrieve the error handler function.
RETURNS (Callable): The error handler, or if it's not set a default function that just reraises.
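A sketch of the settings merge performed by the shared `score` hook above: `cfg` values act as scorer defaults, `labels` always comes from the component, and explicit keyword arguments win. The component below is made up for illustration:

    from spacy.pipeline import Pipe
    from spacy.tokens import Doc

    class ThresholdComponent(Pipe):
        def __init__(self, name: str = "threshold_component"):
            self.name = name
            self.cfg = {"threshold": 0.75}
            # the scorer just reports the threshold it was called with
            self.scorer = lambda examples, **kw: {"seen_threshold": kw["threshold"]}

        def __call__(self, doc: Doc) -> Doc:
            return doc

    component = ThresholdComponent()
    print(component.score([]))                 # cfg default: 0.75
    print(component.score([], threshold=0.9))  # explicit kwarg wins: 0.9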
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 60102efcb..77f4e8adb 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -1,26 +1,32 @@
# cython: infer_types=True, profile=True, binding=True
-from typing import Optional, List
+from typing import Optional, List, Callable
import srsly
from ..tokens.doc cimport Doc
+
from .pipe import Pipe
+from .senter import senter_score
from ..language import Language
from ..scorer import Scorer
-from ..training import validate_examples
from .. import util
+# see #9050
+BACKWARD_OVERWRITE = False
+
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
- default_config={"punct_chars": None},
+ default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(
nlp: Language,
name: str,
- punct_chars: Optional[List[str]]
+ punct_chars: Optional[List[str]],
+ overwrite: bool,
+ scorer: Optional[Callable],
):
- return Sentencizer(name, punct_chars=punct_chars)
+ return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
class Sentencizer(Pipe):
@@ -41,12 +47,20 @@ class Sentencizer(Pipe):
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'。', '。']
- def __init__(self, name="sentencizer", *, punct_chars=None):
+ def __init__(
+ self,
+ name="sentencizer",
+ *,
+ punct_chars=None,
+ overwrite=BACKWARD_OVERWRITE,
+ scorer=senter_score,
+ ):
"""Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object.
- RETURNS (Sentencizer): The sentencizer component.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencizer#init
"""
@@ -55,6 +69,8 @@ class Sentencizer(Pipe):
self.punct_chars = set(punct_chars)
else:
self.punct_chars = set(self.default_punct_chars)
+ self.overwrite = overwrite
+ self.scorer = scorer
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
@@ -115,29 +131,12 @@ class Sentencizer(Pipe):
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
- # Don't clobber existing sentence boundaries
- if doc.c[j].sent_start == 0:
+ if doc.c[j].sent_start == 0 or self.overwrite:
if tag_id:
doc.c[j].sent_start = 1
else:
doc.c[j].sent_start = -1
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
-
- DOCS: https://spacy.io/api/sentencizer#score
- """
- def has_sents(doc):
- return doc.has_annotation("SENT_START")
-
- validate_examples(examples, "Sentencizer.score")
- results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
- del results["sents_per_type"]
- return results
-
def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring.
@@ -145,7 +144,7 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer#to_bytes
"""
- return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
+ return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the sentencizer from a bytestring.
@@ -157,6 +156,7 @@ class Sentencizer(Pipe):
"""
cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
+ self.overwrite = cfg.get("overwrite", self.overwrite)
return self
def to_disk(self, path, *, exclude=tuple()):
@@ -166,7 +166,7 @@ class Sentencizer(Pipe):
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
- srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
+ srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_disk(self, path, *, exclude=tuple()):
@@ -178,4 +178,5 @@ class Sentencizer(Pipe):
path = path.with_suffix(".json")
cfg = srsly.read_json(path)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
+ self.overwrite = cfg.get("overwrite", self.overwrite)
return self
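The sentencizer now includes its `overwrite` flag in the serialized payload, so the behaviour survives a save/load round trip. A sketch:

    import spacy
    from spacy.pipeline import Sentencizer

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer", config={"overwrite": True})

    data = nlp.get_pipe("sentencizer").to_bytes()
    restored = Sentencizer().from_bytes(data)
    print(restored.overwrite)  # True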
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index f9472abf5..54ce021af 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
+from typing import Optional, Callable
import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@@ -11,8 +12,11 @@ from ..language import Language
from ..errors import Errors
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
+from ..util import registry
from .. import util
+# See #9050
+BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
- default_config={"model": DEFAULT_SENTER_MODEL},
+ default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
-def make_senter(nlp: Language, name: str, model: Model):
- return SentenceRecognizer(nlp.vocab, model, name)
+def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
+ return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
+
+
+def senter_score(examples, **kwargs):
+ def has_sents(doc):
+ return doc.has_annotation("SENT_START")
+
+ results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+ del results["sents_per_type"]
+ return results
+
+
+@registry.scorers("spacy.senter_scorer.v1")
+def make_senter_scorer():
+ return senter_score
class SentenceRecognizer(Tagger):
@@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer
"""
- def __init__(self, vocab, model, name="senter"):
+ def __init__(
+ self,
+ vocab,
+ model,
+ name="senter",
+ *,
+ overwrite=BACKWARD_OVERWRITE,
+ scorer=senter_score,
+ ):
"""Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencerecognizer#init
"""
@@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger):
self.model = model
self.name = name
self._rehearsal_model = None
- self.cfg = {}
+ self.cfg = {"overwrite": overwrite}
+ self.scorer = scorer
@property
def labels(self):
@@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
+ cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
- # Don't clobber existing sentence boundaries
- if doc.c[j].sent_start == 0:
+ if doc.c[j].sent_start == 0 or overwrite:
if tag_id == 1:
doc.c[j].sent_start = 1
else:
@@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None):
raise NotImplementedError
-
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
- DOCS: https://spacy.io/api/sentencerecognizer#score
- """
- def has_sents(doc):
- return doc.has_annotation("SENT_START")
-
- validate_examples(examples, "SentenceRecognizer.score")
- results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
- del results["sents_per_type"]
- return results
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 524e3a659..32c1275a6 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,9 +1,10 @@
import numpy
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
-from thinc.types import Ragged, Ints2d, Floats2d
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
+from ..compat import Protocol, runtime_checkable
from ..scorer import Scorer
from ..language import Language
from .trainable_pipe import TrainablePipe
@@ -44,13 +45,19 @@ depth = 4
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
+@runtime_checkable
+class Suggester(Protocol):
+ def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
+ ...
+
+
@registry.misc("spacy.ngram_suggester.v1")
-def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
+def build_ngram_suggester(sizes: List[int]) -> Suggester:
"""Suggest all spans of the given lengths. Spans are returned as a ragged
array of integers. The array has two columns, indicating the start and end
position."""
- def ngram_suggester(docs: List[Doc], *, ops: Optional[Ops] = None) -> Ragged:
+ def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
if ops is None:
ops = get_current_ops()
spans = []
@@ -67,10 +74,11 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
if spans:
assert spans[-1].ndim == 2, spans[-1].shape
lengths.append(length)
+ lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i"))
if len(spans) > 0:
- output = Ragged(ops.xp.vstack(spans), ops.asarray(lengths, dtype="i"))
+ output = Ragged(ops.xp.vstack(spans), lengths_array)
else:
- output = Ragged(ops.xp.zeros((0, 0)), ops.asarray(lengths, dtype="i"))
+ output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
assert output.dataXd.ndim == 2
return output
@@ -78,6 +86,15 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
return ngram_suggester
+@registry.misc("spacy.ngram_range_suggester.v1")
+def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
+ """Suggest all spans of the given lengths between a given min and max value - both inclusive.
+ Spans are returned as a ragged array of integers. The array has two columns,
+ indicating the start and end position."""
+ sizes = list(range(min_size, max_size + 1))
+ return build_ngram_suggester(sizes)
+
+
@Language.factory(
"spancat",
assigns=["doc.spans"],
@@ -87,23 +104,25 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+ "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
def make_spancat(
nlp: Language,
name: str,
- suggester: Callable[[List[Doc]], Ragged],
+ suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
- threshold: float = 0.5,
- max_positive: Optional[int] = None,
+ scorer: Optional[Callable],
+ threshold: float,
+ max_positive: Optional[int],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
model that predicts one or more labels for each span.
- suggester (Callable[List[Doc], Ragged]): A function that suggests spans.
+ suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
Spans are returned as a ragged array with two integer columns, for the
start and end positions.
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
@@ -127,9 +146,28 @@ def make_spancat(
threshold=threshold,
max_positive=max_positive,
name=name,
+ scorer=scorer,
)
+def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+ kwargs = dict(kwargs)
+ attr_prefix = "spans_"
+ key = kwargs["spans_key"]
+ kwargs.setdefault("attr", f"{attr_prefix}{key}")
+ kwargs.setdefault("allow_overlap", True)
+ kwargs.setdefault(
+ "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
+ )
+ kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
+ return Scorer.score_spans(examples, **kwargs)
+
+
+@registry.scorers("spacy.spancat_scorer.v1")
+def make_spancat_scorer():
+ return spancat_score
+
+
class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text.
@@ -140,14 +178,31 @@ class SpanCategorizer(TrainablePipe):
self,
vocab: Vocab,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
- suggester: Callable[[List[Doc]], Ragged],
+ suggester: Suggester,
name: str = "spancat",
*,
spans_key: str = "spans",
threshold: float = 0.5,
max_positive: Optional[int] = None,
+ scorer: Optional[Callable] = spancat_score,
) -> None:
"""Initialize the span categorizer.
+ vocab (Vocab): The shared vocabulary.
+ model (thinc.api.Model): The Thinc Model powering the pipeline component.
+ name (str): The component instance name, used to add entries to the
+ losses during training.
+ spans_key (str): Key of the Doc.spans dict to save the spans under.
+ During initialization and training, the component will look for
+ spans on the reference document under the same key. Defaults to
+ `"spans"`.
+ threshold (float): Minimum probability to consider a prediction
+ positive. Spans with a positive prediction will be saved on the Doc.
+ Defaults to 0.5.
+ max_positive (Optional[int]): Maximum number of labels to consider
+ positive per span. Defaults to None, indicating no limit.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+ spans allowed.
DOCS: https://spacy.io/api/spancategorizer#init
"""
@@ -161,6 +216,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester
self.model = model
self.name = name
+ self.scorer = scorer
@property
def key(self) -> str:
@@ -168,7 +224,7 @@ class SpanCategorizer(TrainablePipe):
initialization and training, the component will look for spans on the
reference document under the same key.
"""
- return self.cfg["spans_key"]
+ return str(self.cfg["spans_key"])
def add_label(self, label: str) -> int:
"""Add a new label to the pipe.
@@ -183,7 +239,7 @@ class SpanCategorizer(TrainablePipe):
if label in self.labels:
return 0
self._allow_extra_label()
- self.cfg["labels"].append(label)
+ self.cfg["labels"].append(label) # type: ignore
self.vocab.strings.add(label)
return 1
@@ -193,7 +249,7 @@ class SpanCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/spancategorizer#labels
"""
- return tuple(self.cfg["labels"])
+ return tuple(self.cfg["labels"]) # type: ignore
@property
def label_data(self) -> List[str]:
@@ -212,8 +268,8 @@ class SpanCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/spancategorizer#predict
"""
indices = self.suggester(docs, ops=self.model.ops)
- scores = self.model.predict((docs, indices))
- return (indices, scores)
+ scores = self.model.predict((docs, indices)) # type: ignore
+ return indices, scores
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
@@ -229,7 +285,7 @@ class SpanCategorizer(TrainablePipe):
for i, doc in enumerate(docs):
indices_i = indices[i].dataXd
doc.spans[self.key] = self._make_span_group(
- doc, indices_i, scores[offset : offset + indices.lengths[i]], labels
+ doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
)
offset += indices.lengths[i]
@@ -268,14 +324,14 @@ class SpanCategorizer(TrainablePipe):
set_dropout_rate(self.model, drop)
scores, backprop_scores = self.model.begin_update((docs, spans))
loss, d_scores = self.get_loss(examples, (spans, scores))
- backprop_scores(d_scores)
+ backprop_scores(d_scores) # type: ignore
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_loss(
- self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Ragged]
+ self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Floats2d]
) -> Tuple[float, float]:
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
@@ -300,8 +356,8 @@ class SpanCategorizer(TrainablePipe):
spans_index = {}
spans_i = spans[i].dataXd
for j in range(spans.lengths[i]):
- start = int(spans_i[j, 0])
- end = int(spans_i[j, 1])
+ start = int(spans_i[j, 0]) # type: ignore
+ end = int(spans_i[j, 1]) # type: ignore
spans_index[(start, end)] = offset + j
for gold_span in self._get_aligned_spans(eg):
key = (gold_span.start, gold_span.end)
@@ -312,7 +368,7 @@ class SpanCategorizer(TrainablePipe):
# The target is a flat array for all docs. Track the position
# we're at within the flat array.
offset += spans.lengths[i]
- target = self.model.ops.asarray(target, dtype="f")
+ target = self.model.ops.asarray(target, dtype="f") # type: ignore
# The target will have the values 0 (for untrue predictions) or 1
# (for true predictions).
# The scores should be in the range [0, 1].
@@ -328,7 +384,7 @@ class SpanCategorizer(TrainablePipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
- nlp: Language = None,
+ nlp: Optional[Language] = None,
labels: Optional[List[str]] = None,
) -> None:
"""Initialize the pipe for training, using a representative set
@@ -336,14 +392,14 @@ class SpanCategorizer(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
- nlp (Language): The current nlp object the component is part of.
- labels: The labels to add to the component, typically generated by the
+ nlp (Optional[Language]): The current nlp object the component is part of.
+ labels (Optional[List[str]]): The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://spacy.io/api/spancategorizer#initialize
"""
- subbatch = []
+ subbatch: List[Example] = []
if labels is not None:
for label in labels:
self.add_label(label)
@@ -356,40 +412,20 @@ class SpanCategorizer(TrainablePipe):
self._require_labels()
if subbatch:
docs = [eg.x for eg in subbatch]
- spans = self.suggester(docs)
+ spans = build_ngram_suggester(sizes=[1])(docs)
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
self.model.initialize(X=(docs, spans), Y=Y)
else:
self.model.initialize()
- def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
-
- DOCS: https://spacy.io/api/spancategorizer#score
- """
- validate_examples(examples, "SpanCategorizer.score")
- self._validate_categories(examples)
- kwargs = dict(kwargs)
- attr_prefix = "spans_"
- kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
- kwargs.setdefault("labels", self.labels)
- kwargs.setdefault("multi_label", True)
- kwargs.setdefault("threshold", self.cfg["threshold"])
- kwargs.setdefault(
- "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
- )
- kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
- return Scorer.score_spans(examples, **kwargs)
-
- def _validate_categories(self, examples):
+ def _validate_categories(self, examples: Iterable[Example]):
# TODO
pass
def _get_aligned_spans(self, eg: Example):
- return eg.get_aligned_spans_y2x(eg.reference.spans.get(self.key, []))
+ return eg.get_aligned_spans_y2x(
+ eg.reference.spans.get(self.key, []), allow_overlap=True
+ )
def _make_span_group(
self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
@@ -397,16 +433,25 @@ class SpanCategorizer(TrainablePipe):
spans = SpanGroup(doc, name=self.key)
max_positive = self.cfg["max_positive"]
threshold = self.cfg["threshold"]
+
+ keeps = scores >= threshold
+ ranked = (scores * -1).argsort() # type: ignore
+ if max_positive is not None:
+ assert isinstance(max_positive, int)
+ span_filter = ranked[:, max_positive:]
+ for i, row in enumerate(span_filter):
+ keeps[i, row] = False
+ spans.attrs["scores"] = scores[keeps].flatten()
+
+ indices = self.model.ops.to_numpy(indices)
+ keeps = self.model.ops.to_numpy(keeps)
+
for i in range(indices.shape[0]):
- start = int(indices[i, 0])
- end = int(indices[i, 1])
- positives = []
- for j, score in enumerate(scores[i]):
- if score >= threshold:
- positives.append((score, start, end, labels[j]))
- positives.sort(reverse=True)
- if max_positive:
- positives = positives[:max_positive]
- for score, start, end, label in positives:
- spans.append(Span(doc, start, end, label=label))
+ start = indices[i, 0]
+ end = indices[i, 1]
+
+ for j, keep in enumerate(keeps[i]):
+ if keep:
+ spans.append(Span(doc, start, end, label=labels[j]))
+
return spans
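
A minimal sketch of what the registered spancat scorer computes, called directly on a hand-built Example; the spans key "sc" and the LOC spans below are illustrative, not taken from the patch.

import spacy
from spacy.pipeline.spancat import spancat_score
from spacy.tokens import Doc, Span
from spacy.training import Example

nlp = spacy.blank("en")
words = ["The", "river", "bank", "was", "wet"]
pred = Doc(nlp.vocab, words=words)
ref = Doc(nlp.vocab, words=words)
# Overlapping spans are fine, matching the allow_overlap=True default above
pred.spans["sc"] = [Span(pred, 1, 3, label="LOC")]
ref.spans["sc"] = [Span(ref, 1, 3, label="LOC"), Span(ref, 2, 3, label="LOC")]
scores = spancat_score([Example(pred, ref)], spans_key="sc")
# Keys follow the "spans_" + key prefix: spans_sc_p, spans_sc_r, spans_sc_f, spans_sc_per_type
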
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 938131f6f..a2bec888e 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Optional
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@@ -18,8 +19,11 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
+from ..util import registry
from .. import util
+# See #9050
+BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@@ -41,10 +45,17 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
- default_config={"model": DEFAULT_TAGGER_MODEL},
+ default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"},
default_score_weights={"tag_acc": 1.0},
)
-def make_tagger(nlp: Language, name: str, model: Model):
+def make_tagger(
+ nlp: Language,
+ name: str,
+ model: Model,
+ overwrite: bool,
+ scorer: Optional[Callable],
+ neg_prefix: str,
+):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@@ -52,7 +63,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
- return Tagger(nlp.vocab, model, name)
+ return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix)
+
+
+def tagger_score(examples, **kwargs):
+ return Scorer.score_token_attr(examples, "tag", **kwargs)
+
+
+@registry.scorers("spacy.tagger_scorer.v1")
+def make_tagger_scorer():
+ return tagger_score
class Tagger(TrainablePipe):
@@ -60,13 +80,24 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger
"""
- def __init__(self, vocab, model, name="tagger"):
+ def __init__(
+ self,
+ vocab,
+ model,
+ name="tagger",
+ *,
+ overwrite=BACKWARD_OVERWRITE,
+ scorer=tagger_score,
+ neg_prefix="!",
+ ):
"""Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_token_attr for the attribute "tag".
DOCS: https://spacy.io/api/tagger#init
"""
@@ -74,8 +105,9 @@ class Tagger(TrainablePipe):
self.model = model
self.name = name
self._rehearsal_model = None
- cfg = {"labels": []}
+ cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
self.cfg = dict(sorted(cfg.items()))
+ self.scorer = scorer
@property
def labels(self):
@@ -135,14 +167,15 @@ class Tagger(TrainablePipe):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
+ cdef bint overwrite = self.cfg["overwrite"]
+ labels = self.labels
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
- # Don't clobber preset POS tags
- if doc.c[j].tag == 0:
- doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
+ if doc.c[j].tag == 0 or overwrite:
+ doc.c[j].tag = self.vocab.strings[labels[tag_id]]
def update(self, examples, *, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
@@ -222,7 +255,7 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#get_loss
"""
validate_examples(examples, "Tagger.get_loss")
- loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+ loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
# Convert empty tag "" to missing value None so that both misaligned
# tokens and tokens with missing annotation have the default missing
# value None.
@@ -289,15 +322,3 @@ class Tagger(TrainablePipe):
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1
-
- def score(self, examples, **kwargs):
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by
- Scorer.score_token_attr for the attributes "tag".
-
- DOCS: https://spacy.io/api/tagger#score
- """
- validate_examples(examples, "Tagger.score")
- return Scorer.score_token_attr(examples, "tag", **kwargs)
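
A short sketch of the new tagger settings wired in above; the values are illustrative overrides rather than the defaults from this patch.

import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe(
    "tagger",
    config={
        # Overwrite preset tags instead of only filling in missing ones
        "overwrite": True,
        # Labels starting with this prefix are treated as negative/incorrect
        # annotations by the loss (SequenceCategoricalCrossentropy)
        "neg_prefix": "!",
    },
)
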
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 89308c2ed..0e0c3cf45 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
+from ..util import registry
from ..vocab import Vocab
@@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
- default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
+ default_config={
+ "threshold": 0.5,
+ "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
+ "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+ },
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
@@ -86,7 +91,11 @@ subword_features = true
},
)
def make_textcat(
- nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
+ nlp: Language,
+ name: str,
+ model: Model[List[Doc], List[Floats2d]],
+ threshold: float,
+ scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -95,8 +104,23 @@ def make_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
+ scorer (Optional[Callable]): The scoring method.
"""
- return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
+ return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
+
+
+def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+ return Scorer.score_cats(
+ examples,
+ "cats",
+ multi_label=False,
+ **kwargs,
+ )
+
+
+@registry.scorers("spacy.textcat_scorer.v1")
+def make_textcat_scorer():
+ return textcat_score
class TextCategorizer(TrainablePipe):
@@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe):
"""
def __init__(
- self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
+ self,
+ vocab: Vocab,
+ model: Model,
+ name: str = "textcat",
+ *,
+ threshold: float,
+ scorer: Optional[Callable] = textcat_score,
) -> None:
"""Initialize a text categorizer for single-label classification.
@@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_cats for the attribute "cats".
DOCS: https://spacy.io/api/textcategorizer#init
"""
@@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
+ self.scorer = scorer
@property
def labels(self) -> Tuple[str]:
@@ -131,7 +164,7 @@ class TextCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/textcategorizer#labels
"""
- return tuple(self.cfg["labels"])
+ return tuple(self.cfg["labels"]) # type: ignore[arg-type, return-value]
@property
def label_data(self) -> List[str]:
@@ -139,7 +172,7 @@ class TextCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/textcategorizer#label_data
"""
- return self.labels
+ return self.labels # type: ignore[return-value]
def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them.
@@ -153,7 +186,7 @@ class TextCategorizer(TrainablePipe):
# Handle cases where there are no tokens in any docs.
tensors = [doc.tensor for doc in docs]
xp = get_array_module(tensors)
- scores = xp.zeros((len(docs), len(self.labels)))
+ scores = xp.zeros((len(list(docs)), len(self.labels)))
return scores
scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores)
@@ -230,8 +263,9 @@ class TextCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/textcategorizer#rehearse
"""
- if losses is not None:
- losses.setdefault(self.name, 0.0)
+ if losses is None:
+ losses = {}
+ losses.setdefault(self.name, 0.0)
if self._rehearsal_model is None:
return losses
validate_examples(examples, "TextCategorizer.rehearse")
@@ -247,23 +281,23 @@ class TextCategorizer(TrainablePipe):
bp_scores(gradient)
if sgd is not None:
self.finish_update(sgd)
- if losses is not None:
- losses[self.name] += (gradient ** 2).sum()
+ losses[self.name] += (gradient ** 2).sum()
return losses
def _examples_to_truth(
- self, examples: List[Example]
+ self, examples: Iterable[Example]
) -> Tuple[numpy.ndarray, numpy.ndarray]:
- truths = numpy.zeros((len(examples), len(self.labels)), dtype="f")
- not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
+ nr_examples = len(list(examples))
+ truths = numpy.zeros((nr_examples, len(self.labels)), dtype="f")
+ not_missing = numpy.ones((nr_examples, len(self.labels)), dtype="f")
for i, eg in enumerate(examples):
for j, label in enumerate(self.labels):
if label in eg.reference.cats:
truths[i, j] = eg.reference.cats[label]
else:
not_missing[i, j] = 0.0
- truths = self.model.ops.asarray(truths)
- return truths, not_missing
+ truths = self.model.ops.asarray(truths) # type: ignore
+ return truths, not_missing # type: ignore
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
"""Find the loss and gradient of loss for the batch of documents and
@@ -278,7 +312,7 @@ class TextCategorizer(TrainablePipe):
validate_examples(examples, "TextCategorizer.get_loss")
self._validate_categories(examples)
truths, not_missing = self._examples_to_truth(examples)
- not_missing = self.model.ops.asarray(not_missing)
+ not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = (scores - truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = (d_scores ** 2).sum(axis=1).mean()
@@ -297,11 +331,9 @@ class TextCategorizer(TrainablePipe):
if label in self.labels:
return 0
self._allow_extra_label()
- self.cfg["labels"].append(label)
+ self.cfg["labels"].append(label) # type: ignore[attr-defined]
if self.model and "resize_output" in self.model.attrs:
- self.model = self.model.attrs["resize_output"](
- self.model, len(self.cfg["labels"])
- )
+ self.model = self.model.attrs["resize_output"](self.model, len(self.labels))
self.vocab.strings.add(label)
return 1
@@ -354,27 +386,7 @@ class TextCategorizer(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
- def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
-
- DOCS: https://spacy.io/api/textcategorizer#score
- """
- validate_examples(examples, "TextCategorizer.score")
- self._validate_categories(examples)
- kwargs.setdefault("threshold", self.cfg["threshold"])
- kwargs.setdefault("positive_label", self.cfg["positive_label"])
- return Scorer.score_cats(
- examples,
- "cats",
- labels=self.labels,
- multi_label=False,
- **kwargs,
- )
-
- def _validate_categories(self, examples: List[Example]):
+ def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:
if list(ex.reference.cats.values()).count(1.0) > 1:
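
A sketch of plugging a user-defined function into the new scorer setting; the registry name "my_textcat_scorer.v1" is made up for illustration.

from typing import Any, Dict, Iterable

import spacy
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import registry


@registry.scorers("my_textcat_scorer.v1")
def make_my_textcat_scorer():
    def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        # Delegate to the built-in single-label categorical scorer, like
        # spacy.textcat_scorer.v1 above
        return Scorer.score_cats(examples, "cats", multi_label=False, **kwargs)

    return score


nlp = spacy.blank("en")
textcat = nlp.add_pipe(
    "textcat", config={"scorer": {"@scorers": "my_textcat_scorer.v1"}}
)
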
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index ba36881af..a7bfacca7 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d
from ..language import Language
-from ..training import Example, validate_examples, validate_get_examples
+from ..training import Example, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
+from ..util import registry
from ..vocab import Vocab
from .textcat import TextCategorizer
@@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
- default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
+ default_config={
+ "threshold": 0.5,
+ "model": DEFAULT_MULTI_TEXTCAT_MODEL,
+ "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+ },
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
@@ -86,7 +91,11 @@ subword_features = true
},
)
def make_multilabel_textcat(
- nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
+ nlp: Language,
+ name: str,
+ model: Model[List[Doc], List[Floats2d]],
+ threshold: float,
+ scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -97,7 +106,23 @@ def make_multilabel_textcat(
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
- return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
+ return MultiLabel_TextCategorizer(
+ nlp.vocab, model, name, threshold=threshold, scorer=scorer
+ )
+
+
+def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+ return Scorer.score_cats(
+ examples,
+ "cats",
+ multi_label=True,
+ **kwargs,
+ )
+
+
+@registry.scorers("spacy.textcat_multilabel_scorer.v1")
+def make_textcat_multilabel_scorer():
+ return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer):
@@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel",
*,
threshold: float,
+ scorer: Optional[Callable] = textcat_multilabel_score,
) -> None:
"""Initialize a text categorizer for multi-label classification.
@@ -130,8 +156,9 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg)
+ self.scorer = scorer
- def initialize(
+ def initialize( # type: ignore[override]
self,
get_examples: Callable[[], Iterable[Example]],
*,
@@ -166,25 +193,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
- def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
- """Score a batch of examples.
-
- examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
-
- DOCS: https://spacy.io/api/textcategorizer#score
- """
- validate_examples(examples, "MultiLabel_TextCategorizer.score")
- kwargs.setdefault("threshold", self.cfg["threshold"])
- return Scorer.score_cats(
- examples,
- "cats",
- labels=self.labels,
- multi_label=True,
- **kwargs,
- )
-
- def _validate_categories(self, examples: List[Example]):
+ def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'."""
pass
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 00d9548a4..cb601e5dc 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Sequence, Iterable, Optional, Dict, Callable, List
+from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any
from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice
@@ -60,8 +60,8 @@ class Tok2Vec(TrainablePipe):
self.vocab = vocab
self.model = model
self.name = name
- self.listener_map = {}
- self.cfg = {}
+ self.listener_map: Dict[str, List["Tok2VecListener"]] = {}
+ self.cfg: Dict[str, Any] = {}
@property
def listeners(self) -> List["Tok2VecListener"]:
@@ -245,12 +245,12 @@ class Tok2VecListener(Model):
"""
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
self.upstream_name = upstream_name
- self._batch_id = None
+ self._batch_id: Optional[int] = None
self._outputs = None
self._backprop = None
@classmethod
- def get_batch_id(cls, inputs: List[Doc]) -> int:
+ def get_batch_id(cls, inputs: Iterable[Doc]) -> int:
"""Calculate a content-sensitive hash of the batch of documents, to check
whether the next batch of documents is unexpected.
"""
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index d5cdbb511..65daa8b22 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab
cdef public object model
cdef public object cfg
+ cdef public object scorer
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index ce1e133a2..76b0733cf 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -273,7 +273,7 @@ cdef class TrainablePipe(Pipe):
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
- serialize["vocab"] = self.vocab.to_bytes
+ serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["model"] = self.model.to_bytes
return util.to_bytes(serialize, exclude)
@@ -296,7 +296,7 @@ cdef class TrainablePipe(Pipe):
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
- deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+ deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
return self
@@ -313,7 +313,7 @@ cdef class TrainablePipe(Pipe):
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
- serialize["vocab"] = lambda p: self.vocab.to_disk(p)
+ serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
serialize["model"] = lambda p: self.model.to_disk(p)
util.to_disk(path, serialize, exclude)
@@ -338,7 +338,7 @@ cdef class TrainablePipe(Pipe):
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
- deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
+ deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index a495b1bc7..2571af102 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
- incorrect_spans_key=None
+ incorrect_spans_key=None,
+ scorer=None,
):
"""Create a Parser.
@@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe):
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
+ scorer (Optional[Callable]): The scoring method. Defaults to None.
"""
self.vocab = vocab
self.name = name
@@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask)
self._rehearsal_model = None
+ self.scorer = scorer
def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments"""
@@ -569,7 +572,7 @@ cdef class Parser(TrainablePipe):
def to_disk(self, path, exclude=tuple()):
serializers = {
"model": lambda p: (self.model.to_disk(p) if self.model is not True else True),
- "vocab": lambda p: self.vocab.to_disk(p),
+ "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
"moves": lambda p: self.moves.to_disk(p, exclude=["strings"]),
"cfg": lambda p: srsly.write_json(p, self.cfg)
}
@@ -577,7 +580,7 @@ cdef class Parser(TrainablePipe):
def from_disk(self, path, exclude=tuple()):
deserializers = {
- "vocab": lambda p: self.vocab.from_disk(p),
+ "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
"moves": lambda p: self.moves.from_disk(p, exclude=["strings"]),
"cfg": lambda p: self.cfg.update(srsly.read_json(p)),
"model": lambda p: None,
@@ -597,7 +600,7 @@ cdef class Parser(TrainablePipe):
def to_bytes(self, exclude=tuple()):
serializers = {
"model": lambda: (self.model.to_bytes()),
- "vocab": lambda: self.vocab.to_bytes(),
+ "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
}
@@ -605,7 +608,7 @@ cdef class Parser(TrainablePipe):
def from_bytes(self, bytes_data, exclude=tuple()):
deserializers = {
- "vocab": lambda b: self.vocab.from_bytes(b),
+ "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: None,
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 992e17d70..1dfd8ee85 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
+from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -44,7 +45,7 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
for error in errors:
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
data[err_loc].append(error.get("msg"))
- return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
+ return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
# Initialization
@@ -82,7 +83,7 @@ def get_arg_model(
except ValueError:
# Typically happens if the method is part of a Cython module without
# binding=True. Here we just use an empty model that allows everything.
- return create_model(name, __config__=ArgSchemaConfigExtra)
+ return create_model(name, __config__=ArgSchemaConfigExtra) # type: ignore[arg-type, return-value]
has_variable = False
for param in sig.parameters.values():
if param.name in exclude:
@@ -102,8 +103,8 @@ def get_arg_model(
default = param.default if param.default != param.empty else default_empty
sig_args[param.name] = (annotation, default)
is_strict = strict and not has_variable
- sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
- return create_model(name, **sig_args)
+ sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment]
+ return create_model(name, **sig_args) # type: ignore[arg-type, return-value]
def validate_init_settings(
@@ -159,6 +160,7 @@ class TokenPatternString(BaseModel):
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
+ INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
class Config:
extra = "forbid"
@@ -175,8 +177,9 @@ class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
- ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
- ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
+ IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
+ IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
+ INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@@ -196,10 +199,10 @@ class TokenPatternNumber(BaseModel):
class TokenPatternOperator(str, Enum):
- plus: StrictStr = "+"
- start: StrictStr = "*"
- question: StrictStr = "?"
- exclamation: StrictStr = "!"
+ plus: StrictStr = StrictStr("+")
+ start: StrictStr = StrictStr("*")
+ question: StrictStr = StrictStr("?")
+ exclamation: StrictStr = StrictStr("!")
StringValue = Union[TokenPatternString, StrictStr]
@@ -207,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
+IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
class TokenPattern(BaseModel):
@@ -220,6 +224,9 @@ class TokenPattern(BaseModel):
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
+ ent_iob: Optional[IobValue] = None
+ ent_id: Optional[StringValue] = None
+ ent_kb_id: Optional[StringValue] = None
norm: Optional[StringValue] = None
length: Optional[NumberValue] = None
spacy: Optional[StrictBool] = None
@@ -349,7 +356,8 @@ class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
dropout: StrictFloat = Field(..., title="Dropout rate")
- n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
+    n_save_every: Optional[StrictInt] = Field(..., title="Save an additional temporary model every n batches within an epoch")
+    n_save_epoch: Optional[StrictInt] = Field(..., title="Save the model after every n epochs")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -383,7 +391,7 @@ class ConfigSchemaInit(BaseModel):
class ConfigSchema(BaseModel):
training: ConfigSchemaTraining
nlp: ConfigSchemaNlp
- pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
+ pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} # type: ignore[assignment]
components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader]
initialize: ConfigSchemaInit
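
A sketch of a token pattern using the new INTERSECTS operator accepted by the schema above, assuming the matcher itself supports the operator; the pattern and example words are illustrative.

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match tokens whose MORPH features share at least one value with the list
pattern = [{"MORPH": {"INTERSECTS": ["Number=Plur", "Degree=Pos"]}}]
matcher.add("PLUR_OR_POS", [pattern])

doc = Doc(nlp.vocab, words=["cats", "sleep"], morphs=["Number=Plur", ""])
matches = matcher(doc)  # expect one match covering "cats"
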
diff --git a/spacy/scorer.py b/spacy/scorer.py
index d93c1f0a1..e2978a36b 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -1,4 +1,5 @@
-from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
+from typing import Optional, Iterable, Dict, Set, List, Any, Callable, Tuple
+from typing import TYPE_CHECKING
import numpy as np
from collections import defaultdict
@@ -74,8 +75,8 @@ class ROCAUCScore:
may throw an error."""
def __init__(self) -> None:
- self.golds = []
- self.cands = []
+ self.golds: List[Any] = []
+ self.cands: List[Any] = []
self.saved_score = 0.0
self.saved_score_at_len = 0
@@ -111,9 +112,10 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#init
"""
- self.nlp = nlp
self.cfg = cfg
- if not nlp:
+ if nlp:
+ self.nlp = nlp
+ else:
nlp = get_lang_class(default_lang)()
for pipe in default_pipeline:
nlp.add_pipe(pipe)
@@ -129,7 +131,7 @@ class Scorer:
"""
scores = {}
if hasattr(self.nlp.tokenizer, "score"):
- scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
+ scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
for name, component in self.nlp.pipeline:
if hasattr(component, "score"):
scores.update(component.score(examples, **self.cfg))
@@ -191,7 +193,7 @@ class Scorer:
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
- missing_values: Set[Any] = MISSING_VALUES,
+ missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Returns an accuracy score for a token-level attribute.
@@ -201,6 +203,8 @@ class Scorer:
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
+ missing_values (Set[Any]): Attribute values to treat as missing annotation
+ in the reference annotation.
RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
under the key attr_acc.
@@ -240,25 +244,30 @@ class Scorer:
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
- missing_values: Set[Any] = MISSING_VALUES,
+ missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
- """Return PRF scores per feat for a token attribute in UFEATS format.
+ """Return micro PRF and PRF scores per feat for a token attribute in
+ UFEATS format.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
- RETURNS (dict): A dictionary containing the per-feat PRF scores under
- the key attr_per_feat.
+ missing_values (Set[Any]): Attribute values to treat as missing
+ annotation in the reference annotation.
+ RETURNS (dict): A dictionary containing the micro PRF scores under the
+ key attr_micro_p/r/f and the per-feat PRF scores under
+ attr_per_feat.
"""
+ micro_score = PRFScore()
per_feat = {}
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
align = example.alignment
- gold_per_feat = {}
+ gold_per_feat: Dict[str, Set] = {}
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
value = getter(token, attr)
@@ -273,7 +282,7 @@ class Scorer:
gold_per_feat[field].add((gold_i, feat))
else:
missing_indices.add(gold_i)
- pred_per_feat = {}
+ pred_per_feat: Dict[str, Set] = {}
for token in pred_doc:
if token.orth_.isspace():
continue
@@ -294,15 +303,24 @@ class Scorer:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
+ micro_score.score_set(
+ pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
+ )
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
- score_key = f"{attr}_per_feat"
- if any([len(v) for v in per_feat.values()]):
- result = {k: v.to_dict() for k, v in per_feat.items()}
- return {score_key: result}
+ result: Dict[str, Any] = {}
+ if len(micro_score) > 0:
+ result[f"{attr}_micro_p"] = micro_score.precision
+ result[f"{attr}_micro_r"] = micro_score.recall
+ result[f"{attr}_micro_f"] = micro_score.fscore
+ result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else:
- return {score_key: None}
+ result[f"{attr}_micro_p"] = None
+ result[f"{attr}_micro_r"] = None
+ result[f"{attr}_micro_f"] = None
+ result[f"{attr}_per_feat"] = None
+ return result
@staticmethod
def score_spans(
@@ -341,16 +359,17 @@ class Scorer:
pred_doc = example.predicted
gold_doc = example.reference
# Option to handle docs without annotation for this attribute
- if has_annotation is not None:
- if not has_annotation(gold_doc):
- continue
- # Find all labels in gold and doc
- labels = set(
- [k.label_ for k in getter(gold_doc, attr)]
- + [k.label_ for k in getter(pred_doc, attr)]
- )
+ if has_annotation is not None and not has_annotation(gold_doc):
+ continue
+ # Find all labels in gold
+ labels = set([k.label_ for k in getter(gold_doc, attr)])
+ # If labeled, find all labels in pred
+ if has_annotation is None or (
+ has_annotation is not None and has_annotation(pred_doc)
+ ):
+ labels |= set([k.label_ for k in getter(pred_doc, attr)])
# Set up all labels for per type scoring and prepare gold per type
- gold_per_type = {label: set() for label in labels}
+ gold_per_type: Dict[str, Set] = {label: set() for label in labels}
for label in labels:
if label not in score_per_type:
score_per_type[label] = PRFScore()
@@ -358,22 +377,27 @@ class Scorer:
gold_spans = set()
pred_spans = set()
for span in getter(gold_doc, attr):
+ gold_span: Tuple
if labeled:
gold_span = (span.label_, span.start, span.end - 1)
else:
gold_span = (span.start, span.end - 1)
gold_spans.add(gold_span)
gold_per_type[span.label_].add(gold_span)
- pred_per_type = {label: set() for label in labels}
- for span in example.get_aligned_spans_x2y(
- getter(pred_doc, attr), allow_overlap
+ pred_per_type: Dict[str, Set] = {label: set() for label in labels}
+ if has_annotation is None or (
+ has_annotation is not None and has_annotation(pred_doc)
):
- if labeled:
- pred_span = (span.label_, span.start, span.end - 1)
- else:
- pred_span = (span.start, span.end - 1)
- pred_spans.add(pred_span)
- pred_per_type[span.label_].add(pred_span)
+ for span in example.get_aligned_spans_x2y(
+ getter(pred_doc, attr), allow_overlap
+ ):
+ pred_span: Tuple
+ if labeled:
+ pred_span = (span.label_, span.start, span.end - 1)
+ else:
+ pred_span = (span.start, span.end - 1)
+ pred_spans.add(pred_span)
+ pred_per_type[span.label_].add(pred_span)
# Scores per label
if labeled:
for k, v in score_per_type.items():
@@ -382,7 +406,7 @@ class Scorer:
# Score for all labels
score.score_set(pred_spans, gold_spans)
# Assemble final result
- final_scores = {
+ final_scores: Dict[str, Any] = {
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
@@ -616,7 +640,7 @@ class Scorer:
sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values())
/ n_cats
)
- results = {
+ results: Dict[str, Any] = {
f"{attr}_score": None,
f"{attr}_score_desc": None,
f"{attr}_micro_p": micro_prf.precision,
@@ -645,7 +669,7 @@ class Scorer:
@staticmethod
def score_links(
- examples: Iterable[Example], *, negative_labels: Iterable[str]
+ examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER,
@@ -721,7 +745,7 @@ class Scorer:
head_attr: str = "head",
head_getter: Callable[[Token, str], Token] = getattr,
ignore_labels: Iterable[str] = SimpleFrozenList(),
- missing_values: Set[Any] = MISSING_VALUES,
+ missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency
@@ -738,6 +762,8 @@ class Scorer:
head_getter(token, attr) should return the value of the head for an
individual token.
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
+ missing_values (Set[Any]): Attribute values to treat as missing annotation
+ in the reference annotation.
RETURNS (Dict[str, Any]): A dictionary containing the scores:
attr_uas, attr_las, and attr_las_per_type.
@@ -752,7 +778,7 @@ class Scorer:
pred_doc = example.predicted
align = example.alignment
gold_deps = set()
- gold_deps_per_dep = {}
+ gold_deps_per_dep: Dict[str, Set] = {}
for gold_i, token in enumerate(gold_doc):
dep = getter(token, attr)
head = head_getter(token, head_attr)
@@ -767,12 +793,12 @@ class Scorer:
else:
missing_indices.add(gold_i)
pred_deps = set()
- pred_deps_per_dep = {}
+ pred_deps_per_dep: Dict[str, Set] = {}
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] != 1:
- gold_i = None
+ gold_i = None # type: ignore
else:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
@@ -819,7 +845,7 @@ class Scorer:
}
-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
for eg in examples:
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 07768d347..370180135 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t
-cpdef hash_t hash_string(unicode string) except 0
+cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
-cdef unicode decode_Utf8Str(const Utf8Str* string)
+cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@@ -25,5 +25,5 @@ cdef class StringStore:
cdef vector[hash_t] keys
cdef public PreshMap _map
- cdef const Utf8Str* intern_unicode(self, unicode py_string)
+ cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
new file mode 100644
index 000000000..5b4147e12
--- /dev/null
+++ b/spacy/strings.pyi
@@ -0,0 +1,22 @@
+from typing import Optional, Iterable, Iterator, Union, Any
+from pathlib import Path
+
+def get_string_id(key: Union[str, int]) -> int: ...
+
+class StringStore:
+ def __init__(
+ self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
+ ) -> None: ...
+ def __getitem__(self, string_or_id: Union[bytes, str, int]) -> Union[str, int]: ...
+ def as_int(self, key: Union[bytes, str, int]) -> int: ...
+ def as_string(self, key: Union[bytes, str, int]) -> str: ...
+ def add(self, string: str) -> int: ...
+ def __len__(self) -> int: ...
+ def __contains__(self, string: str) -> bool: ...
+ def __iter__(self) -> Iterator[str]: ...
+ def __reduce__(self) -> Any: ...
+ def to_disk(self, path: Union[str, Path]) -> None: ...
+ def from_disk(self, path: Union[str, Path]) -> StringStore: ...
+ def to_bytes(self, **kwargs: Any) -> bytes: ...
+ def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
+ def _reset_and_load(self, strings: Iterable[str]) -> None: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 4a20cb8af..39fc441e9 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -33,7 +33,7 @@ def get_string_id(key):
return hash_utf8(chars, len(chars))
-cpdef hash_t hash_string(unicode string) except 0:
+cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8")
return hash_utf8(chars, len(chars))
@@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
-cdef unicode decode_Utf8Str(const Utf8Str* string):
+cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8")
@@ -107,17 +107,17 @@ cdef class StringStore:
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
- string_or_id (bytes, unicode or uint64): The value to encode.
+ string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved.
"""
- if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
+ if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
- if isinstance(string_or_id, unicode):
+ if isinstance(string_or_id, str):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
@@ -135,14 +135,14 @@ cdef class StringStore:
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
- if not isinstance(key, basestring):
+ if not isinstance(key, str):
return key
else:
return self[key]
def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value."""
- if isinstance(key, basestring):
+ if isinstance(key, str):
return key
else:
return self[key]
@@ -153,7 +153,7 @@ cdef class StringStore:
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
- if isinstance(string, unicode):
+ if isinstance(string, str):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
@@ -189,7 +189,7 @@ cdef class StringStore:
return True
elif string in SYMBOLS_BY_STR:
return True
- elif isinstance(string, unicode):
+ elif isinstance(string, str):
key = hash_string(string)
else:
string = string.encode("utf8")
@@ -269,7 +269,7 @@ cdef class StringStore:
for string in strings:
self.add(string)
- cdef const Utf8Str* intern_unicode(self, unicode py_string):
+ cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string))
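
A quick usage sketch matching the StringStore stub added in spacy/strings.pyi.

from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore["apple"]          # str -> uint64 hash
assert stringstore[apple_hash] == "apple"  # hash -> str
banana_hash = stringstore.add("banana")
assert "banana" in stringstore
assert stringstore.as_string(banana_hash) == "banana"
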
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 8c450b154..ffca79bb9 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -3,7 +3,13 @@ from spacy.util import get_lang_class
def pytest_addoption(parser):
- parser.addoption("--slow", action="store_true", help="include slow tests")
+ try:
+ parser.addoption("--slow", action="store_true", help="include slow tests")
+ parser.addoption("--issue", action="store", help="test specific issues")
+ # Options are already added, e.g. if conftest is copied in a build pipeline
+ # and runs twice
+ except ValueError:
+ pass
def pytest_runtest_setup(item):
@@ -16,10 +22,24 @@ def pytest_runtest_setup(item):
# options weren't given.
return item.config.getoption(f"--{opt}", False)
+ # Integration of boolean flags
for opt in ["slow"]:
if opt in item.keywords and not getopt(opt):
pytest.skip(f"need --{opt} option to run")
+ # Special integration to mark tests with issue numbers
+ issues = getopt("issue")
+ if isinstance(issues, str):
+ if "issue" in item.keywords:
+ # Convert issues provided on the CLI to list of ints
+ issue_nos = [int(issue.strip()) for issue in issues.split(",")]
+ # Get all issues specified by decorators and check if they're provided
+ issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")]
+ if not any([ref in issue_nos for ref in issue_refs]):
+ pytest.skip(f"not referencing specified issues: {issue_nos}")
+ else:
+ pytest.skip("not referencing any issues")
+
# Fixtures for language tokenizers (languages sorted alphabetically)
@@ -29,6 +49,11 @@ def tokenizer():
return get_lang_class("xx")().tokenizer
+@pytest.fixture(scope="session")
+def af_tokenizer():
+ return get_lang_class("af")().tokenizer
+
+
@pytest.fixture(scope="session")
def am_tokenizer():
return get_lang_class("am")().tokenizer
@@ -100,6 +125,16 @@ def es_tokenizer():
return get_lang_class("es")().tokenizer
+@pytest.fixture(scope="session")
+def es_vocab():
+ return get_lang_class("es")().vocab
+
+
+@pytest.fixture(scope="session")
+def et_tokenizer():
+ return get_lang_class("et")().tokenizer
+
+
@pytest.fixture(scope="session")
def eu_tokenizer():
return get_lang_class("eu")().tokenizer
@@ -125,6 +160,11 @@ def ga_tokenizer():
return get_lang_class("ga")().tokenizer
+@pytest.fixture(scope="session")
+def grc_tokenizer():
+ return get_lang_class("grc")().tokenizer
+
+
@pytest.fixture(scope="session")
def gu_tokenizer():
return get_lang_class("gu")().tokenizer
@@ -155,6 +195,11 @@ def id_tokenizer():
return get_lang_class("id")().tokenizer
+@pytest.fixture(scope="session")
+def is_tokenizer():
+ return get_lang_class("is")().tokenizer
+
+
@pytest.fixture(scope="session")
def it_tokenizer():
return get_lang_class("it")().tokenizer
@@ -182,6 +227,11 @@ def lt_tokenizer():
return get_lang_class("lt")().tokenizer
+@pytest.fixture(scope="session")
+def lv_tokenizer():
+ return get_lang_class("lv")().tokenizer
+
+
@pytest.fixture(scope="session")
def mk_tokenizer():
return get_lang_class("mk")().tokenizer
@@ -202,6 +252,11 @@ def ne_tokenizer():
return get_lang_class("ne")().tokenizer
+@pytest.fixture(scope="session")
+def nl_vocab():
+ return get_lang_class("nl")().vocab
+
+
@pytest.fixture(scope="session")
def nl_tokenizer():
return get_lang_class("nl")().tokenizer
@@ -217,6 +272,11 @@ def pt_tokenizer():
return get_lang_class("pt")().tokenizer
+@pytest.fixture(scope="session")
+def pt_vocab():
+ return get_lang_class("pt")().vocab
+
+
@pytest.fixture(scope="session")
def ro_tokenizer():
return get_lang_class("ro")().tokenizer
@@ -239,11 +299,26 @@ def sa_tokenizer():
return get_lang_class("sa")().tokenizer
+@pytest.fixture(scope="session")
+def sk_tokenizer():
+ return get_lang_class("sk")().tokenizer
+
+
+@pytest.fixture(scope="session")
+def sl_tokenizer():
+ return get_lang_class("sl")().tokenizer
+
+
@pytest.fixture(scope="session")
def sr_tokenizer():
return get_lang_class("sr")().tokenizer
+@pytest.fixture(scope="session")
+def sq_tokenizer():
+ return get_lang_class("sq")().tokenizer
+
+
@pytest.fixture(scope="session")
def sv_tokenizer():
return get_lang_class("sv")().tokenizer
@@ -260,6 +335,11 @@ def ti_tokenizer():
return get_lang_class("ti")().tokenizer
+@pytest.fixture(scope="session")
+def tl_tokenizer():
+ return get_lang_class("tl")().tokenizer
+
+
@pytest.fixture(scope="session")
def tr_tokenizer():
return get_lang_class("tr")().tokenizer
@@ -299,6 +379,11 @@ def vi_tokenizer():
return get_lang_class("vi")().tokenizer
+@pytest.fixture(scope="session")
+def xx_tokenizer():
+ return get_lang_class("xx")().tokenizer
+
+
@pytest.fixture(scope="session")
def yo_tokenizer():
return get_lang_class("yo")().tokenizer
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index ef54c581c..c334cc6eb 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -1,8 +1,31 @@
+import numpy
import pytest
+
from spacy.tokens import Doc
from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
+@pytest.mark.issue(2203)
+def test_issue2203(en_vocab):
+ """Test that lemmas are set correctly in doc.from_array."""
+ words = ["I", "'ll", "survive"]
+ tags = ["PRP", "MD", "VB"]
+ lemmas = ["-PRON-", "will", "survive"]
+ tag_ids = [en_vocab.strings.add(tag) for tag in tags]
+ lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
+ doc = Doc(en_vocab, words=words)
+ # Work around lemma corruption problem and set lemmas after tags
+ doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
+ doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
+ assert [t.tag_ for t in doc] == tags
+ assert [t.lemma_ for t in doc] == lemmas
+ # We need to serialize both tag and lemma, since this is what causes the bug
+ doc_array = doc.to_array(["TAG", "LEMMA"])
+ new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array)
+ assert [t.tag_ for t in new_doc] == tags
+ assert [t.lemma_ for t in new_doc] == lemmas
+
+
def test_doc_array_attr_of_token(en_vocab):
doc = Doc(en_vocab, words=["An", "example", "sentence"])
example = doc.vocab["example"]
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index 6c9de8f07..302a9b6ea 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -69,4 +69,11 @@ def test_create_with_heads_and_no_deps(vocab):
words = "I like ginger".split()
heads = list(range(len(words)))
with pytest.raises(ValueError):
- doc = Doc(vocab, words=words, heads=heads)
+ Doc(vocab, words=words, heads=heads)
+
+
+def test_create_invalid_pos(vocab):
+ words = "I like ginger".split()
+ pos = "QQ ZZ XX".split()
+ with pytest.raises(ValueError):
+ Doc(vocab, words=words, pos=pos)
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 57df87642..10700b787 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -1,14 +1,17 @@
import weakref
-import pytest
import numpy
+import pytest
+from thinc.api import NumpyOps, get_current_ops
+from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
+from spacy.attrs import SENT_START, TAG
+from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage
+from spacy.language import Language
+from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, Token
from spacy.vocab import Vocab
-from spacy.lexeme import Lexeme
-from spacy.lang.en import English
-from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
from .test_underscore import clean_underscore # noqa: F401
@@ -30,6 +33,220 @@ def test_doc_api_init(en_vocab):
assert [t.is_sent_start for t in doc] == [True, False, True, False]
+@pytest.mark.issue(1547)
+def test_issue1547():
+ """Test that entity labels still match after merging tokens."""
+ words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
+ doc = Doc(Vocab(), words=words)
+ doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
+ with doc.retokenize() as retokenizer:
+ retokenizer.merge(doc[5:7])
+ assert [ent.text for ent in doc.ents]
+
+
+@pytest.mark.issue(1757)
+def test_issue1757():
+ """Test comparison against None doesn't cause segfault."""
+ doc = Doc(Vocab(), words=["a", "b", "c"])
+ assert not doc[0] < None
+ assert not doc[0] is None
+ assert doc[0] >= None
+ assert not doc[:2] < None
+ assert not doc[:2] is None
+ assert doc[:2] >= None
+ assert not doc.vocab["a"] is None
+ assert not doc.vocab["a"] < None
+
+
+@pytest.mark.issue(2396)
+def test_issue2396(en_vocab):
+ words = ["She", "created", "a", "test", "for", "spacy"]
+ heads = [1, 1, 3, 1, 3, 4]
+ deps = ["dep"] * len(heads)
+ matrix = numpy.array(
+ [
+ [0, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1],
+ [1, 1, 2, 3, 3, 3],
+ [1, 1, 3, 3, 3, 3],
+ [1, 1, 3, 3, 4, 4],
+ [1, 1, 3, 3, 4, 5],
+ ],
+ dtype=numpy.int32,
+ )
+ doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+ span = doc[:]
+ assert (doc.get_lca_matrix() == matrix).all()
+ assert (span.get_lca_matrix() == matrix).all()
+
+
+@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
+@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
+@pytest.mark.issue(2782)
+def test_issue2782(text, lang_cls):
+ """Check that like_num handles + and - before number."""
+ nlp = lang_cls()
+ doc = nlp(text)
+ assert len(doc) == 1
+ assert doc[0].like_num
+
+
+@pytest.mark.parametrize(
+ "sentence",
+ [
+ "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
+ "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
+ "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
+ "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
+ "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
+ ],
+)
+@pytest.mark.issue(3869)
+def test_issue3869(sentence):
+ """Test that the Doc's count_by function works consistently"""
+ nlp = English()
+ doc = nlp(sentence)
+ count = 0
+ for token in doc:
+ count += token.is_alpha
+ assert count == doc.count_by(IS_ALPHA).get(1, 0)
+
+
+@pytest.mark.issue(3962)
+def test_issue3962(en_vocab):
+ """Ensure that as_doc does not result in out-of-bound access of tokens.
+ This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+ # fmt: off
+ words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
+ heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
+ deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
+ # fmt: on
+ doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+ span2 = doc[1:5] # "jests at scars ,"
+ doc2 = span2.as_doc()
+ doc2_json = doc2.to_json()
+ assert doc2_json
+ # head set to itself, being the new artificial root
+ assert doc2[0].head.text == "jests"
+ assert doc2[0].dep_ == "dep"
+ assert doc2[1].head.text == "jests"
+ assert doc2[1].dep_ == "prep"
+ assert doc2[2].head.text == "at"
+ assert doc2[2].dep_ == "pobj"
+ assert doc2[3].head.text == "jests" # head set to the new artificial root
+ assert doc2[3].dep_ == "dep"
+ # We should still have 1 sentence
+ assert len(list(doc2.sents)) == 1
+ span3 = doc[6:9] # "never felt a"
+ doc3 = span3.as_doc()
+ doc3_json = doc3.to_json()
+ assert doc3_json
+ assert doc3[0].head.text == "felt"
+ assert doc3[0].dep_ == "neg"
+ assert doc3[1].head.text == "felt"
+ assert doc3[1].dep_ == "ROOT"
+ assert doc3[2].head.text == "felt" # head set to ancestor
+ assert doc3[2].dep_ == "dep"
+ # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
+ assert len(list(doc3.sents)) == 1
+
+
+@pytest.mark.issue(3962)
+def test_issue3962_long(en_vocab):
+ """Ensure that as_doc does not result in out-of-bound access of tokens.
+ This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+ # fmt: off
+ words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
+ heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
+ deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
+ # fmt: on
+ two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+ span2 = two_sent_doc[1:7] # "jests at scars. They never"
+ doc2 = span2.as_doc()
+ doc2_json = doc2.to_json()
+ assert doc2_json
+ # head set to itself, being the new artificial root (in sentence 1)
+ assert doc2[0].head.text == "jests"
+ assert doc2[0].dep_ == "ROOT"
+ assert doc2[1].head.text == "jests"
+ assert doc2[1].dep_ == "prep"
+ assert doc2[2].head.text == "at"
+ assert doc2[2].dep_ == "pobj"
+ assert doc2[3].head.text == "jests"
+ assert doc2[3].dep_ == "punct"
+ # head set to itself, being the new artificial root (in sentence 2)
+ assert doc2[4].head.text == "They"
+ assert doc2[4].dep_ == "dep"
+ # head set to the new artificial head (in sentence 2)
+ assert doc2[4].head.text == "They"
+ assert doc2[4].dep_ == "dep"
+ # We should still have 2 sentences
+ sents = list(doc2.sents)
+ assert len(sents) == 2
+ assert sents[0].text == "jests at scars ."
+ assert sents[1].text == "They never"
+
+
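+# Registered at module level (rather than inside the test) so the factory can
+# be resolved in worker processes when test_issue4903 below runs the pipeline
+# with n_process=2.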
+@Language.factory("my_pipe")
+class CustomPipe:
+ def __init__(self, nlp, name="my_pipe"):
+ self.name = name
+ Span.set_extension("my_ext", getter=self._get_my_ext)
+ Doc.set_extension("my_ext", default=None)
+
+ def __call__(self, doc):
+ gathered_ext = []
+ for sent in doc.sents:
+ sent_ext = self._get_my_ext(sent)
+ sent._.set("my_ext", sent_ext)
+ gathered_ext.append(sent_ext)
+
+ doc._.set("my_ext", "\n".join(gathered_ext))
+ return doc
+
+ @staticmethod
+ def _get_my_ext(span):
+ return str(span.end)
+
+
+@pytest.mark.issue(4903)
+def test_issue4903():
+ """Ensure that this runs correctly and doesn't hang or crash on Windows /
+ macOS."""
+ nlp = English()
+ nlp.add_pipe("sentencizer")
+ nlp.add_pipe("my_pipe", after="sentencizer")
+ text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
+ if isinstance(get_current_ops(), NumpyOps):
+ docs = list(nlp.pipe(text, n_process=2))
+ assert docs[0].text == "I like bananas."
+ assert docs[1].text == "Do you like them?"
+ assert docs[2].text == "No, I prefer wasabi."
+
+
+@pytest.mark.issue(5048)
+def test_issue5048(en_vocab):
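+    """Annotations set via Doc.from_array should match the same annotations
+    set via the Doc constructor keyword arguments."""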
+ words = ["This", "is", "a", "sentence"]
+ pos_s = ["DET", "VERB", "DET", "NOUN"]
+ spaces = [" ", " ", " ", ""]
+ deps_s = ["dep", "adj", "nn", "atm"]
+ tags_s = ["DT", "VBZ", "DT", "NN"]
+ strings = en_vocab.strings
+ for w in words:
+ strings.add(w)
+ deps = [strings.add(d) for d in deps_s]
+ pos = [strings.add(p) for p in pos_s]
+ tags = [strings.add(t) for t in tags_s]
+ attrs = [POS, DEP, TAG]
+ array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+ doc = Doc(en_vocab, words=words, spaces=spaces)
+ doc.from_array(attrs, array)
+ v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+ doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+ v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+ assert v1 == v2
+
+
@pytest.mark.parametrize("text", [["one", "two", "three"]])
def test_doc_api_compare_by_string_position(en_vocab, text):
doc = Doc(en_vocab, words=text)
@@ -350,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.",
"",
"They don't think alike. ",
+ "",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
@@ -357,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
- en_docs[3].spans["group"] = [en_docs[3][0:1]]
+ en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted(
- [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+ [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py
index 28cb66714..738a751a0 100644
--- a/spacy/tests/doc/test_pickle_doc.py
+++ b/spacy/tests/doc/test_pickle_doc.py
@@ -5,9 +5,11 @@ from spacy.compat import pickle
def test_pickle_single_doc():
nlp = Language()
doc = nlp("pickle roundtrip")
+ doc._context = 3
data = pickle.dumps(doc, 1)
doc2 = pickle.loads(data)
assert doc2.text == "pickle roundtrip"
+ assert doc2._context == 3
def test_list_of_docs_pickles_efficiently():
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 16df1713d..ec4deb033 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -1,8 +1,50 @@
+import numpy
import pytest
+
from spacy.vocab import Vocab
from spacy.tokens import Doc, Token
+@pytest.mark.issue(3540)
+def test_issue3540(en_vocab):
+ words = ["I", "live", "in", "NewYork", "right", "now"]
+ tensor = numpy.asarray(
+ [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
+ dtype="f",
+ )
+ doc = Doc(en_vocab, words=words)
+ doc.tensor = tensor
+ gold_text = ["I", "live", "in", "NewYork", "right", "now"]
+ assert [token.text for token in doc] == gold_text
+ gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
+ for i, lemma in enumerate(gold_lemma):
+ doc[i].lemma_ = lemma
+ assert [token.lemma_ for token in doc] == gold_lemma
+ vectors_1 = [token.vector for token in doc]
+ assert len(vectors_1) == len(doc)
+
+ with doc.retokenize() as retokenizer:
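+        # Heads for the new subtokens can be given either as an existing token
+        # or as a (token, subtoken_index) tuple pointing into the split pieces.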
+ heads = [(doc[3], 1), doc[2]]
+ attrs = {
+ "POS": ["PROPN", "PROPN"],
+ "LEMMA": ["New", "York"],
+ "DEP": ["pobj", "compound"],
+ }
+ retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
+
+ gold_text = ["I", "live", "in", "New", "York", "right", "now"]
+ assert [token.text for token in doc] == gold_text
+ gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
+ assert [token.lemma_ for token in doc] == gold_lemma
+ vectors_2 = [token.vector for token in doc]
+ assert len(vectors_2) == len(doc)
+ assert vectors_1[0].tolist() == vectors_2[0].tolist()
+ assert vectors_1[1].tolist() == vectors_2[1].tolist()
+ assert vectors_1[2].tolist() == vectors_2[2].tolist()
+ assert vectors_1[4].tolist() == vectors_2[5].tolist()
+ assert vectors_1[5].tolist() == vectors_2[6].tolist()
+
+
def test_doc_retokenize_split(en_vocab):
words = ["LosAngeles", "start", "."]
heads = [1, 2, 2]
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 6e34f2126..10aba5b94 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -1,11 +1,15 @@
import pytest
import numpy
from numpy.testing import assert_array_equal
+
from spacy.attrs import ORTH, LENGTH
+from spacy.lang.en import English
from spacy.tokens import Doc, Span, Token
from spacy.vocab import Vocab
from spacy.util import filter_spans
+from thinc.api import get_current_ops
+from ..util import add_vecs_to_vocab
from .test_underscore import clean_underscore # noqa: F401
@@ -41,6 +45,106 @@ def doc_not_parsed(en_tokenizer):
return doc
+@pytest.mark.issue(1537)
+def test_issue1537():
+ """Test that Span.as_doc() doesn't segfault."""
+ string = "The sky is blue . The man is pink . The dog is purple ."
+ doc = Doc(Vocab(), words=string.split())
+ doc[0].sent_start = True
+ for word in doc[1:]:
+ if word.nbor(-1).text == ".":
+ word.sent_start = True
+ else:
+ word.sent_start = False
+ sents = list(doc.sents)
+ sent0 = sents[0].as_doc()
+ sent1 = sents[1].as_doc()
+ assert isinstance(sent0, Doc)
+ assert isinstance(sent1, Doc)
+
+
+@pytest.mark.issue(1612)
+def test_issue1612(en_tokenizer):
+ """Test that span.orth_ is identical to span.text"""
+ doc = en_tokenizer("The black cat purrs.")
+ span = doc[1:3]
+ assert span.orth_ == span.text
+
+
+@pytest.mark.issue(3199)
+def test_issue3199():
+ """Test that Span.noun_chunks works correctly if no noun chunks iterator
+ is available. To make this test future-proof, we're constructing a Doc
+ with a new Vocab here and a parse tree to make sure the noun chunks run.
+ """
+ words = ["This", "is", "a", "sentence"]
+ doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
+ with pytest.raises(NotImplementedError):
+ list(doc[0:3].noun_chunks)
+
+
+@pytest.mark.issue(5152)
+def test_issue5152():
+    # Test that comparing a Span against a Token works as expected.
+ # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
+ nlp = English()
+ text = nlp("Talk about being boring!")
+ text_var = nlp("Talk of being boring!")
+ y = nlp("Let")
+ span = text[0:3] # Talk about being
+ span_2 = text[0:3] # Talk about being
+ span_3 = text_var[0:3] # Talk of being
+ token = y[0] # Let
+ with pytest.warns(UserWarning):
+ assert span.similarity(token) == 0.0
+ assert span.similarity(span_2) == 1.0
+ with pytest.warns(UserWarning):
+ assert span_2.similarity(span_3) < 1.0
+
+
+@pytest.mark.issue(6755)
+def test_issue6755(en_tokenizer):
+ doc = en_tokenizer("This is a magnificent sentence.")
+ span = doc[:0]
+ assert span.text_with_ws == ""
+ assert span.text == ""
+
+
+@pytest.mark.parametrize(
+ "sentence, start_idx,end_idx,label",
+ [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
+)
+@pytest.mark.issue(6815)
+def test_issue6815_1(sentence, start_idx, end_idx, label):
+ nlp = English()
+ doc = nlp(sentence)
+ span = doc[:].char_span(start_idx, end_idx, label=label)
+ assert span.label_ == label
+
+
+@pytest.mark.parametrize(
+ "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
+)
+@pytest.mark.issue(6815)
+def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
+ nlp = English()
+ doc = nlp(sentence)
+ span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
+ assert span.kb_id == kb_id
+
+
+@pytest.mark.parametrize(
+ "sentence, start_idx,end_idx,vector",
+ [("Welcome to Mumbai, my friend", 11, 17, numpy.array([0.1, 0.2, 0.3]))],
+)
+@pytest.mark.issue(6815)
+def test_issue6815_3(sentence, start_idx, end_idx, vector):
+ nlp = English()
+ doc = nlp(sentence)
+ span = doc[:].char_span(start_idx, end_idx, vector=vector)
+ assert (span.vector == vector).all()
+
+
@pytest.mark.parametrize(
"i_sent,i,j,text",
[
@@ -96,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc[:2].sent.root.text == "is"
assert doc[:2].sent.text == "This is a sentence."
assert doc[6:7].sent.root.left_edge.text == "This"
+ assert doc[0 : len(doc)].sent == list(doc.sents)[0]
+ assert list(doc[0 : len(doc)].sents) == list(doc.sents)
+
+ with pytest.raises(ValueError):
+ doc_not_parsed[:2].sent
+
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
@@ -103,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
+@pytest.mark.parametrize(
+ "start,end,expected_sentence",
+ [
+ (0, 14, "This is"), # Entire doc
+ (1, 4, "This is"), # Overlapping with 2 sentences
+ (0, 2, "This is"), # Beginning of the Doc. Full sentence
+ (0, 1, "This is"), # Beginning of the Doc. Part of a sentence
+ (10, 14, "And a"), # End of the Doc. Overlapping with 2 senteces
+ (12, 14, "third."), # End of the Doc. Full sentence
+ (1, 1, "This is"), # Empty Span
+ ],
+)
+def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
+
+ # Doc-level sents hook
+ def user_hook(doc):
+ return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)]
+
+ doc.user_hooks["sents"] = user_hook
+
+ # Make sure doc-level sents hook works
+ assert doc[start:end].sent.text == expected_sentence
+
+ # Span-level sent hook
+ doc.user_span_hooks["sent"] = lambda x: x
+    # Now, the span-level sent hook overrides the doc-level sents hook
+ assert doc[start:end].sent == doc[start:end]
+
+
def test_spans_lca_matrix(en_tokenizer):
"""Test span's lca matrix generation"""
tokens = en_tokenizer("the lazy dog slept")
@@ -357,6 +496,9 @@ def test_span_eq_hash(doc, doc_not_parsed):
assert hash(doc[0:2]) != hash(doc[1:3])
assert hash(doc[0:2]) != hash(doc_not_parsed[0:2])
+    # check that an out-of-bounds span is not equivalent to the span of the full doc
+ assert doc[0 : len(doc)] != doc[len(doc) : len(doc) + 1]
+
def test_span_boundaries(doc):
start = 1
@@ -369,6 +511,33 @@ def test_span_boundaries(doc):
with pytest.raises(IndexError):
span[5]
+ empty_span_0 = doc[0:0]
+ assert empty_span_0.text == ""
+ assert empty_span_0.start == 0
+ assert empty_span_0.end == 0
+ assert empty_span_0.start_char == 0
+ assert empty_span_0.end_char == 0
+
+ empty_span_1 = doc[1:1]
+ assert empty_span_1.text == ""
+ assert empty_span_1.start == 1
+ assert empty_span_1.end == 1
+ assert empty_span_1.start_char == empty_span_1.end_char
+
+ oob_span_start = doc[-len(doc) - 1 : -len(doc) - 10]
+ assert oob_span_start.text == ""
+ assert oob_span_start.start == 0
+ assert oob_span_start.end == 0
+ assert oob_span_start.start_char == 0
+ assert oob_span_start.end_char == 0
+
+ oob_span_end = doc[len(doc) + 1 : len(doc) + 10]
+ assert oob_span_end.text == ""
+ assert oob_span_end.start == len(doc)
+ assert oob_span_end.end == len(doc)
+ assert oob_span_end.start_char == len(doc.text)
+ assert oob_span_end.end_char == len(doc.text)
+
def test_span_lemma(doc):
# span lemmas should have the same number of spaces as the span
@@ -382,3 +551,58 @@ def test_sent(en_tokenizer):
assert not span.doc.has_annotation("SENT_START")
with pytest.raises(ValueError):
span.sent
+
+
+def test_span_with_vectors(doc):
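+    """Span.vector is (by default) the average of the token vectors, so spans
+    whose tokens have no vectors should compare equal to a zero vector."""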
+ ops = get_current_ops()
+ prev_vectors = doc.vocab.vectors
+ vectors = [
+ ("apple", ops.asarray([1, 2, 3])),
+ ("orange", ops.asarray([-1, -2, -3])),
+ ("And", ops.asarray([-1, -1, -1])),
+ ("juice", ops.asarray([5, 5, 10])),
+ ("pie", ops.asarray([7, 6.3, 8.9])),
+ ]
+ add_vecs_to_vocab(doc.vocab, vectors)
+ # 0-length span
+ assert_array_equal(ops.to_numpy(doc[0:0].vector), numpy.zeros((3,)))
+ # longer span with no vector
+ assert_array_equal(ops.to_numpy(doc[0:4].vector), numpy.zeros((3,)))
+ # single-token span with vector
+ assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
+ doc.vocab.vectors = prev_vectors
+
+
+@pytest.mark.parametrize(
+ "start,end,expected_sentences,expected_sentences_with_hook",
+ [
+ (0, 14, 3, 7), # Entire doc
+ (3, 6, 2, 2), # Overlapping with 2 sentences
+ (0, 4, 1, 2), # Beginning of the Doc. Full sentence
+ (0, 3, 1, 2), # Beginning of the Doc. Part of a sentence
+        (9, 14, 2, 3),  # End of the Doc. Overlapping with 2 sentences
+ (10, 14, 1, 2), # End of the Doc. Full sentence
+ (11, 14, 1, 2), # End of the Doc. Partial sentence
+ (0, 0, 1, 1), # Empty Span
+ ],
+)
+def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
+
+ assert len(list(doc[start:end].sents)) == expected_sentences
+
+ def user_hook(doc):
+ return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)]
+
+ doc.user_hooks["sents"] = user_hook
+
+ assert len(list(doc[start:end].sents)) == expected_sentences_with_hook
+
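+    # A span-level "sents" hook takes precedence over the doc-level "sents" hook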
+ doc.user_span_hooks["sents"] = lambda x: [x]
+
+ assert list(doc[start:end].sents)[0] == doc[start:end]
+ assert len(list(doc[start:end].sents)) == 1
+
+
+def test_span_sents_not_parsed(doc_not_parsed):
+ with pytest.raises(ValueError):
+ list(Span(doc_not_parsed, 0, 3).sents)
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 5ea0bcff0..e715c5e85 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -203,6 +203,12 @@ def test_set_pos():
assert doc[1].pos_ == "VERB"
+def test_set_invalid_pos():
+ doc = Doc(Vocab(), words=["hello", "world"])
+ with pytest.raises(ValueError):
+ doc[0].pos_ = "blah"
+
+
def test_tokens_sent(doc):
"""Test token.sent property"""
assert len(list(doc.sents)) == 3
diff --git a/spacy/tests/regression/__init__.py b/spacy/tests/lang/af/__init__.py
similarity index 100%
rename from spacy/tests/regression/__init__.py
rename to spacy/tests/lang/af/__init__.py
diff --git a/spacy/tests/lang/af/test_text.py b/spacy/tests/lang/af/test_text.py
new file mode 100644
index 000000000..99c2a9f4c
--- /dev/null
+++ b/spacy/tests/lang/af/test_text.py
@@ -0,0 +1,22 @@
+import pytest
+
+
+def test_long_text(af_tokenizer):
+ # Excerpt: Universal Declaration of Human Rights; “'n” changed to “die” in first sentence
+ text = """
+Hierdie Universele Verklaring van Menseregte as die algemene standaard vir die verwesenliking deur alle mense en nasies,
+om te verseker dat elke individu en elke deel van die gemeenskap hierdie Verklaring in ag sal neem en deur opvoeding,
+respek vir hierdie regte en vryhede te bevorder, op nasionale en internasionale vlak, daarna sal strewe om die universele
+en effektiewe erkenning en agting van hierdie regte te verseker, nie net vir die mense van die Lidstate nie, maar ook vir
+die mense in die gebiede onder hul jurisdiksie.
+
+"""
+ tokens = af_tokenizer(text)
+ assert len(tokens) == 100
+
+
+@pytest.mark.xfail
+def test_indefinite_article(af_tokenizer):
+ text = "as 'n algemene standaard"
+ tokens = af_tokenizer(text)
+ assert len(tokens) == 4
diff --git a/spacy/tests/lang/af/test_tokenizer.py b/spacy/tests/lang/af/test_tokenizer.py
new file mode 100644
index 000000000..db52db5e3
--- /dev/null
+++ b/spacy/tests/lang/af/test_tokenizer.py
@@ -0,0 +1,29 @@
+import pytest
+
+AF_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Elkeen het die reg tot lewe, vryheid en sekuriteit van persoon.",
+ [
+ "Elkeen",
+ "het",
+ "die",
+ "reg",
+ "tot",
+ "lewe",
+ ",",
+ "vryheid",
+ "en",
+ "sekuriteit",
+ "van",
+ "persoon",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", AF_BASIC_TOKENIZATION_TESTS)
+def test_af_tokenizer_basic(af_tokenizer, text, expected_tokens):
+ tokens = af_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py
index cfb574b63..499027ab1 100644
--- a/spacy/tests/lang/ca/test_exception.py
+++ b/spacy/tests/lang/ca/test_exception.py
@@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
- text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
- tokens = ca_tokenizer(text)
- assert len(tokens) == 15
- assert tokens[7].text == "aprox."
+ text = "La Dra. Puig viu a la pl. dels Til·lers."
+ doc = ca_tokenizer(text)
+ assert [t.text for t in doc] == [
+ "La",
+ "Dra.",
+ "Puig",
+ "viu",
+ "a",
+ "la",
+ "pl.",
+ "d",
+ "els",
+ "Til·lers",
+ ".",
+ ]
diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
index a3c76ab5b..afbdf3696 100644
--- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -2,7 +2,14 @@ import pytest
@pytest.mark.parametrize(
- "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
+ "text,expected_tokens",
+ [
+ ("d'un", ["d'", "un"]),
+ ("s'ha", ["s'", "ha"]),
+ ("del", ["d", "el"]),
+ ("cantar-te", ["cantar", "-te"]),
+ ("-hola", ["-", "hola"]),
+ ],
)
def test_contractions(ca_tokenizer, text, expected_tokens):
"""Test that the contractions are split into two tokens"""
diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py
index 55bad0e94..5db7af553 100644
--- a/spacy/tests/lang/ca/test_text.py
+++ b/spacy/tests/lang/ca/test_text.py
@@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text)
- assert len(tokens) == 140
+ assert len(tokens) == 146
@pytest.mark.parametrize(
"text,length",
[
- ("Perquè va anar-hi?", 4),
+ ("Perquè va anar-hi?", 5),
+ ("El cotxe dels veins.", 6),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3),
+ ("Vull parlar-te'n demà al matí", 8),
+ ("Vull explicar-t'ho demà al matí", 8),
],
)
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py
index 9dfb54fd6..a903496e8 100644
--- a/spacy/tests/lang/en/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -119,6 +119,7 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
assert tokens[4].text == "Mr."
+@pytest.mark.issue(225)
@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
tokens = en_tokenizer(
diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py
index 39d8d3b59..d30c72750 100644
--- a/spacy/tests/lang/en/test_sbd.py
+++ b/spacy/tests/lang/en/test_sbd.py
@@ -4,6 +4,15 @@ from spacy.tokens import Doc
from ...util import apply_transition_sequence
+@pytest.mark.issue(309)
+def test_issue309(en_vocab):
+ """Test Issue #309: SBD fails on empty string"""
+ doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"])
+ assert len(doc) == 1
+ sents = list(doc.sents)
+ assert len(sents) == 1
+
+
@pytest.mark.parametrize("words", [["A", "test", "sentence"]])
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
def test_en_sbd_single_punct(en_vocab, words, punct):
diff --git a/spacy/tests/lang/en/test_tokenizer.py b/spacy/tests/lang/en/test_tokenizer.py
new file mode 100644
index 000000000..e6d1d7d85
--- /dev/null
+++ b/spacy/tests/lang/en/test_tokenizer.py
@@ -0,0 +1,169 @@
+import pytest
+
+
+@pytest.mark.issue(351)
+def test_issue351(en_tokenizer):
+ doc = en_tokenizer(" This is a cat.")
+ assert doc[0].idx == 0
+ assert len(doc[0]) == 3
+ assert doc[1].idx == 3
+
+
+@pytest.mark.issue(360)
+def test_issue360(en_tokenizer):
+ """Test tokenization of big ellipsis"""
+ tokens = en_tokenizer("$45...............Asking")
+ assert len(tokens) > 2
+
+
+@pytest.mark.issue(736)
+@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
+def test_issue736(en_tokenizer, text, number):
+ """Test that times like "7am" are tokenized correctly and that numbers are
+    converted to strings."""
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+ assert tokens[0].text == number
+
+
+@pytest.mark.issue(740)
+@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
+def test_issue740(en_tokenizer, text):
+ """Test that dates are not split and kept as one token. This behaviour is
+ currently inconsistent, since dates separated by hyphens are still split.
+ This will be hard to prevent without causing clashes with numeric ranges."""
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 1
+
+
+@pytest.mark.issue(744)
+@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
+def test_issue744(en_tokenizer, text):
+ """Test that 'were' and 'Were' are excluded from the contractions
+ generated by the English tokenizer exceptions."""
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[1].text.lower() == "were"
+
+
+@pytest.mark.issue(759)
+@pytest.mark.parametrize(
+ "text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
+)
+def test_issue759(en_tokenizer, text, is_num):
+ tokens = en_tokenizer(text)
+ assert tokens[0].like_num == is_num
+
+
+@pytest.mark.issue(775)
+@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
+def test_issue775(en_tokenizer, text):
+ """Test that 'Shell' and 'shell' are excluded from the contractions
+ generated by the English tokenizer exceptions."""
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 1
+ assert tokens[0].text == text
+
+
+@pytest.mark.issue(792)
+@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
+def test_issue792(en_tokenizer, text):
+ """Test for Issue #792: Trailing whitespace is removed after tokenization."""
+ doc = en_tokenizer(text)
+ assert "".join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.issue(792)
+@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+ """Test base case for Issue #792: Non-trailing whitespace"""
+ doc = en_tokenizer(text)
+ assert "".join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.issue(859)
+@pytest.mark.parametrize(
+ "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
+)
+def test_issue859(en_tokenizer, text):
+ """Test that no extra space is added in doc.text method."""
+ doc = en_tokenizer(text)
+ assert doc.text == text
+
+
+@pytest.mark.issue(886)
+@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
+def test_issue886(en_tokenizer, text):
+ """Test that token.idx matches the original text index for texts with newlines."""
+ doc = en_tokenizer(text)
+ for token in doc:
+ assert len(token.text) == len(token.text_with_ws)
+ assert text[token.idx] == token.text[0]
+
+
+@pytest.mark.issue(891)
+@pytest.mark.parametrize("text", ["want/need"])
+def test_issue891(en_tokenizer, text):
+ """Test that / infixes are split correctly."""
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[1].text == "/"
+
+
+@pytest.mark.issue(957)
+@pytest.mark.slow
+def test_issue957(en_tokenizer):
+ """Test that spaCy doesn't hang on many punctuation characters.
+    If this test hangs, check (new) regular expressions for conflicting greedy operators.
+ """
+ # Skip test if pytest-timeout is not installed
+ pytest.importorskip("pytest_timeout")
+ for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]:
+ string = "0"
+ for i in range(1, 100):
+ string += punct + str(i)
+ doc = en_tokenizer(string)
+ assert doc
+
+
+@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
+@pytest.mark.issue(1698)
+def test_issue1698(en_tokenizer, text):
+ """Test that doc doesn't identify email-addresses as URLs"""
+ doc = en_tokenizer(text)
+ assert len(doc) == 1
+ assert not doc[0].like_url
+
+
+@pytest.mark.issue(1758)
+def test_issue1758(en_tokenizer):
+ """Test that "would've" is handled by the English tokenizer exceptions."""
+ tokens = en_tokenizer("would've")
+ assert len(tokens) == 2
+
+
+@pytest.mark.issue(1773)
+def test_issue1773(en_tokenizer):
+ """Test that spaces don't receive a POS but no TAG. This is the root cause
+ of the serialization issue reported in #1773."""
+ doc = en_tokenizer("\n")
+ if doc[0].pos_ == "SPACE":
+ assert doc[0].tag_ != ""
+
+
+@pytest.mark.issue(3277)
+def test_issue3277(es_tokenizer):
+ """Test that hyphens are split correctly as prefixes."""
+ doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
+ assert len(doc) == 14
+ assert doc[0].text == "\u2014"
+ assert doc[5].text == "\u2013"
+ assert doc[9].text == "\u2013"
+
+
+@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
+@pytest.mark.issue(3521)
+def test_issue3521(en_tokenizer, word):
+ tok = en_tokenizer(word)[1]
+ # 'not' and 'would' should be stopwords, also in their abbreviated forms
+ assert tok.is_stop
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index e5afd81c9..6118a0458 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -1,6 +1,156 @@
+from spacy.tokens import Doc
import pytest
+# fmt: off
+@pytest.mark.parametrize(
+ "words,heads,deps,pos,chunk_offsets",
+ [
+ # un gato -> "un gato"
+ (
+ ["un", "gato"],
+ [1, 1],
+ ["det", "ROOT"],
+ ["DET", "NOUN"],
+ [(0, 2)],
+ ),
+ # la camisa negra -> "la camisa negra"
+ (
+ ["la", "camisa", "negra"],
+ [1, 1, 1],
+ ["det", "ROOT", "amod"],
+ ["DET", "NOUN", "ADJ"],
+ [(0, 3)],
+ ),
+ # un lindo gatito -> "un lindo gatito"
+ (
+ ["Un", "lindo", "gatito"],
+ [2, 2, 2],
+ ["det", "amod", "ROOT"],
+ ["DET", "ADJ", "NOUN"],
+ [(0,3)]
+ ),
+ # una chica hermosa e inteligente -> una chica hermosa e inteligente
+ (
+ ["Una", "chica", "hermosa", "e", "inteligente"],
+ [1, 1, 1, 4, 2],
+ ["det", "ROOT", "amod", "cc", "conj"],
+ ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
+ [(0,5)]
+ ),
+ # el fabuloso gato pardo -> "el fabuloso gato pardo"
+ (
+ ["el", "fabuloso", "gato", "pardo"],
+ [2, 2, 2, 2],
+ ["det", "amod", "ROOT", "amod"],
+ ["DET", "ADJ", "NOUN", "ADJ"],
+ [(0,4)]
+ ),
+ # Tengo un gato y un perro -> un gato, un perro
+ (
+ ["Tengo", "un", "gato", "y", "un", "perro"],
+ [0, 2, 0, 5, 5, 0],
+ ["ROOT", "det", "obj", "cc", "det", "conj"],
+ ["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
+ [(1,3), (4,6)]
+
+ ),
+ # Dom Pedro II -> Dom Pedro II
+ (
+ ["Dom", "Pedro", "II"],
+ [0, 0, 0],
+ ["ROOT", "flat", "flat"],
+ ["PROPN", "PROPN", "PROPN"],
+ [(0,3)]
+ ),
+ # los Estados Unidos -> los Estados Unidos
+ (
+ ["los", "Estados", "Unidos"],
+ [1, 1, 1],
+ ["det", "ROOT", "flat"],
+ ["DET", "PROPN", "PROPN"],
+ [(0,3)]
+ ),
+ # Miguel de Cervantes -> Miguel de Cervantes
+ (
+ ["Miguel", "de", "Cervantes"],
+ [0, 2, 0],
+ ["ROOT", "case", "flat"],
+ ["PROPN", "ADP", "PROPN"],
+ [(0,3)]
+ ),
+ (
+ ["Rio", "de", "Janeiro"],
+ [0, 2, 0],
+ ["ROOT", "case", "flat"],
+ ["PROPN", "ADP", "PROPN"],
+ [(0,3)]
+ ),
+ # la destrucción de la ciudad -> la destrucción, la ciudad
+ (
+ ["la", "destrucción", "de", "la", "ciudad"],
+ [1, 1, 4, 4, 1],
+ ['det', 'ROOT', 'case', 'det', 'nmod'],
+ ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
+ [(0,2), (3,5)]
+ ),
+ # la traducción de Susana del informe -> la traducción, Susana, informe
+ (
+ ['la', 'traducción', 'de', 'Susana', 'del', 'informe'],
+ [1, 1, 3, 1, 5, 1],
+ ['det', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
+ ['DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NOUN'],
+ [(0,2), (3,4), (5,6)]
+
+ ),
+ # El gato regordete de Susana y su amigo -> el gato regordete, Susana, su amigo
+ (
+ ['El', 'gato', 'regordete', 'de', 'Susana', 'y', 'su', 'amigo'],
+ [1, 1, 1, 4, 1, 7, 7, 1],
+ ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
+ ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
+ [(0,3), (4,5), (6,8)]
+ ),
+ # Afirmó que sigue el criterio europeo y que trata de incentivar el mercado donde no lo hay -> el criterio europeo, el mercado, donde, lo
+ (
+ ['Afirmó', 'que', 'sigue', 'el', 'criterio', 'europeo', 'y', 'que', 'trata', 'de', 'incentivar', 'el', 'mercado', 'donde', 'no', 'lo', 'hay'],
+ [0, 2, 0, 4, 2, 4, 8, 8, 2, 10, 8, 12, 10, 16, 16, 16, 0],
+ ['ROOT', 'mark', 'ccomp', 'det', 'obj', 'amod', 'cc', 'mark', 'conj', 'mark', 'xcomp', 'det', 'obj', 'obl', 'advmod', 'obj', 'advcl'],
+ ['VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'SCONJ', 'VERB', 'ADP', 'VERB', 'DET', 'NOUN', 'PRON', 'ADV', 'PRON', 'AUX'],
+ [(3,6), (11,13), (13,14), (15,16)]
+ ),
+ # En este sentido se refirió a la reciente creación del Ministerio de Ciencia y Tecnología y a las primeras declaraciones de su titular, Anna Birulés, sobre el impulso de la investigación, desarrollo e innovación -> este sentido, se, la reciente creación, Ministerio de Ciencia y Tecnología, a las primeras declaraciones, su titular, , Anna Birulés,, el impulso, la investigación, , desarrollo, innovación
+ (
+ ['En', 'este', 'sentido', 'se', 'refirió', 'a', 'la', 'reciente', 'creación', 'del', 'Ministerio', 'de', 'Ciencia', 'y', 'Tecnología', 'y', 'a', 'las', 'primeras', 'declaraciones', 'de', 'su', 'titular', ',', 'Anna', 'Birulés', ',', 'sobre', 'el', 'impulso', 'de', 'la', 'investigación', ',', 'desarrollo', 'e', 'innovación'],
+ [2, 2, 4, 4, 4, 8, 8, 8, 4, 10, 8, 12, 10, 14, 12, 19, 19, 19, 19, 8, 22, 22, 19, 24, 22, 24, 24, 29, 29, 19, 32, 32, 29, 34, 32, 36, 32],
+ ['case', 'det', 'obl', 'obj', 'ROOT', 'case', 'det', 'amod', 'obj', 'case', 'nmod', 'case', 'flat', 'cc', 'conj', 'cc', 'case', 'det', 'amod', 'conj', 'case', 'det', 'nmod', 'punct', 'appos', 'flat', 'punct', 'case', 'det', 'nmod', 'case', 'det', 'nmod', 'punct', 'conj', 'cc', 'conj'],
+ ['ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'CCONJ', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN'],
+ [(1, 3), (3, 4), (6, 9), (10, 15), (16, 20), (21, 23), (23, 27), (28, 30), (31, 33), (33, 35), (36, 37)]
+ ),
+ # Asimismo defiende la financiación pública de la investigación básica y pone de manifiesto que las empresas se centran más en la investigación y desarrollo con objetivos de mercado. -> la financiación pública, la investigación básica, manifiesto, las empresas, se, la investigación, desarrollo, objetivos, mercado
+ (
+ ['Asimismo', 'defiende', 'la', 'financiación', 'pública', 'de', 'la', 'investigación', 'básica', 'y', 'pone', 'de', 'manifiesto', 'que', 'las', 'empresas', 'se', 'centran', 'más', 'en', 'la', 'investigación', 'y', 'desarrollo', 'con', 'objetivos', 'de', 'mercado'],
+ [1, 1, 3, 1, 3, 7, 7, 3, 7, 10, 1, 12, 10, 17, 15, 17, 17, 10, 17, 21, 21, 17, 23, 21, 25, 17, 27, 25],
+ ['advmod', 'ROOT', 'det', 'obj', 'amod', 'case', 'det', 'nmod', 'amod', 'cc', 'conj', 'case', 'obl', 'mark', 'det', 'nsubj', 'obj', 'ccomp', 'obj', 'case', 'det', 'obl', 'cc', 'conj', 'case', 'obl', 'case', 'nmod'],
+ ['ADV', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
+ [(2, 5), (6, 9), (12, 13), (14, 16), (16, 17), (20, 22), (23, 24), (25, 26), (27, 28)]
+ ),
+ # Tras indicar que la inversión media en investigación en la Unión Europea se sitúa en el 1,8 por ciento del PIB, frente al 2,8 por ciento en Japón y EEUU, Couceiro dijo que España está en "el buen camino" y se está creando un entorno propicio para la innovación empresarial' -> la inversión media, investigación, la Unión Europea, se, PIB, Japón, EEUU, Couceiro, España, se, un entorno propicio para la innovación empresaria
+ (
+ ['Tras', 'indicar', 'que', 'la', 'inversión', 'media', 'en', 'investigación', 'en', 'la', 'Unión', 'Europea', 'se', 'sitúa', 'en', 'el', '1,8', 'por', 'ciento', 'del', 'PIB', ',', 'frente', 'al', '2,8', 'por', 'ciento', 'en', 'Japón', 'y', 'EEUU', ',', 'Couceiro', 'dijo', 'que', 'España', 'está', 'en', '"', 'el', 'buen', 'camino', '"', 'y', 'se', 'está', 'creando', 'un', 'entorno', 'propicio', 'para', 'la', 'innovación', 'empresarial'],
+ [1, 33, 13, 4, 13, 4, 7, 4, 10, 10, 4, 10, 13, 1, 16, 16, 13, 18, 16, 20, 16, 24, 24, 22, 13, 26, 24, 28, 24, 30, 28, 1, 33, 33, 41, 41, 41, 41, 41, 41, 41, 33, 41, 46, 46, 46, 33, 48, 46, 48, 52, 52, 49, 52],
+ ['mark', 'advcl', 'mark', 'det', 'nsubj', 'amod', 'case', 'nmod', 'case', 'det', 'nmod', 'flat', 'obj', 'ccomp', 'case', 'det', 'obj', 'case', 'compound', 'case', 'nmod', 'punct', 'case', 'fixed', 'obl', 'case', 'compound', 'case', 'nmod', 'cc', 'conj', 'punct', 'nsubj', 'ROOT', 'mark', 'nsubj', 'cop', 'case', 'punct', 'det', 'amod', 'ccomp', 'punct', 'cc', 'obj', 'aux', 'conj', 'det', 'nsubj', 'amod', 'case', 'det', 'nmod', 'amod'],
+ ['ADP', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PRON', 'VERB', 'ADP', 'DET', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'PUNCT', 'NOUN', 'ADP', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT', 'PROPN', 'VERB', 'SCONJ', 'PROPN', 'AUX', 'ADP', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ'],
+ [(3, 6), (7, 8), (9, 12), (12, 13), (20, 21), (28, 29), (30, 31), (32, 33), (35, 36), (44, 45), (47, 54)]
+ ),
+ ],
+)
+# fmt: on
+def test_es_noun_chunks(es_vocab, words, heads, deps, pos, chunk_offsets):
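+    """Noun chunks are compared against the expected (start, end) token offsets."""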
+ doc = Doc(es_vocab, words=words, heads=heads, deps=deps, pos=pos)
+ assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
+
+
def test_noun_chunks_is_parsed_es(es_tokenizer):
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
doc = es_tokenizer("en Oxford este verano")
diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py
index 96f6bcab5..d95f6d26b 100644
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@@ -1,5 +1,16 @@
import pytest
from spacy.lang.es.lex_attrs import like_num
+from spacy.lang.es import Spanish
+
+
+@pytest.mark.issue(3803)
+def test_issue3803():
+ """Test that spanish num-like tokens have True for like_num attribute."""
+ nlp = Spanish()
+ text = "2 dos 1000 mil 12 doce"
+ doc = nlp(text)
+
+ assert [t.like_num for t in doc] == [True, True, True, True, True, True]
def test_es_tokenizer_handles_long_text(es_tokenizer):
diff --git a/spacy/tests/lang/et/__init__.py b/spacy/tests/lang/et/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/et/test_text.py b/spacy/tests/lang/et/test_text.py
new file mode 100644
index 000000000..9515a7cc1
--- /dev/null
+++ b/spacy/tests/lang/et/test_text.py
@@ -0,0 +1,26 @@
+import pytest
+
+
+def test_long_text(et_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+arvestades, et nimetatud deklaratsiooni eesmärk on tagada selles
+kuulutatud õiguste üldine ja tõhus tunnustamine ning järgimine;
+arvestades, et Euroopa Nõukogu eesmärk on saavutada tema
+liikmete suurem ühtsus ning et üheks selle eesmärgi saavutamise
+vahendiks on inimõiguste ja põhivabaduste järgimine ning
+elluviimine;
+taaskinnitades oma sügavat usku neisse põhivabadustesse, mis
+on õigluse ja rahu aluseks maailmas ning mida kõige paremini
+tagab ühelt poolt tõhus poliitiline demokraatia ning teiselt poolt
+inimõiguste, millest nad sõltuvad, üldine mõistmine ja järgimine;
+"""
+ tokens = et_tokenizer(text)
+ assert len(tokens) == 94
+
+
+@pytest.mark.xfail
+def test_ordinal_number(et_tokenizer):
+ text = "10. detsembril 1948"
+ tokens = et_tokenizer(text)
+ assert len(tokens) == 3
diff --git a/spacy/tests/lang/et/test_tokenizer.py b/spacy/tests/lang/et/test_tokenizer.py
new file mode 100644
index 000000000..f0f8079ca
--- /dev/null
+++ b/spacy/tests/lang/et/test_tokenizer.py
@@ -0,0 +1,29 @@
+import pytest
+
+ET_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda "
+ "ega karistada.",
+ [
+ "Kedagi",
+ "ei",
+ "või",
+ "piinata",
+ "ega",
+ "ebainimlikult",
+ "või",
+ "alandavalt",
+ "kohelda",
+ "ega",
+ "karistada",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", ET_BASIC_TOKENIZATION_TESTS)
+def test_et_tokenizer_basic(et_tokenizer, text, expected_tokens):
+ tokens = et_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py
index 2ead34069..272531b63 100644
--- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py
@@ -1,9 +1,10 @@
import pytest
-from spacy.language import Language
+from spacy.language import Language, BaseDefaults
from spacy.lang.punctuation import TOKENIZER_INFIXES
from spacy.lang.char_classes import ALPHA
+@pytest.mark.issue(768)
@pytest.mark.parametrize(
"text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])]
)
@@ -12,7 +13,7 @@ def test_issue768(text, expected_tokens):
SPLIT_INFIX = r"(?<=[{a}]\')(?=[{a}])".format(a=ALPHA)
class FrenchTest(Language):
- class Defaults(Language.Defaults):
+ class Defaults(BaseDefaults):
infixes = TOKENIZER_INFIXES + [SPLIT_INFIX]
fr_tokenizer_w_infix = FrenchTest().tokenizer
diff --git a/spacy/tests/lang/grc/__init__.py b/spacy/tests/lang/grc/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/grc/test_text.py b/spacy/tests/lang/grc/test_text.py
new file mode 100644
index 000000000..5d8317c36
--- /dev/null
+++ b/spacy/tests/lang/grc/test_text.py
@@ -0,0 +1,23 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+ "text,match",
+ [
+ ("ι", True),
+ ("α", True),
+ ("ϟα", True),
+ ("ἑκατόν", True),
+ ("ἐνακόσια", True),
+ ("δισχίλια", True),
+ ("μύρια", True),
+ ("εἷς", True),
+ ("λόγος", False),
+ (",", False),
+ ("λβ", True),
+ ],
+)
+def test_lex_attrs_like_number(grc_tokenizer, text, match):
+ tokens = grc_tokenizer(text)
+ assert len(tokens) == 1
+ assert tokens[0].like_num == match
diff --git a/spacy/tests/lang/hi/test_text.py b/spacy/tests/lang/hi/test_text.py
new file mode 100644
index 000000000..791cc3822
--- /dev/null
+++ b/spacy/tests/lang/hi/test_text.py
@@ -0,0 +1,11 @@
+import pytest
+from spacy.lang.hi import Hindi
+
+
+@pytest.mark.issue(3625)
+def test_issue3625():
+ """Test that default punctuation rules applies to hindi unicode characters"""
+ nlp = Hindi()
+ doc = nlp("hi. how हुए. होटल, होटल")
+ expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
+ assert [token.text for token in doc] == expected
diff --git a/spacy/tests/lang/hr/__init__.py b/spacy/tests/lang/hr/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/hr/test_text.py b/spacy/tests/lang/hr/test_text.py
new file mode 100644
index 000000000..82e65afe7
--- /dev/null
+++ b/spacy/tests/lang/hr/test_text.py
@@ -0,0 +1,26 @@
+import pytest
+
+
+def test_long_text(hr_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+uzimajući u obzir da ta deklaracija nastoji osigurati opće i djelotvorno
+priznanje i poštovanje u njoj proglašenih prava;
+uzimajući u obzir da je cilj Vijeća Europe postizanje većeg jedinstva
+njegovih članica, i da je jedan od načina postizanja toga cilja
+očuvanje i daljnje ostvarivanje ljudskih prava i temeljnih sloboda;
+potvrđujući svoju duboku privrženost tim temeljnim slobodama
+koje su osnova pravde i mira u svijetu i koje su najbolje zaštićene
+istinskom političkom demokracijom s jedne strane te zajedničkim
+razumijevanjem i poštovanjem ljudskih prava o kojima te slobode
+ovise s druge strane;
+"""
+ tokens = hr_tokenizer(text)
+ assert len(tokens) == 105
+
+
+@pytest.mark.xfail
+def test_ordinal_number(hr_tokenizer):
+ text = "10. prosinca 1948"
+ tokens = hr_tokenizer(text)
+ assert len(tokens) == 3
diff --git a/spacy/tests/lang/hr/test_tokenizer.py b/spacy/tests/lang/hr/test_tokenizer.py
new file mode 100644
index 000000000..dace33b2d
--- /dev/null
+++ b/spacy/tests/lang/hr/test_tokenizer.py
@@ -0,0 +1,31 @@
+import pytest
+
+HR_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Nitko se ne smije podvrgnuti mučenju ni nečovječnom ili "
+ "ponižavajućem postupanju ili kazni.",
+ [
+ "Nitko",
+ "se",
+ "ne",
+ "smije",
+ "podvrgnuti",
+ "mučenju",
+ "ni",
+ "nečovječnom",
+ "ili",
+ "ponižavajućem",
+ "postupanju",
+ "ili",
+ "kazni",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", HR_BASIC_TOKENIZATION_TESTS)
+def test_hr_tokenizer_basic(hr_tokenizer, text, expected_tokens):
+ tokens = hr_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py
index fd3acd0a0..0488474ae 100644
--- a/spacy/tests/lang/hu/test_tokenizer.py
+++ b/spacy/tests/lang/hu/test_tokenizer.py
@@ -294,7 +294,7 @@ WIKI_TESTS = [
]
EXTRA_TESTS = (
- DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
+ DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS # type: ignore[operator]
)
# normal: default tests + 10% of extra tests
diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/is/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/is/test_text.py
new file mode 100644
index 000000000..6e3654a6e
--- /dev/null
+++ b/spacy/tests/lang/is/test_text.py
@@ -0,0 +1,26 @@
+import pytest
+
+
+def test_long_text(is_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja
+almenna og raunhæfa viðurkenningu og vernd þeirra réttinda,
+sem þar er lýst;
+hafa í huga, að markmið Evrópuráðs er að koma á nánari einingu
+aðildarríkjanna og að ein af leiðunum að því marki er sú, að
+mannréttindi og mannfrelsi séu í heiðri höfð og efld;
+lýsa á ný eindreginni trú sinni á það mannfrelsi, sem er undirstaða
+réttlætis og friðar í heiminum og best er tryggt, annars vegar með
+virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
+og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
+"""
+ tokens = is_tokenizer(text)
+ assert len(tokens) == 120
+
+
+@pytest.mark.xfail
+def test_ordinal_number(is_tokenizer):
+ text = "10. desember 1948"
+ tokens = is_tokenizer(text)
+ assert len(tokens) == 3
diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/is/test_tokenizer.py
new file mode 100644
index 000000000..0c05a6050
--- /dev/null
+++ b/spacy/tests/lang/is/test_tokenizer.py
@@ -0,0 +1,30 @@
+import pytest
+
+IS_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Enginn maður skal sæta pyndingum eða ómannlegri eða "
+ "vanvirðandi meðferð eða refsingu. ",
+ [
+ "Enginn",
+ "maður",
+ "skal",
+ "sæta",
+ "pyndingum",
+ "eða",
+ "ómannlegri",
+ "eða",
+ "vanvirðandi",
+ "meðferð",
+ "eða",
+ "refsingu",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
+def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
+ tokens = is_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/it/test_text.py b/spacy/tests/lang/it/test_text.py
new file mode 100644
index 000000000..6023a20b1
--- /dev/null
+++ b/spacy/tests/lang/it/test_text.py
@@ -0,0 +1,14 @@
+import pytest
+
+
+@pytest.mark.issue(2822)
+def test_issue2822(it_tokenizer):
+ """Test that the abbreviation of poco is kept as one word."""
+ doc = it_tokenizer("Vuoi un po' di zucchero?")
+ assert len(doc) == 6
+ assert doc[0].text == "Vuoi"
+ assert doc[1].text == "un"
+ assert doc[2].text == "po'"
+ assert doc[3].text == "di"
+ assert doc[4].text == "zucchero"
+ assert doc[5].text == "?"
diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py
index 6041611e6..21879a569 100644
--- a/spacy/tests/lang/ja/test_lemmatization.py
+++ b/spacy/tests/lang/ja/test_lemmatization.py
@@ -8,3 +8,17 @@ import pytest
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_
assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+ "word,norm",
+ [
+ ("SUMMER", "サマー"),
+ ("食べ物", "食べ物"),
+ ("綜合", "総合"),
+ ("コンピュータ", "コンピューター"),
+ ],
+)
+def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
+ test_norm = ja_tokenizer(word)[0].norm_
+ assert test_norm == norm
diff --git a/spacy/tests/lang/ja/test_morphologizer_factory.py b/spacy/tests/lang/ja/test_morphologizer_factory.py
new file mode 100644
index 000000000..a4e038d01
--- /dev/null
+++ b/spacy/tests/lang/ja/test_morphologizer_factory.py
@@ -0,0 +1,9 @@
+import pytest
+from spacy.lang.ja import Japanese
+
+
+def test_ja_morphologizer_factory():
+ pytest.importorskip("sudachipy")
+ nlp = Japanese()
+ morphologizer = nlp.add_pipe("morphologizer")
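+    # The Japanese defaults override the morphologizer config so that existing
+    # morph annotation from the tokenizer is extended rather than overwritten.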
+ assert morphologizer.cfg["extend"] is True
diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py
index e05a363bf..011eb470f 100644
--- a/spacy/tests/lang/ja/test_serialize.py
+++ b/spacy/tests/lang/ja/test_serialize.py
@@ -1,3 +1,5 @@
+import pickle
+
from spacy.lang.ja import Japanese
from ...util import make_tempdir
@@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.split_mode == "B"
+
+
+def test_ja_tokenizer_pickle(ja_tokenizer):
+ b = pickle.dumps(ja_tokenizer)
+ ja_tokenizer_re = pickle.loads(b)
+ assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index c8c85d655..ef7bed06d 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -34,26 +34,38 @@ SENTENCE_TESTS = [
]
tokens1 = [
- DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
- DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
+ DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
+ DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
]
tokens2 = [
- DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
- DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
- DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
- DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
+ DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
+ DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
+ DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
+ DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
]
tokens3 = [
- DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
- DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
- DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None),
+ DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
+ DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
+ DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
]
SUB_TOKEN_TESTS = [
- ("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]])
+ ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
]
# fmt: on
+@pytest.mark.issue(2901)
+def test_issue2901():
+ """Test that `nlp` doesn't fail."""
+ try:
+ nlp = Japanese()
+ except ImportError:
+ pytest.skip()
+
+ doc = nlp("pythonが大好きです")
+ assert doc
+
+
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tokens):
tokens = [token.text for token in ja_tokenizer(text)]
@@ -111,18 +123,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
assert len(nlp_c(text)) == len_c
-@pytest.mark.parametrize(
- "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
-)
+@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
def test_ja_tokenizer_sub_tokens(
- ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
+ ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
- assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
- assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+ assert ja_tokenizer(text).user_data.get("sub_tokens") is None
+ assert nlp_a(text).user_data.get("sub_tokens") is None
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
@@ -132,16 +142,20 @@ def test_ja_tokenizer_sub_tokens(
[
(
"取ってつけた",
- ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
- ("トッ", "テ", "ツケ", "タ"),
+ (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+ (["トッ"], ["テ"], ["ツケ"], ["タ"]),
),
+ ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])),
],
)
def test_ja_tokenizer_inflections_reading_forms(
ja_tokenizer, text, inflections, reading_forms
):
- assert ja_tokenizer(text).user_data["inflections"] == inflections
- assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
+ tokens = ja_tokenizer(text)
+ test_inflections = [tt.morph.get("Inflection") for tt in tokens]
+ assert test_inflections == list(inflections)
+ test_readings = [tt.morph.get("Reading") for tt in tokens]
+ assert test_readings == list(reading_forms)
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
new file mode 100644
index 000000000..75288fcc5
--- /dev/null
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.ko import Korean
+from ...util import make_tempdir
+
+
+def test_ko_tokenizer_serialize(ko_tokenizer):
+ tokenizer_bytes = ko_tokenizer.to_bytes()
+ nlp = Korean()
+ nlp.tokenizer.from_bytes(tokenizer_bytes)
+ assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+ with make_tempdir() as d:
+ file_path = d / "tokenizer"
+ ko_tokenizer.to_disk(file_path)
+ nlp = Korean()
+ nlp.tokenizer.from_disk(file_path)
+ assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_pickle(ko_tokenizer):
+ b = pickle.dumps(ko_tokenizer)
+ ko_tokenizer_re = pickle.loads(b)
+ assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py
index 91a048764..5cf6eb1a6 100644
--- a/spacy/tests/lang/ky/test_tokenizer.py
+++ b/spacy/tests/lang/ky/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/lv/__init__.py b/spacy/tests/lang/lv/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/lv/test_text.py b/spacy/tests/lang/lv/test_text.py
new file mode 100644
index 000000000..5ca5fd0a7
--- /dev/null
+++ b/spacy/tests/lang/lv/test_text.py
@@ -0,0 +1,27 @@
+import pytest
+
+
+def test_long_text(lv_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+Ievērodamas, ka šī deklarācija paredz nodrošināt vispārēju un
+efektīvu tajā pasludināto tiesību atzīšanu un ievērošanu;
+Ievērodamas, ka Eiropas Padomes mērķis ir panākt lielāku vienotību
+tās dalībvalstu starpā un ka viens no līdzekļiem, kā šo mērķi
+sasniegt, ir cilvēka tiesību un pamatbrīvību ievērošana un turpmāka
+īstenošana;
+No jauna apliecinādamas patiesu pārliecību, ka šīs pamatbrīvības
+ir taisnīguma un miera pamats visā pasaulē un ka tās vislabāk var
+nodrošināt patiess demokrātisks politisks režīms no vienas puses un
+vispārējo cilvēktiesību, uz kurām tās pamatojas, kopīga izpratne un
+ievērošana no otras puses;
+"""
+ tokens = lv_tokenizer(text)
+ assert len(tokens) == 109
+
+
+@pytest.mark.xfail
+def test_ordinal_number(lv_tokenizer):
+ text = "10. decembrī"
+ tokens = lv_tokenizer(text)
+ assert len(tokens) == 2
diff --git a/spacy/tests/lang/lv/test_tokenizer.py b/spacy/tests/lang/lv/test_tokenizer.py
new file mode 100644
index 000000000..3ce7ad5fa
--- /dev/null
+++ b/spacy/tests/lang/lv/test_tokenizer.py
@@ -0,0 +1,30 @@
+import pytest
+
+LV_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Nevienu nedrīkst spīdzināt vai cietsirdīgi vai pazemojoši ar viņu "
+ "apieties vai sodīt.",
+ [
+ "Nevienu",
+ "nedrīkst",
+ "spīdzināt",
+ "vai",
+ "cietsirdīgi",
+ "vai",
+ "pazemojoši",
+ "ar",
+ "viņu",
+ "apieties",
+ "vai",
+ "sodīt",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", LV_BASIC_TOKENIZATION_TESTS)
+def test_lv_tokenizer_basic(lv_tokenizer, text, expected_tokens):
+ tokens = lv_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/nl/test_noun_chunks.py b/spacy/tests/lang/nl/test_noun_chunks.py
new file mode 100644
index 000000000..73b501e4a
--- /dev/null
+++ b/spacy/tests/lang/nl/test_noun_chunks.py
@@ -0,0 +1,209 @@
+from spacy.tokens import Doc
+import pytest
+
+
+@pytest.fixture
+def nl_sample(nl_vocab):
+ # Text:
+ # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
+ # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
+ # geen avondeten gekocht.
+ words = [
+ "Haar",
+ "vriend",
+ "lacht",
+ "luid",
+ ".",
+ "We",
+ "kregen",
+ "alweer",
+ "ruzie",
+ "toen",
+ "we",
+ "de",
+ "supermarkt",
+ "ingingen",
+ ".",
+ "Aan",
+ "het",
+ "begin",
+ "van",
+ "de",
+ "supermarkt",
+ "is",
+ "al",
+ "het",
+ "fruit",
+ "en",
+ "de",
+ "groentes",
+ ".",
+ "Uiteindelijk",
+ "hebben",
+ "we",
+ "dan",
+ "ook",
+ "geen",
+ "avondeten",
+ "gekocht",
+ ".",
+ ]
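+ # heads are absolute token indices of each token's syntactic head; ROOT tokens point to themselves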
+ heads = [
+ 1,
+ 2,
+ 2,
+ 2,
+ 2,
+ 6,
+ 6,
+ 6,
+ 6,
+ 13,
+ 13,
+ 12,
+ 13,
+ 6,
+ 6,
+ 17,
+ 17,
+ 24,
+ 20,
+ 20,
+ 17,
+ 24,
+ 24,
+ 24,
+ 24,
+ 27,
+ 27,
+ 24,
+ 24,
+ 36,
+ 36,
+ 36,
+ 36,
+ 36,
+ 35,
+ 36,
+ 36,
+ 36,
+ ]
+ deps = [
+ "nmod:poss",
+ "nsubj",
+ "ROOT",
+ "advmod",
+ "punct",
+ "nsubj",
+ "ROOT",
+ "advmod",
+ "obj",
+ "mark",
+ "nsubj",
+ "det",
+ "obj",
+ "advcl",
+ "punct",
+ "case",
+ "det",
+ "obl",
+ "case",
+ "det",
+ "nmod",
+ "cop",
+ "advmod",
+ "det",
+ "ROOT",
+ "cc",
+ "det",
+ "conj",
+ "punct",
+ "advmod",
+ "aux",
+ "nsubj",
+ "advmod",
+ "advmod",
+ "det",
+ "obj",
+ "ROOT",
+ "punct",
+ ]
+ pos = [
+ "PRON",
+ "NOUN",
+ "VERB",
+ "ADJ",
+ "PUNCT",
+ "PRON",
+ "VERB",
+ "ADV",
+ "NOUN",
+ "SCONJ",
+ "PRON",
+ "DET",
+ "NOUN",
+ "NOUN",
+ "PUNCT",
+ "ADP",
+ "DET",
+ "NOUN",
+ "ADP",
+ "DET",
+ "NOUN",
+ "AUX",
+ "ADV",
+ "DET",
+ "NOUN",
+ "CCONJ",
+ "DET",
+ "NOUN",
+ "PUNCT",
+ "ADJ",
+ "AUX",
+ "PRON",
+ "ADV",
+ "ADV",
+ "DET",
+ "NOUN",
+ "VERB",
+ "PUNCT",
+ ]
+ return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)
+
+
+@pytest.fixture
+def nl_reference_chunking():
+ # Using Frog (https://github.com/LanguageMachines/frog/), we obtain the following noun phrases:
+ return [
+ "haar vriend",
+ "we",
+ "ruzie",
+ "we",
+ "de supermarkt",
+ "het begin",
+ "de supermarkt",
+ "het fruit",
+ "de groentes",
+ "we",
+ "geen avondeten",
+ ]
+
+
+def test_need_dep(nl_tokenizer):
+ """
+ Test that noun_chunks raises a ValueError for the 'nl' language if the Doc is not parsed.
+ """
+ txt = "Haar vriend lacht luid."
+ doc = nl_tokenizer(txt)
+
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
+
+
+def test_chunking(nl_sample, nl_reference_chunking):
+ """
+ Test the noun chunks of a sample text.
+ The sample simulates a Doc object as it would be produced by nl_core_news_md.
+ """
+ chunks = [s.text.lower() for s in nl_sample.noun_chunks]
+ assert chunks == nl_reference_chunking
diff --git a/spacy/tests/lang/pt/test_noun_chunks.py b/spacy/tests/lang/pt/test_noun_chunks.py
new file mode 100644
index 000000000..9a42ce268
--- /dev/null
+++ b/spacy/tests/lang/pt/test_noun_chunks.py
@@ -0,0 +1,221 @@
+from spacy.tokens import Doc
+import pytest
+
+
+# fmt: off
+@pytest.mark.parametrize(
+ "words,heads,deps,pos,chunk_offsets",
+ [
+ # determiner + noun
+ # um cachorro -> um cachorro
+ (
+ ["um", "cachorro"],
+ [1, 1],
+ ["det", "ROOT"],
+ ["DET", "NOUN"],
+ [(0, 2)],
+ ),
+ # two determiners + noun
+ # meu o pai -> meu o pai
+ (
+ ["meu", "o", "pai"],
+ [2, 2, 2],
+ ["det", "det", "ROOT"],
+ ["DET", "DET", "NOUN"],
+ [(0, 3)],
+ ),
+ # two determiners + noun
+ # todos essos caros -> todos essos caros
+ (
+ ["todos", "essos", "caros"],
+ [2, 2, 2],
+ ["det", "det", "ROOT"],
+ ["DET", "DET", "NOUN"],
+ [(0, 3)],
+ ),
+ # two determiners, one is after noun
+ # um irmão meu -> um irmão meu
+ (
+ ["um", "irmão", "meu"],
+ [1, 1, 1],
+ ["det", "ROOT", "det"],
+ ["DET", "NOUN", "DET"],
+ [(0, 3)],
+ ),
+ # two determiners + noun
+ # o meu pai -> o meu pai
+ (
+ ["o", "meu", "pai"],
+ [2, 2, 2],
+ ["det","det", "ROOT"],
+ ["DET", "DET", "NOUN"],
+ [(0, 3)],
+ ),
+ # relative pronoun
+ # A bicicleta essa está estragada -> A bicicleta
+ (
+ ['A', 'bicicleta', 'essa', 'está', 'estragada'],
+ [1, 4, 1, 4, 4],
+ ['det', 'nsubj', 'det', 'cop', 'ROOT'],
+ ['DET', 'NOUN', 'PRON', 'AUX', 'ADJ'],
+ [(0,2)]
+ ),
+ # relative subclause
+ # o computador que comprou -> o computador
+ (
+ ['o', 'computador', 'que', 'comprou'],
+ [1, 1, 3, 1],
+ ['det', 'ROOT', 'nsubj', 'acl:relcl'],
+ ['DET', 'NOUN', 'PRON', 'VERB'],
+ [(0, 2), (2, 3)]
+ ),
+ # det + noun + adj
+ # O cachorro marrom -> O cachorro marrom
+ (
+ ["O", "cachorro", "marrom"],
+ [1, 1, 1],
+ ["det", "ROOT", "amod"],
+ ["DET", "NOUN", "ADJ"],
+ [(0, 3)],
+ ),
+ # det + noun + adj plural
+ # As calças baratas -> As calças baratas
+ (
+ ["As", "calças", "baratas"],
+ [1, 1, 1],
+ ["det", "ROOT", "amod"],
+ ["DET", "NOUN", "ADJ"],
+ [(0, 3)],
+ ),
+ # det + adj + noun
+ # Uma boa ideia -> Uma boa ideia
+ (
+ ['uma', 'boa', 'ideia'],
+ [2, 2, 2],
+ ["det", "amod", "ROOT"],
+ ["DET", "ADJ", "NOUN"],
+ [(0,3)]
+ ),
+ # multiple adjectives
+ # Uma garota esperta e inteligente -> Uma garota esperta e inteligente
+ (
+ ["Uma", "garota", "esperta", "e", "inteligente"],
+ [1, 1, 1, 4, 2],
+ ["det", "ROOT", "amod", "cc", "conj"],
+ ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
+ [(0,5)]
+ ),
+ # determiner, adjective, compound created by flat
+ # a grande São Paolo -> a grande São Paolo
+ (
+ ["a", "grande", "São", "Paolo"],
+ [2, 2, 2, 2],
+ ["det", "amod", "ROOT", "flat:name"],
+ ["DET", "ADJ", "PROPN", "PROPN"],
+ [(0,4)]
+ ),
+ # one determiner + one noun + one adjective qualified by an adverb
+ # alguns fazendeiros muito ricos -> alguns fazendeiros muito ricos
+ (
+ ['alguns', 'fazendeiros', 'muito', 'ricos'],
+ [1, 1, 3, 1],
+ ['det', 'ROOT', 'advmod', 'amod'],
+ ['DET', 'NOUN', 'ADV', 'ADJ'],
+ [(0,4)]
+ ),
+ # Two NPs conjuncted
+ # Eu tenho um cachorro e um gato -> Eu, um cacharo, um gato
+ (
+ ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"],
+ [1, 1, 3, 1, 6, 6, 3],
+ ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
+ ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
+ [(0,1), (2,4), (5,7)]
+
+ ),
+ # Two NPs together
+ # o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado
+ (
+ ['o', 'escritor', 'brasileiro', 'Aníbal', 'Machado'],
+ [1, 1, 1, 1, 3],
+ ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
+ ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
+ [(0, 3), (3, 5)]
+ ),
+ # Noun compound, person name and titles
+ # Dom Pedro II -> Dom Pedro II
+ (
+ ["Dom", "Pedro", "II"],
+ [0, 0, 0],
+ ["ROOT", "flat:name", "flat:name"],
+ ["PROPN", "PROPN", "PROPN"],
+ [(0,3)]
+ ),
+ # Noun compound created by flat
+ # os Estados Unidos -> os Estados Unidos
+ (
+ ["os", "Estados", "Unidos"],
+ [1, 1, 1],
+ ["det", "ROOT", "flat:name"],
+ ["DET", "PROPN", "PROPN"],
+ [(0,3)]
+ ),
+ # nmod relation between NPs
+ # a destruição da cidade -> a destruição, cidade
+ (
+ ['a', 'destruição', 'da', 'cidade'],
+ [1, 1, 3, 1],
+ ['det', 'ROOT', 'case', 'nmod'],
+ ['DET', 'NOUN', 'ADP', 'NOUN'],
+ [(0,2), (3,4)]
+ ),
+ # Compounding by nmod, several NPs chained together
+ # a primeira fábrica de medicamentos do governo -> a primeira fábrica, medicamentos, governo
+ (
+ ["a", "primeira", "fábrica", "de", "medicamentos", "do", "governo"],
+ [2, 2, 2, 4, 2, 6, 2],
+ ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
+ ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
+ [(0, 3), (4, 5), (6, 7)]
+ ),
+ # several NPs
+ # Tradução da reportagem de Susana -> Tradução, reportagem, Susana
+ (
+ ['Tradução', 'da', 'reportagem', 'de', 'Susana'],
+ [0, 2, 0, 4, 2],
+ ['ROOT', 'case', 'nmod', 'case', 'nmod'],
+ ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
+ [(0,1), (2,3), (4,5)]
+
+ ),
+ # Several NPs
+ # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo
+ (
+ ['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'],
+ [1, 1, 1, 4, 1, 7, 7, 1],
+ ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
+ ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
+ [(0,3), (4,5), (6,8)]
+ ),
+ # Passive subject
+ # Os novos gastos são alimentados pela grande conta bancária de Clinton -> Os novos gastos, grande conta bancária, Clinton
+ (
+ ['Os', 'novos', 'gastos', 'são', 'alimentados', 'pela', 'grande', 'conta', 'bancária', 'de', 'Clinton'],
+ [2, 2, 4, 4, 4, 7, 7, 4, 7, 10, 7],
+ ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
+ ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
+ [(0, 3), (6, 9), (10, 11)]
+ )
+ ],
+)
+# fmt: on
+def test_pt_noun_chunks(pt_vocab, words, heads, deps, pos, chunk_offsets):
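+ # chunk_offsets holds the expected (start, end) token offsets of each noun chunk span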
+ doc = Doc(pt_vocab, words=words, heads=heads, deps=deps, pos=pos)
+ assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
+
+
+def test_noun_chunks_is_parsed_pt(pt_tokenizer):
+ """Test that noun_chunks raises Value Error for 'pt' language if Doc is not parsed."""
+ doc = pt_tokenizer("en Oxford este verano")
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/sk/__init__.py b/spacy/tests/lang/sk/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/sk/test_text.py b/spacy/tests/lang/sk/test_text.py
new file mode 100644
index 000000000..62ea2a783
--- /dev/null
+++ b/spacy/tests/lang/sk/test_text.py
@@ -0,0 +1,48 @@
+import pytest
+
+
+def test_long_text(sk_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+majúc na zreteli, že cieľom tejto deklarácie je zabezpečiť všeobecné
+a účinné uznávanie a dodržiavanie práv v nej vyhlásených;
+majúc na zreteli, že cieľom Rady Európy je dosiahnutie väčšej
+jednoty medzi jej členmi, a že jedným zo spôsobov, ktorým sa
+má tento cieľ napĺňať, je ochrana a ďalší rozvoj ľudských práv
+a základných slobôd;
+znovu potvrdzujúc svoju hlbokú vieru v tie základné slobody, ktoré
+sú základom spravodlivosti a mieru vo svete, a ktoré sú najlepšie
+zachovávané na jednej strane účinnou politickou demokraciou
+a na strane druhej spoločným poňatím a dodržiavaním ľudských
+práv, od ktorých závisia;
+ """
+ tokens = sk_tokenizer(text)
+ assert len(tokens) == 118
+
+
+@pytest.mark.parametrize(
+ "text,match",
+ [
+ ("10", True),
+ ("1", True),
+ ("10,000", True),
+ ("10,00", True),
+ ("štyri", True),
+ ("devätnásť", True),
+ ("milión", True),
+ ("pes", False),
+ (",", False),
+ ("1/2", True),
+ ],
+)
+def test_lex_attrs_like_number(sk_tokenizer, text, match):
+ tokens = sk_tokenizer(text)
+ assert len(tokens) == 1
+ assert tokens[0].like_num == match
+
+
+@pytest.mark.xfail
+def test_ordinal_number(sk_tokenizer):
+ text = "10. decembra 1948"
+ tokens = sk_tokenizer(text)
+ assert len(tokens) == 3
diff --git a/spacy/tests/lang/sk/test_tokenizer.py b/spacy/tests/lang/sk/test_tokenizer.py
new file mode 100644
index 000000000..247847284
--- /dev/null
+++ b/spacy/tests/lang/sk/test_tokenizer.py
@@ -0,0 +1,15 @@
+import pytest
+
+SK_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Kedy sa narodil Andrej Kiska?",
+ ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS)
+def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens):
+ tokens = sk_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/sl/__init__.py b/spacy/tests/lang/sl/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py
new file mode 100644
index 000000000..ddc5b6b5d
--- /dev/null
+++ b/spacy/tests/lang/sl/test_text.py
@@ -0,0 +1,27 @@
+import pytest
+
+
+def test_long_text(sl_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+upoštevajoč, da si ta deklaracija prizadeva zagotoviti splošno in
+učinkovito priznavanje in spoštovanje v njej razglašenih pravic,
+upoštevajoč, da je cilj Sveta Evrope doseči večjo enotnost med
+njegovimi članicami, in da je eden izmed načinov za zagotavljanje
+tega cilja varstvo in nadaljnji razvoj človekovih pravic in temeljnih
+svoboščin,
+ponovno potrjujoč svojo globoko vero v temeljne svoboščine, na
+katerih temeljita pravičnost in mir v svetu, in ki jih je mogoče najbolje
+zavarovati na eni strani z dejansko politično demokracijo in na drugi
+strani s skupnim razumevanjem in spoštovanjem človekovih pravic,
+od katerih so te svoboščine odvisne,
+"""
+ tokens = sl_tokenizer(text)
+ assert len(tokens) == 116
+
+
+@pytest.mark.xfail
+def test_ordinal_number(sl_tokenizer):
+ text = "10. decembra 1948"
+ tokens = sl_tokenizer(text)
+ assert len(tokens) == 3
diff --git a/spacy/tests/lang/sl/test_tokenizer.py b/spacy/tests/lang/sl/test_tokenizer.py
new file mode 100644
index 000000000..f2b15b0ff
--- /dev/null
+++ b/spacy/tests/lang/sl/test_tokenizer.py
@@ -0,0 +1,32 @@
+import pytest
+
+SL_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Vsakdo ima pravico do spoštovanja njegovega zasebnega in "
+ "družinskega življenja, doma in dopisovanja.",
+ [
+ "Vsakdo",
+ "ima",
+ "pravico",
+ "do",
+ "spoštovanja",
+ "njegovega",
+ "zasebnega",
+ "in",
+ "družinskega",
+ "življenja",
+ ",",
+ "doma",
+ "in",
+ "dopisovanja",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", SL_BASIC_TOKENIZATION_TESTS)
+def test_sl_tokenizer_basic(sl_tokenizer, text, expected_tokens):
+ tokens = sl_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/sq/__init__.py b/spacy/tests/lang/sq/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py
new file mode 100644
index 000000000..44eedaa54
--- /dev/null
+++ b/spacy/tests/lang/sq/test_text.py
@@ -0,0 +1,25 @@
+import pytest
+
+
+def test_long_text(sq_tokenizer):
+ # Excerpt: European Convention on Human Rights
+ text = """
+Qeveritë nënshkruese, anëtare të Këshillit të Evropës,
+Duke pasur parasysh Deklaratën Universale të të Drejtave të
+Njeriut, të shpallur nga Asambleja e Përgjithshme e Kombeve të
+Bashkuara më 10 dhjetor 1948;
+Duke pasur parasysh, se kjo Deklaratë ka për qëllim të sigurojë
+njohjen dhe zbatimin universal dhe efektiv të të drejtave të
+shpallura në të;
+Duke pasur parasysh se qëllimi i Këshillit të Evropës është që të
+realizojë një bashkim më të ngushtë midis anëtarëve të tij dhe
+se një nga mjetet për të arritur këtë qëllim është mbrojtja dhe
+zhvillimi i të drejtave të njeriut dhe i lirive themelore;
+Duke ripohuar besimin e tyre të thellë në këto liri themelore që
+përbëjnë themelet e drejtësisë dhe të paqes në botë, ruajtja e të
+cilave mbështetet kryesisht mbi një regjim politik demokratik nga
+njëra anë, dhe nga ana tjetër mbi një kuptim dhe respektim të
+përbashkët të të drejtave të njeriut nga të cilat varen;
+"""
+ tokens = sq_tokenizer(text)
+ assert len(tokens) == 182
diff --git a/spacy/tests/lang/sq/test_tokenizer.py b/spacy/tests/lang/sq/test_tokenizer.py
new file mode 100644
index 000000000..8fd25f588
--- /dev/null
+++ b/spacy/tests/lang/sq/test_tokenizer.py
@@ -0,0 +1,31 @@
+import pytest
+
+SQ_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Askush nuk mund t’i nënshtrohet torturës ose dënimeve ose "
+ "trajtimeve çnjerëzore ose poshtëruese.",
+ [
+ "Askush",
+ "nuk",
+ "mund",
+ "t’i",
+ "nënshtrohet",
+ "torturës",
+ "ose",
+ "dënimeve",
+ "ose",
+ "trajtimeve",
+ "çnjerëzore",
+ "ose",
+ "poshtëruese",
+ ".",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", SQ_BASIC_TOKENIZATION_TESTS)
+def test_sq_tokenizer_basic(sq_tokenizer, text, expected_tokens):
+ tokens = sq_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py
index e6cae4d2b..b49a0c832 100644
--- a/spacy/tests/lang/sv/test_exceptions.py
+++ b/spacy/tests/lang/sv/test_exceptions.py
@@ -1,6 +1,5 @@
import pytest
-
SV_TOKEN_EXCEPTION_TESTS = [
(
"Smörsåsen används bl.a. till fisk",
@@ -17,6 +16,26 @@ SV_TOKEN_EXCEPTION_TESTS = [
]
+@pytest.mark.issue(805)
+@pytest.mark.parametrize(
+ "text,expected_tokens",
+ [
+ (
+ "Smörsåsen används bl.a. till fisk",
+ ["Smörsåsen", "används", "bl.a.", "till", "fisk"],
+ ),
+ (
+ "Jag kommer först kl. 13 p.g.a. diverse förseningar",
+ ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
+ ),
+ ],
+)
+def test_issue805(sv_tokenizer, text, expected_tokens):
+ tokens = sv_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
+
+
@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
tokens = sv_tokenizer(text)
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index b39109455..1c27c1744 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -1,6 +1,16 @@
import pytest
-from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
-from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
+from spacy.attrs import ENT_IOB
+
+from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
+from spacy.lang.en.stop_words import STOP_WORDS
+from spacy.lang.lex_attrs import is_ascii, is_currency, is_punct, is_stop
+from spacy.lang.lex_attrs import like_url, word_shape
+
+
+@pytest.mark.parametrize("word", ["the"])
+@pytest.mark.issue(1889)
+def test_issue1889(word):
+ assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
@pytest.mark.parametrize("text", ["dog"])
@@ -24,6 +34,38 @@ def test_attrs_do_deprecated(text):
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
+def test_attrs_ent_iob_intify():
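+ # ENT_IOB strings are intified as: "" -> 0 (unset), "I" -> 1, "O" -> 2, "B" -> 3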
+ int_attrs = intify_attrs({"ENT_IOB": ""})
+ assert int_attrs == {ENT_IOB: 0}
+
+ int_attrs = intify_attrs({"ENT_IOB": "I"})
+ assert int_attrs == {ENT_IOB: 1}
+
+ int_attrs = intify_attrs({"ENT_IOB": "O"})
+ assert int_attrs == {ENT_IOB: 2}
+
+ int_attrs = intify_attrs({"ENT_IOB": "B"})
+ assert int_attrs == {ENT_IOB: 3}
+
+ int_attrs = intify_attrs({ENT_IOB: ""})
+ assert int_attrs == {ENT_IOB: 0}
+
+ int_attrs = intify_attrs({ENT_IOB: "I"})
+ assert int_attrs == {ENT_IOB: 1}
+
+ int_attrs = intify_attrs({ENT_IOB: "O"})
+ assert int_attrs == {ENT_IOB: 2}
+
+ int_attrs = intify_attrs({ENT_IOB: "B"})
+ assert int_attrs == {ENT_IOB: 3}
+
+ with pytest.raises(ValueError):
+ int_attrs = intify_attrs({"ENT_IOB": "XX"})
+
+ with pytest.raises(ValueError):
+ int_attrs = intify_attrs({ENT_IOB: "XX"})
+
+
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match
@@ -58,9 +100,10 @@ def test_lex_attrs_is_currency(text, match):
("www.google.com", True),
("google.com", True),
("sydney.com", True),
- ("2girls1cup.org", True),
+ ("1abc2def.org", True),
("http://stupid", True),
("www.hi", True),
+ ("example.com/example", True),
("dog", False),
("1.2", False),
("1.a", False),
diff --git a/spacy/tests/lang/th/test_serialize.py b/spacy/tests/lang/th/test_serialize.py
new file mode 100644
index 000000000..a3de4bf54
--- /dev/null
+++ b/spacy/tests/lang/th/test_serialize.py
@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.th import Thai
+from ...util import make_tempdir
+
+
+def test_th_tokenizer_serialize(th_tokenizer):
+ tokenizer_bytes = th_tokenizer.to_bytes()
+ nlp = Thai()
+ nlp.tokenizer.from_bytes(tokenizer_bytes)
+ assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+ with make_tempdir() as d:
+ file_path = d / "tokenizer"
+ th_tokenizer.to_disk(file_path)
+ nlp = Thai()
+ nlp.tokenizer.from_disk(file_path)
+ assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_th_tokenizer_pickle(th_tokenizer):
+ b = pickle.dumps(th_tokenizer)
+ th_tokenizer_re = pickle.loads(b)
+ assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/ti/test_text.py b/spacy/tests/lang/ti/test_text.py
index 177a9e4b2..d21005640 100644
--- a/spacy/tests/lang/ti/test_text.py
+++ b/spacy/tests/lang/ti/test_text.py
@@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length):
("10.000", True),
("1000", True),
("999,0", True),
- ("ሐደ", True),
+ ("ሓደ", True),
("ክልተ", True),
("ትሪልዮን", True),
("ከልቢ", False),
diff --git a/spacy/tests/lang/tl/__init__.py b/spacy/tests/lang/tl/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/tl/test_indices.py b/spacy/tests/lang/tl/test_indices.py
new file mode 100644
index 000000000..7c99ae573
--- /dev/null
+++ b/spacy/tests/lang/tl/test_indices.py
@@ -0,0 +1,8 @@
+def test_tl_simple_punct(tl_tokenizer):
+ text = "Sige, punta ka dito"
+ tokens = tl_tokenizer(text)
+ assert tokens[0].idx == 0
+ assert tokens[1].idx == 4
+ assert tokens[2].idx == 6
+ assert tokens[3].idx == 12
+ assert tokens[4].idx == 15
diff --git a/spacy/tests/lang/tl/test_punct.py b/spacy/tests/lang/tl/test_punct.py
new file mode 100644
index 000000000..d6bcf297d
--- /dev/null
+++ b/spacy/tests/lang/tl/test_punct.py
@@ -0,0 +1,127 @@
+import pytest
+from spacy.util import compile_prefix_regex
+from spacy.lang.punctuation import TOKENIZER_PREFIXES
+
+
+PUNCT_OPEN = ["(", "[", "{", "*"]
+PUNCT_CLOSE = [")", "]", "}", "*"]
+PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
+
+
+@pytest.mark.parametrize("text", ["(", "((", "<"])
+def test_tl_tokenizer_handles_only_punct(tl_tokenizer, text):
+ tokens = tl_tokenizer(text)
+ assert len(tokens) == len(text)
+
+
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_split_open_punct(tl_tokenizer, punct, text):
+ tokens = tl_tokenizer(punct + text)
+ assert len(tokens) == 2
+ assert tokens[0].text == punct
+ assert tokens[1].text == text
+
+
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_splits_close_punct(tl_tokenizer, punct, text):
+ tokens = tl_tokenizer(text + punct)
+ assert len(tokens) == 2
+ assert tokens[0].text == text
+ assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("punct_add", ["`"])
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_splits_two_diff_open_punct(tl_tokenizer, punct, punct_add, text):
+ tokens = tl_tokenizer(punct + punct_add + text)
+ assert len(tokens) == 3
+ assert tokens[0].text == punct
+ assert tokens[1].text == punct_add
+ assert tokens[2].text == text
+
+
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("punct_add", ["`"])
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_splits_two_diff_close_punct(tl_tokenizer, punct, punct_add, text):
+ tokens = tl_tokenizer(text + punct + punct_add)
+ assert len(tokens) == 3
+ assert tokens[0].text == text
+ assert tokens[1].text == punct
+ assert tokens[2].text == punct_add
+
+
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_splits_same_open_punct(tl_tokenizer, punct, text):
+ tokens = tl_tokenizer(punct + punct + punct + text)
+ assert len(tokens) == 4
+ assert tokens[0].text == punct
+ assert tokens[3].text == text
+
+
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_splits_same_close_punct(tl_tokenizer, punct, text):
+ tokens = tl_tokenizer(text + punct + punct + punct)
+ assert len(tokens) == 4
+ assert tokens[0].text == text
+ assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize("text", ["'Ang"])
+def test_tl_tokenizer_splits_open_apostrophe(tl_tokenizer, text):
+ tokens = tl_tokenizer(text)
+ assert len(tokens) == 2
+ assert tokens[0].text == "'"
+
+
+@pytest.mark.parametrize("text", ["Mabuhay''"])
+def test_tl_tokenizer_splits_double_end_quote(tl_tokenizer, text):
+ tokens = tl_tokenizer(text)
+ assert len(tokens) == 2
+ tokens_punct = tl_tokenizer("''")
+ assert len(tokens_punct) == 1
+
+
+@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_splits_open_close_punct(
+ tl_tokenizer, punct_open, punct_close, text
+):
+ tokens = tl_tokenizer(punct_open + text + punct_close)
+ assert len(tokens) == 3
+ assert tokens[0].text == punct_open
+ assert tokens[1].text == text
+ assert tokens[2].text == punct_close
+
+
+@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
+@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
+@pytest.mark.parametrize("text", ["Mabuhay"])
+def test_tl_tokenizer_two_diff_punct(
+ tl_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
+):
+ tokens = tl_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
+ assert len(tokens) == 5
+ assert tokens[0].text == punct_open2
+ assert tokens[1].text == punct_open
+ assert tokens[2].text == text
+ assert tokens[3].text == punct_close
+ assert tokens[4].text == punct_close2
+
+
+@pytest.mark.parametrize("text,punct", [("(sa'yo", "(")])
+def test_tl_tokenizer_splits_pre_punct_regex(text, punct):
+ tl_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
+ match = tl_search_prefixes(text)
+ assert match.group() == punct
+
+
+def test_tl_tokenizer_splits_bracket_period(tl_tokenizer):
+ text = "(Dumating siya kahapon)."
+ tokens = tl_tokenizer(text)
+ assert tokens[len(tokens) - 1].text == "."
diff --git a/spacy/tests/lang/tl/test_text.py b/spacy/tests/lang/tl/test_text.py
new file mode 100644
index 000000000..17429617c
--- /dev/null
+++ b/spacy/tests/lang/tl/test_text.py
@@ -0,0 +1,73 @@
+import pytest
+from spacy.lang.tl.lex_attrs import like_num
+
+# https://github.com/explosion/spaCy/blob/master/spacy/tests/lang/en/test_text.py
+
+
+def test_tl_tokenizer_handles_long_text(tl_tokenizer):
+ # Excerpt: "Sapagkat ang Pilosopiya ay Ginagawa" by Padre Roque Ferriols
+ text = """
+ Tingin tayo nang tingin. Kailangan lamang nating dumilat at
+ marami tayong makikita. At ang pagtingin ay isang gawain na ako lamang ang
+ makagagawa, kung ako nga ang makakita. Kahit na napanood na ng aking
+ matalik na kaibigan ang isang sine, kailangan ko pa ring panoorin, kung
+ ako nga ang may gustong makakita. Kahit na gaano kadikit ang aming
+ pagkabuklod, hindi siya maaaring tumingin sa isang paraan na ako ang
+ nakakakita. Kung ako ang makakita, ako lamang ang makatitingin.
+ """
+ tokens = tl_tokenizer(text)
+ assert len(tokens) == 97
+
+
+@pytest.mark.parametrize(
+ "text,length",
+ [
+ ("Huwag mo nang itanong sa akin.", 7),
+ ("Nasubukan mo na bang hulihin ang hangin?", 8),
+ ("Hindi ba?", 3),
+ ("Nagbukas ang DFA ng 1,000 appointment slots para sa pasaporte.", 11),
+ ("'Wala raw pasok bukas kasi may bagyo!' sabi ni Micah.", 14),
+ ("'Ingat,' aniya. 'Maingay sila pag malayo at tahimik kung malapit.'", 17),
+ ],
+)
+def test_tl_tokenizer_handles_cnts(tl_tokenizer, text, length):
+ tokens = tl_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize(
+ "text,match",
+ [
+ ("10", True),
+ ("isa", True),
+ ("dalawa", True),
+ ("tatlumpu", True),
+ pytest.param(
+ "isang daan",
+ True,
+ marks=pytest.mark.xfail(reason="Not yet implemented (means 100)"),
+ ),
+ pytest.param(
+ "kalahati",
+ True,
+ marks=pytest.mark.xfail(reason="Not yet implemented (means 1/2)"),
+ ),
+ pytest.param(
+ "isa't kalahati",
+ True,
+ marks=pytest.mark.xfail(
+ reason="Not yet implemented (means one-and-a-half)"
+ ),
+ ),
+ ],
+)
+def test_lex_attrs_like_number(tl_tokenizer, text, match):
+ tokens = tl_tokenizer(text)
+ assert all([token.like_num for token in tokens]) == match
+
+
+@pytest.mark.xfail(reason="Not yet implemented, fails when capitalized.")
+@pytest.mark.parametrize("word", ["isa", "dalawa", "tatlo"])
+def test_tl_lex_attrs_capitals(word):
+ assert like_num(word)
+ assert like_num(word.upper())
diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py
index ed4652df7..55dab799c 100644
--- a/spacy/tests/lang/vi/test_serialize.py
+++ b/spacy/tests/lang/vi/test_serialize.py
@@ -1,3 +1,5 @@
+import pickle
+
from spacy.lang.vi import Vietnamese
from ...util import make_tempdir
@@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi is False
+
+
+def test_vi_tokenizer_pickle(vi_tokenizer):
+ b = pickle.dumps(vi_tokenizer)
+ vi_tokenizer_re = pickle.loads(b)
+ assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/xx/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py
new file mode 100644
index 000000000..477f0ebe2
--- /dev/null
+++ b/spacy/tests/lang/xx/test_text.py
@@ -0,0 +1,24 @@
+import pytest
+
+
+def test_long_text(xx_tokenizer):
+ # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
+ text = """
+Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
+Alggmeer kriteeʹr vuâđđâʹvve meeraikõskksaž tuâjjorganisaatio, ILO, suåppmõʹšše nââmar 169.
+Suåppmõõžž mieʹldd jiõččvälddsaž jânnmin jälsteei meeraid ââʹnet alggmeeran,
+ko sij puõlvvâʹvve naroodâst, kååʹtt jânnam välddmõõžž leʹbe aazztummuž leʹbe ânnʼjõž riikkraaʹji šõddâm ääiʹj jälste
+jânnmest leʹbe tõn mäddtiõđlaž vuuʹdest, koozz jânnam kooll. Alggmeer ij leäkku mieʹrreei sââʹjest jiiʹjjes jälstemvuuʹdest.
+Alggmeer âlgg jiõčč ââʹnned jiiʹjjes alggmeeran leʹbe leeʹd tõn miõlâst, što sij lie alggmeer.
+Alggmeer lij õlggâm seeilted vuõiggâdvuõđlaž sââʹjest huõlǩâni obbnes leʹbe vueʹzzi jiiʹjjes sosiaalʼlaž, täälʼlaž,
+kulttuurlaž da poliittlaž instituutioid.
+
+Säʹmmlai statuuzz ǩeeʹrjteš Lääʹddjânnam vuâđđläkka eeʹjj 1995. Säʹmmlain alggmeeran lij vuõiggâdvuõtt tuõʹllʼjed da
+ooudâsviikkâd ǩiõlâz da kulttuurâz di tõõzz kuulli ääʹrbvuâlaž jieʹllemvueʹjjeez. Sääʹmǩiõl ââʹnnmest veʹrǧǧniiʹǩǩi
+åʹrnn lij šiõttuum jiiʹjjes lääʹǩǩ. Säʹmmlain lij leämmaž eeʹjjest 1996 vueʹljeeʹl dommvuuʹdsteez ǩiõlâz da kulttuurâz kuõskki
+vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuulli tuâjaid håidd säʹmmlai vaalin vaʹlljääm parlameʹntt,
+Sääʹmteʹǧǧ.
+"""
+
+ tokens = xx_tokenizer(text)
+ assert len(tokens) == 179
diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/xx/test_tokenizer.py
new file mode 100644
index 000000000..15c760a6b
--- /dev/null
+++ b/spacy/tests/lang/xx/test_tokenizer.py
@@ -0,0 +1,25 @@
+import pytest
+
+XX_BASIC_TOKENIZATION_TESTS = [
+ (
+ "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
+ [
+ "Lääʹddjânnmest",
+ "lie",
+ "nuʹtt",
+ "10",
+ "000",
+ "säʹmmliʹžžed",
+ ".",
+ "Seeʹst",
+ "pâʹjjel",
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
+def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
+ tokens = xx_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index 0e1eae588..1728c82af 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -368,3 +368,89 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
assert doc_match[0] == span_match[0]
for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
assert doc_t_i == span_t_i + offset
+
+
+@pytest.mark.issue(9263)
+def test_dependency_matcher_order_issue(en_tokenizer):
+ # issue from #9263
+ doc = en_tokenizer("I like text")
+ doc[2].head = doc[1]
+
+ # this matches on attrs but not rel op
+ pattern1 = [
+ {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+ {
+ "LEFT_ID": "root",
+ "RIGHT_ID": "r",
+ "RIGHT_ATTRS": {"ORTH": "text"},
+ "REL_OP": "<",
+ },
+ ]
+
+ # this matches on rel op but not attrs
+ pattern2 = [
+ {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+ {
+ "LEFT_ID": "root",
+ "RIGHT_ID": "r",
+ "RIGHT_ATTRS": {"ORTH": "fish"},
+ "REL_OP": ">",
+ },
+ ]
+
+ matcher = DependencyMatcher(en_tokenizer.vocab)
+
+ # Adding both patterns under a single label should produce no matches,
+ # the same behaviour as adding them one at a time below
+ matcher.add("check", [pattern1, pattern2])
+ matches = matcher(doc)
+
+ assert matches == []
+
+ # use a new matcher
+ matcher = DependencyMatcher(en_tokenizer.vocab)
+ # adding the patterns one at a time under the same label should also produce no matches
+ matcher.add("check", [pattern1])
+ matcher.add("check", [pattern2])
+ matches = matcher(doc)
+
+ assert matches == []
+
+
+@pytest.mark.issue(9263)
+def test_dependency_matcher_remove(en_tokenizer):
+ # issue from #9263
+ doc = en_tokenizer("The red book")
+ doc[1].head = doc[2]
+
+ # this matches
+ pattern1 = [
+ {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
+ {
+ "LEFT_ID": "root",
+ "RIGHT_ID": "r",
+ "RIGHT_ATTRS": {"ORTH": "red"},
+ "REL_OP": ">",
+ },
+ ]
+
+ # add and then remove it
+ matcher = DependencyMatcher(en_tokenizer.vocab)
+ matcher.add("check", [pattern1])
+ matcher.remove("check")
+
+ # this matches on rel op but not attrs
+ pattern2 = [
+ {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
+ {
+ "LEFT_ID": "root",
+ "RIGHT_ID": "r",
+ "RIGHT_ATTRS": {"ORTH": "blue"},
+ "REL_OP": ">",
+ },
+ ]
+
+ # Add the new pattern under the same label; it should not match
+ matcher.add("check", [pattern2])
+ matches = matcher(doc)
+
+ assert matches == []
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index e0f655bbe..a27baf130 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab):
doc[0].tag_ = "A"
assert len(matcher(doc)) == 0
+ # IS_SUBSET with a list value
+ Token.set_extension("ext", default=[])
+ matcher = Matcher(en_vocab)
+ pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0]._.ext = ["A"]
+ doc[1]._.ext = ["C", "D"]
+ assert len(matcher(doc)) == 2
+
def test_matcher_superset_value_operator(en_vocab):
matcher = Matcher(en_vocab)
@@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab):
doc[0].tag_ = "A"
assert len(matcher(doc)) == 3
+ # IS_SUPERSET with a list value
+ Token.set_extension("ext", default=[])
+ matcher = Matcher(en_vocab)
+ pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0]._.ext = ["A", "B"]
+ assert len(matcher(doc)) == 1
+
+
+def test_matcher_intersect_value_operator(en_vocab):
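+ # INTERSECTS matches when the token's value shares at least one element with the pattern list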
+ matcher = Matcher(en_vocab)
+ pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ assert len(matcher(doc)) == 0
+ doc[0].set_morph("Feat=Val")
+ assert len(matcher(doc)) == 1
+ doc[0].set_morph("Feat=Val|Feat2=Val2")
+ assert len(matcher(doc)) == 1
+ doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
+ assert len(matcher(doc)) == 1
+ doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
+ assert len(matcher(doc)) == 1
+
+ # INTERSECTS with a single value is the same as IN
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 1
+
+ # INTERSECTS with an empty pattern list matches nothing
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"INTERSECTS": []}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 0
+
+ # INTERSECTS with a list value
+ Token.set_extension("ext", default=[])
+ matcher = Matcher(en_vocab)
+ pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0]._.ext = ["A", "B"]
+ assert len(matcher(doc)) == 1
+
+ # INTERSECTS with an empty pattern list also matches nothing for list-valued extensions
+ matcher = Matcher(en_vocab)
+ pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0]._.ext = ["A", "B"]
+ assert len(matcher(doc)) == 0
+
+ # INTERSECTS with an empty value matches nothing
+ matcher = Matcher(en_vocab)
+ pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0]._.ext = []
+ assert len(matcher(doc)) == 0
+
def test_matcher_morph_handling(en_vocab):
# order of features in pattern doesn't matter
@@ -500,6 +576,16 @@ def test_matcher_callback(en_vocab):
mock.assert_called_once_with(matcher, doc, 0, matches)
+def test_matcher_callback_with_alignments(en_vocab):
+ mock = Mock()
+ matcher = Matcher(en_vocab)
+ pattern = [{"ORTH": "test"}]
+ matcher.add("Rule", [pattern], on_match=mock)
+ doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
+ matches = matcher(doc, with_alignments=True)
+ mock.assert_called_once_with(matcher, doc, 0, matches)
+
+
def test_matcher_span(matcher):
text = "JavaScript is good but Java is better"
doc = Doc(matcher.vocab, words=text.split())
@@ -556,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
assert len(matcher(doc)) == 0
+
+
+def test_matcher_ent_iob_key(en_vocab):
+ """Test that patterns with ent_iob works correctly."""
+ matcher = Matcher(en_vocab)
+ matcher.add("Rule", [[{"ENT_IOB": "I"}]])
+ doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
+ doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
+ doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
+ doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
+ matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
+ matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
+ assert len(matches1) == 1
+ assert matches1[0] == "York"
+ assert len(matches2) == 0
+
+ matcher = Matcher(en_vocab) # Test iob pattern with operators
+ matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
+ doc = Doc(
+ en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
+ )
+ doc.ents = [Span(doc, 4, 7, label="PERSON")]
+ matches = [doc[start:end].text for _, start, end in matcher(doc)]
+ assert len(matches) == 3
+ assert matches[0] == "Maria"
+ assert matches[1] == "Maria Esperanza"
+ assert matches[2] == "Esperanza"
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index dcbe1ff33..3649b07ed 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -1,10 +1,14 @@
-import pytest
import re
-from spacy.lang.en import English
-from spacy.matcher import Matcher
-from spacy.tokens import Doc, Span
+import pytest
+from spacy.attrs import IS_PUNCT, LOWER, ORTH
+from spacy.errors import MatchPatternError
+from spacy.lang.en import English
+from spacy.lang.lex_attrs import LEX_ATTRS
+from spacy.matcher import Matcher
+from spacy.tokens import Doc, Span, Token
+from spacy.vocab import Vocab
pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]
pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}]
@@ -36,6 +40,473 @@ def doc(en_tokenizer, text):
return doc
+@pytest.mark.issue(118)
+@pytest.mark.parametrize(
+ "patterns",
+ [
+ [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
+ [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]],
+ ],
+)
+def test_issue118(en_tokenizer, patterns):
+ """Test a bug that arose from having overlapping matches"""
+ text = (
+ "how many points did lebron james score against the boston celtics last night"
+ )
+ doc = en_tokenizer(text)
+ ORG = doc.vocab.strings["ORG"]
+ matcher = Matcher(doc.vocab)
+ matcher.add("BostonCeltics", patterns)
+ assert len(list(doc.ents)) == 0
+ matches = [(ORG, start, end) for _, start, end in matcher(doc)]
+ assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
+ doc.ents = matches[:1]
+ ents = list(doc.ents)
+ assert len(ents) == 1
+ assert ents[0].label == ORG
+ assert ents[0].start == 9
+ assert ents[0].end == 11
+
+
+@pytest.mark.issue(118)
+@pytest.mark.parametrize(
+ "patterns",
+ [
+ [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
+ [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]],
+ ],
+)
+def test_issue118_prefix_reorder(en_tokenizer, patterns):
+ """Test a bug that arose from having overlapping matches"""
+ text = (
+ "how many points did lebron james score against the boston celtics last night"
+ )
+ doc = en_tokenizer(text)
+ ORG = doc.vocab.strings["ORG"]
+ matcher = Matcher(doc.vocab)
+ matcher.add("BostonCeltics", patterns)
+ assert len(list(doc.ents)) == 0
+ matches = [(ORG, start, end) for _, start, end in matcher(doc)]
+ doc.ents += tuple(matches)[1:]
+ assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
+ ents = doc.ents
+ assert len(ents) == 1
+ assert ents[0].label == ORG
+ assert ents[0].start == 9
+ assert ents[0].end == 11
+
+
+@pytest.mark.issue(242)
+def test_issue242(en_tokenizer):
+ """Test overlapping multi-word phrases."""
+ text = "There are different food safety standards in different countries."
+ patterns = [
+ [{"LOWER": "food"}, {"LOWER": "safety"}],
+ [{"LOWER": "safety"}, {"LOWER": "standards"}],
+ ]
+ doc = en_tokenizer(text)
+ matcher = Matcher(doc.vocab)
+ matcher.add("FOOD", patterns)
+ matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
+ match1, match2 = matches
+ assert match1[1] == 3
+ assert match1[2] == 5
+ assert match2[1] == 4
+ assert match2[2] == 6
+ with pytest.raises(ValueError):
+ # One token can only be part of one entity, so test that the matches
+ # can't be added as entities
+ doc.ents += tuple(matches)
+
+
+@pytest.mark.issue(587)
+def test_issue587(en_tokenizer):
+ """Test that Matcher doesn't segfault on particular input"""
+ doc = en_tokenizer("a b; c")
+ matcher = Matcher(doc.vocab)
+ matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]])
+ matches = matcher(doc)
+ assert len(matches) == 1
+ matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]])
+ matches = matcher(doc)
+ assert len(matches) == 2
+ matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]])
+ matches = matcher(doc)
+ assert len(matches) == 2
+
+
+@pytest.mark.issue(588)
+def test_issue588(en_vocab):
+ """Test if empty specs still cause an error when adding patterns"""
+ matcher = Matcher(en_vocab)
+ with pytest.raises(ValueError):
+ matcher.add("TEST", [[]])
+
+
+@pytest.mark.issue(590)
+def test_issue590(en_vocab):
+ """Test overlapping matches"""
+ doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
+ matcher = Matcher(en_vocab)
+ matcher.add(
+ "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]]
+ )
+ matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]])
+ matches = matcher(doc)
+ assert len(matches) == 2
+
+
+@pytest.mark.issue(615)
+def test_issue615(en_tokenizer):
+ def merge_phrases(matcher, doc, i, matches):
+ """Merge a phrase. We have to be careful here because we'll change the
+ token indices. To avoid problems, merge all the phrases once we're called
+ on the last match."""
+ if i != len(matches) - 1:
+ return None
+ spans = [Span(doc, start, end, label=label) for label, start, end in matches]
+ with doc.retokenize() as retokenizer:
+ for span in spans:
+ tag = "NNP" if span.label_ else span.root.tag_
+ attrs = {"tag": tag, "lemma": span.text}
+ retokenizer.merge(span, attrs=attrs)
+ doc.ents = doc.ents + (span,)
+
+ text = "The golf club is broken"
+ pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
+ label = "Sport_Equipment"
+ doc = en_tokenizer(text)
+ matcher = Matcher(doc.vocab)
+ matcher.add(label, [pattern], on_match=merge_phrases)
+ matcher(doc)
+ entities = list(doc.ents)
+ assert entities != []
+ assert entities[0].label != 0
+
+
+@pytest.mark.issue(850)
+def test_issue850():
+ """The variable-length pattern matches the succeeding token. Check we
+ handle the ambiguity correctly."""
+ vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
+ matcher = Matcher(vocab)
+ pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
+ matcher.add("FarAway", [pattern])
+ doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
+ match = matcher(doc)
+ assert len(match) == 1
+ ent_id, start, end = match[0]
+ assert start == 0
+ assert end == 4
+
+
+@pytest.mark.issue(850)
+def test_issue850_basic():
+ """Test Matcher matches with '*' operator and Boolean flag"""
+ vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
+ matcher = Matcher(vocab)
+ pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
+ matcher.add("FarAway", [pattern])
+ doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
+ match = matcher(doc)
+ assert len(match) == 1
+ ent_id, start, end = match[0]
+ assert start == 0
+ assert end == 4
+
+
+@pytest.mark.issue(1434)
+def test_issue1434():
+ """Test matches occur when optional element at end of short doc."""
+ pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
+ vocab = Vocab(lex_attr_getters=LEX_ATTRS)
+ hello_world = Doc(vocab, words=["Hello", "World"])
+ hello = Doc(vocab, words=["Hello"])
+ matcher = Matcher(vocab)
+ matcher.add("MyMatcher", [pattern])
+ matches = matcher(hello_world)
+ assert matches
+ matches = matcher(hello)
+ assert matches
+
+
+@pytest.mark.parametrize(
+ "string,start,end",
+ [
+ ("a", 0, 1),
+ ("a b", 0, 2),
+ ("a c", 0, 1),
+ ("a b c", 0, 2),
+ ("a b b c", 0, 3),
+ ("a b b", 0, 3),
+ ],
+)
+@pytest.mark.issue(1450)
+def test_issue1450(string, start, end):
+ """Test matcher works when patterns end with * operator."""
+ pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
+ matcher = Matcher(Vocab())
+ matcher.add("TSTEND", [pattern])
+ doc = Doc(Vocab(), words=string.split())
+ matches = matcher(doc)
+ if start is None or end is None:
+ assert matches == []
+ assert matches[-1][1] == start
+ assert matches[-1][2] == end
+
+
+@pytest.mark.issue(1945)
+def test_issue1945():
+ """Test regression in Matcher introduced in v2.0.6."""
+ matcher = Matcher(Vocab())
+ matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]])
+ doc = Doc(matcher.vocab, words=["a", "a", "a"])
+ matches = matcher(doc) # we should see two overlapping matches here
+ assert len(matches) == 2
+ assert matches[0][1:] == (0, 2)
+ assert matches[1][1:] == (1, 3)
+
+
+@pytest.mark.issue(1971)
+def test_issue1971(en_vocab):
+ # Possibly related to #2675 and #2671?
+ matcher = Matcher(en_vocab)
+ pattern = [
+ {"ORTH": "Doe"},
+ {"ORTH": "!", "OP": "?"},
+ {"_": {"optional": True}, "OP": "?"},
+ {"ORTH": "!", "OP": "?"},
+ ]
+ Token.set_extension("optional", default=False)
+ matcher.add("TEST", [pattern])
+ doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
+ # We could also assert length 1 here, but this is more conclusive, because
+ # the real problem here is that it returns a duplicate match for a match_id
+ # that's not actually in the vocab!
+ matches = matcher(doc)
+ assert all([match_id in en_vocab.strings for match_id, start, end in matches])
+
+
+@pytest.mark.issue(1971)
+def test_issue_1971_2(en_vocab):
+ matcher = Matcher(en_vocab)
+ pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
+ pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}]
+ doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
+ matcher.add("TEST1", [pattern1, pattern2])
+ matches = matcher(doc)
+ assert len(matches) == 2
+
+
+@pytest.mark.issue(1971)
+def test_issue_1971_3(en_vocab):
+ """Test that pattern matches correctly for multiple extension attributes."""
+ Token.set_extension("a", default=1, force=True)
+ Token.set_extension("b", default=2, force=True)
+ doc = Doc(en_vocab, words=["hello", "world"])
+ matcher = Matcher(en_vocab)
+ matcher.add("A", [[{"_": {"a": 1}}]])
+ matcher.add("B", [[{"_": {"b": 2}}]])
+ matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
+ assert len(matches) == 4
+ assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
+
+
+@pytest.mark.issue(1971)
+def test_issue_1971_4(en_vocab):
+ """Test that pattern matches correctly with multiple extension attribute
+ values on a single token.
+ """
+ Token.set_extension("ext_a", default="str_a", force=True)
+ Token.set_extension("ext_b", default="str_b", force=True)
+ matcher = Matcher(en_vocab)
+ doc = Doc(en_vocab, words=["this", "is", "text"])
+ pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
+ matcher.add("TEST", [pattern])
+ matches = matcher(doc)
+ # These assertions previously triggered a segmentation fault
+ assert len(matches) == 1
+ assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
+
+
+@pytest.mark.issue(2464)
+def test_issue2464(en_vocab):
+ """Test problem with successive ?. This is the same bug, so putting it here."""
+ matcher = Matcher(en_vocab)
+ doc = Doc(en_vocab, words=["a", "b"])
+ matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]])
+ matches = matcher(doc)
+ assert len(matches) == 3
+
+
+@pytest.mark.issue(2569)
+def test_issue2569(en_tokenizer):
+ """Test that operator + is greedy."""
+ doc = en_tokenizer("It is May 15, 1993.")
+ doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
+ matcher = Matcher(doc.vocab)
+ matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]])
+ matched = [doc[start:end] for _, start, end in matcher(doc)]
+ matched = sorted(matched, key=len, reverse=True)
+ assert len(matched) == 10
+ assert len(matched[0]) == 4
+ assert matched[0].text == "May 15, 1993"
+
+
+@pytest.mark.issue(2671)
+def test_issue2671():
+ """Ensure the correct entity ID is returned for matches with quantifiers.
+ See also #2675
+ """
+ nlp = English()
+ matcher = Matcher(nlp.vocab)
+ pattern_id = "test_pattern"
+ pattern = [
+ {"LOWER": "high"},
+ {"IS_PUNCT": True, "OP": "?"},
+ {"LOWER": "adrenaline"},
+ ]
+ matcher.add(pattern_id, [pattern])
+ doc1 = nlp("This is a high-adrenaline situation.")
+ doc2 = nlp("This is a high adrenaline situation.")
+ matches1 = matcher(doc1)
+ for match_id, start, end in matches1:
+ assert nlp.vocab.strings[match_id] == pattern_id
+ matches2 = matcher(doc2)
+ for match_id, start, end in matches2:
+ assert nlp.vocab.strings[match_id] == pattern_id
+
+
+@pytest.mark.issue(3009)
+def test_issue3009(en_vocab):
+ """Test problem with matcher quantifiers"""
+ patterns = [
+ [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
+ [
+ {"ORTH": "has"},
+ {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
+ {"LOWER": "to"},
+ {"LOWER": "do"},
+ {"TAG": "IN"},
+ ],
+ [
+ {"ORTH": "has"},
+ {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
+ {"LOWER": "to"},
+ {"LOWER": "do"},
+ {"TAG": "IN"},
+ ],
+ ]
+ words = ["also", "has", "to", "do", "with"]
+ tags = ["RB", "VBZ", "TO", "VB", "IN"]
+ pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
+ doc = Doc(en_vocab, words=words, tags=tags, pos=pos)
+ matcher = Matcher(en_vocab)
+ for i, pattern in enumerate(patterns):
+ matcher.add(str(i), [pattern])
+ matches = matcher(doc)
+ assert matches
+
+
+@pytest.mark.issue(3328)
+def test_issue3328(en_vocab):
+ doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
+ matcher = Matcher(en_vocab)
+ patterns = [
+ [{"LOWER": {"IN": ["hello", "how"]}}],
+ [{"LOWER": {"IN": ["you", "doing"]}}],
+ ]
+ matcher.add("TEST", patterns)
+ matches = matcher(doc)
+ assert len(matches) == 4
+ matched_texts = [doc[start:end].text for _, start, end in matches]
+ assert matched_texts == ["Hello", "how", "you", "doing"]
+
+
+@pytest.mark.issue(3549)
+def test_issue3549(en_vocab):
+ """Test that match pattern validation doesn't raise on empty errors."""
+ matcher = Matcher(en_vocab, validate=True)
+ pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
+ matcher.add("GOOD", [pattern])
+ with pytest.raises(MatchPatternError):
+ matcher.add("BAD", [[{"X": "Y"}]])
+
+
+@pytest.mark.skip("Matching currently only works on strings and integers")
+@pytest.mark.issue(3555)
+def test_issue3555(en_vocab):
+ """Test that custom extensions with default None don't break matcher."""
+ Token.set_extension("issue3555", default=None)
+ matcher = Matcher(en_vocab)
+ pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
+ matcher.add("TEST", [pattern])
+ doc = Doc(en_vocab, words=["have", "apple"])
+ matcher(doc)
+
+
+@pytest.mark.issue(3839)
+def test_issue3839(en_vocab):
+ """Test that match IDs returned by the matcher are correct, are in the string"""
+ doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
+ matcher = Matcher(en_vocab)
+ match_id = "PATTERN"
+ pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
+ pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
+ matcher.add(match_id, [pattern1])
+ matches = matcher(doc)
+ assert matches[0][0] == en_vocab.strings[match_id]
+ matcher = Matcher(en_vocab)
+ matcher.add(match_id, [pattern2])
+ matches = matcher(doc)
+ assert matches[0][0] == en_vocab.strings[match_id]
+
+
+@pytest.mark.issue(3879)
+def test_issue3879(en_vocab):
+ doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
+ assert len(doc) == 5
+ pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
+ matcher = Matcher(en_vocab)
+ matcher.add("TEST", [pattern])
+ assert len(matcher(doc)) == 2 # previously failed because of a false-positive match on 'is a test'
+
+
+@pytest.mark.issue(3951)
+def test_issue3951(en_vocab):
+ """Test that combinations of optional rules are matched correctly."""
+ matcher = Matcher(en_vocab)
+ pattern = [
+ {"LOWER": "hello"},
+ {"LOWER": "this", "OP": "?"},
+ {"OP": "?"},
+ {"LOWER": "world"},
+ ]
+ matcher.add("TEST", [pattern])
+ doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
+ matches = matcher(doc)
+ assert len(matches) == 0
+
+
+@pytest.mark.issue(4120)
+def test_issue4120(en_vocab):
+ """Test that matches without a final {OP: ?} token are returned."""
+ matcher = Matcher(en_vocab)
+ matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
+ doc1 = Doc(en_vocab, words=["a"])
+ assert len(matcher(doc1)) == 1 # works
+ doc2 = Doc(en_vocab, words=["a", "b", "c"])
+ assert len(matcher(doc2)) == 2 # fixed
+ matcher = Matcher(en_vocab)
+ matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
+ doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
+ assert len(matcher(doc3)) == 2 # works
+ matcher = Matcher(en_vocab)
+ matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
+ doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
+ assert len(matcher(doc4)) == 3 # fixed
+
+
@pytest.mark.parametrize(
"pattern,re_pattern",
[
@@ -152,6 +623,7 @@ def test_operator_combos(en_vocab):
assert not matches, (string, pattern_str)
+@pytest.mark.issue(1450)
def test_matcher_end_zero_plus(en_vocab):
"""Test matcher works when patterns end with * operator. (issue 1450)"""
matcher = Matcher(en_vocab)
diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py
index 4d21aea81..8c265785c 100644
--- a/spacy/tests/matcher/test_pattern_validation.py
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -12,6 +12,7 @@ TEST_PATTERNS = [
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+ ([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
@@ -22,6 +23,8 @@ TEST_PATTERNS = [
([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0)
([{"IS_DIGIT": -1}], 1, 0),
([{"ORTH": -1}], 1, 0),
+ ([{"ENT_ID": -1}], 1, 0),
+ ([{"ENT_KB_ID": -1}], 1, 0),
# Good patterns
([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
@@ -33,6 +36,8 @@ TEST_PATTERNS = [
([{"orth": "foo"}], 0, 0), # prev: xfail
([{"IS_SENT_START": True}], 0, 0),
([{"SENT_START": True}], 0, 0),
+ ([{"ENT_ID": "STRING"}], 0, 0),
+ ([{"ENT_KB_ID": "STRING"}], 0, 0),
]
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 478949601..f893d81f8 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -1,8 +1,125 @@
import pytest
import srsly
from mock import Mock
-from spacy.matcher import PhraseMatcher
+
+from spacy.lang.en import English
+from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span
+from spacy.vocab import Vocab
+
+
+from ..util import make_tempdir
+
+
+@pytest.mark.issue(3248)
+def test_issue3248_1():
+ """Test that the PhraseMatcher correctly reports its number of rules, not
+ total number of patterns."""
+ nlp = English()
+ matcher = PhraseMatcher(nlp.vocab)
+ matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
+ matcher.add("TEST2", [nlp("d")])
+ assert len(matcher) == 2
+
+
+@pytest.mark.issue(3331)
+def test_issue3331(en_vocab):
+ """Test that duplicate patterns for different rules result in multiple
+ matches, one per rule.
+ """
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
+ matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
+ doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
+ matches = matcher(doc)
+ assert len(matches) == 2
+ match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
+ assert sorted(match_ids) == ["A", "B"]
+
+
+@pytest.mark.issue(3972)
+def test_issue3972(en_vocab):
+ """Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
+ matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
+ doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
+ matches = matcher(doc)
+
+ assert len(matches) == 2
+
+ # We should have a match for each of the two rules
+ found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
+ assert "A" in found_ids
+ assert "B" in found_ids
+
+
+@pytest.mark.issue(4002)
+def test_issue4002(en_vocab):
+ """Test that the PhraseMatcher can match on overwritten NORM attributes."""
+ matcher = PhraseMatcher(en_vocab, attr="NORM")
+ pattern1 = Doc(en_vocab, words=["c", "d"])
+ assert [t.norm_ for t in pattern1] == ["c", "d"]
+ matcher.add("TEST", [pattern1])
+ doc = Doc(en_vocab, words=["a", "b", "c", "d"])
+ assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
+ matches = matcher(doc)
+ assert len(matches) == 1
+ matcher = PhraseMatcher(en_vocab, attr="NORM")
+ pattern2 = Doc(en_vocab, words=["1", "2"])
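+ # the raw words don't match, but overwriting the NORM values should make the pattern match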
+ pattern2[0].norm_ = "c"
+ pattern2[1].norm_ = "d"
+ assert [t.norm_ for t in pattern2] == ["c", "d"]
+ matcher.add("TEST", [pattern2])
+ matches = matcher(doc)
+ assert len(matches) == 1
+
+
+@pytest.mark.issue(4373)
+def test_issue4373():
+ """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
+ matcher = Matcher(Vocab())
+ assert isinstance(matcher.vocab, Vocab)
+ matcher = PhraseMatcher(Vocab())
+ assert isinstance(matcher.vocab, Vocab)
+
+
+@pytest.mark.issue(4651)
+def test_issue4651_with_phrase_matcher_attr():
+ """Test that the EntityRuler PhraseMatcher is deserialized correctly using
+ the method from_disk when the EntityRuler argument phrase_matcher_attr is
+ specified.
+ """
+ text = "Spacy is a python library for nlp"
+ nlp = English()
+ patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
+ ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+ ruler.add_patterns(patterns)
+ doc = nlp(text)
+ res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
+ nlp_reloaded = English()
+ with make_tempdir() as d:
+ file_path = d / "entityruler"
+ ruler.to_disk(file_path)
+ nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+ doc_reloaded = nlp_reloaded(text)
+ res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
+ assert res == res_reloaded
+
+
+@pytest.mark.issue(6839)
+def test_issue6839(en_vocab):
+ """Ensure that PhraseMatcher accepts Span as input"""
+ # fmt: off
+ words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
+ # fmt: on
+ doc = Doc(en_vocab, words=words)
+ span = doc[:8]
+ pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("SPACY", [pattern])
+ matches = matcher(span)
+ assert matches
def test_matcher_phrase_matcher(en_vocab):
diff --git a/spacy/tests/package/__init__.py b/spacy/tests/package/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 82c39b72c..75908df59 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -11,6 +11,11 @@ def test_build_dependencies():
"mock",
"flake8",
"hypothesis",
+ "pre-commit",
+ "mypy",
+ "types-dataclasses",
+ "types-mock",
+ "types-requests",
]
# ignore language-specific packages that shouldn't be installed by all
libs_ignore_setup = [
@@ -20,6 +25,7 @@ def test_build_dependencies():
"sudachipy",
"sudachidict_core",
"spacy-pkuseg",
+ "thinc-apple-ops",
]
# check requirements.txt
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index cba6fa81e..bb226f9c5 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -40,6 +40,28 @@ def arc_eager(vocab):
return moves
+@pytest.mark.issue(7056)
+def test_issue7056():
+ """Test that the Unshift transition works properly, and doesn't cause
+ sentence segmentation errors."""
+ vocab = Vocab()
+ ae = ArcEager(
+ vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
+ )
+ doc = Doc(vocab, words="Severe pain , after trauma".split())
+ state = ae.init_batch([doc])[0]
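+ # transition names: S = Shift, L-amod = Left-Arc(amod), R-pobj = Right-Arc(pobj), D = Reduce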
+ ae.apply_transition(state, "S")
+ ae.apply_transition(state, "L-amod")
+ ae.apply_transition(state, "S")
+ ae.apply_transition(state, "S")
+ ae.apply_transition(state, "S")
+ ae.apply_transition(state, "R-pobj")
+ ae.apply_transition(state, "D")
+ ae.apply_transition(state, "D")
+ ae.apply_transition(state, "D")
+ assert not state.eol()
+
+
def test_oracle_four_words(arc_eager, vocab):
words = ["a", "b", "c", "d"]
heads = [1, 1, 3, 3]
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index ee9b6bf01..b3b29d1f9 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -1,15 +1,18 @@
+import random
+
import pytest
from numpy.testing import assert_equal
-from spacy.attrs import ENT_IOB
-from spacy import util
+from spacy.attrs import ENT_IOB
+from spacy import util, registry
from spacy.lang.en import English
+from spacy.lang.it import Italian
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown
-from spacy.training import Example
+from spacy.training import Example, iob_to_biluo
from spacy.tokens import Doc, Span
-from spacy.vocab import Vocab, registry
+from spacy.vocab import Vocab
import logging
from ..util import make_tempdir
@@ -58,6 +61,152 @@ def tsys(vocab, entity_types):
return BiluoPushDown(vocab.strings, actions)
+@pytest.mark.parametrize("label", ["U-JOB-NAME"])
+@pytest.mark.issue(1967)
+def test_issue1967(label):
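+ # the entity label supplied only via the training example should be registered in the NER move actions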
+ nlp = Language()
+ config = {}
+ ner = nlp.create_pipe("ner", config=config)
+ example = Example.from_dict(
+ Doc(ner.vocab, words=["word"]),
+ {
+ "ids": [0],
+ "words": ["word"],
+ "tags": ["tag"],
+ "heads": [0],
+ "deps": ["dep"],
+ "entities": [label],
+ },
+ )
+ assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
+
+
+@pytest.mark.issue(2179)
+def test_issue2179():
+ """Test that spurious 'extra_labels' aren't created when initializing NER."""
+ nlp = Italian()
+ ner = nlp.add_pipe("ner")
+ ner.add_label("CITIZENSHIP")
+ nlp.initialize()
+ nlp2 = Italian()
+ nlp2.add_pipe("ner")
+ assert len(nlp2.get_pipe("ner").labels) == 0
+ model = nlp2.get_pipe("ner").model
+ model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
+ nlp2.from_bytes(nlp.to_bytes())
+ assert "extra_labels" not in nlp2.get_pipe("ner").cfg
+ assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
+
+
+@pytest.mark.issue(2385)
+def test_issue2385():
+ """Test that IOB tags are correctly converted to BILUO tags."""
+ # fix bug in labels with a 'b' character
+ tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
+ assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
+ # maintain support for iob1 format
+ tags2 = ("I-ORG", "I-ORG", "B-ORG")
+ assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
+ # maintain support for iob2 format
+ tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
+ assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
+
+
+@pytest.mark.issue(2800)
+def test_issue2800():
+ """Test issue that arises when too many labels are added to NER model.
+ Used to cause segfault.
+ """
+ nlp = English()
+ train_data = []
+ train_data.extend(
+ [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
+ )
+ entity_types = [str(i) for i in range(1000)]
+ ner = nlp.add_pipe("ner")
+ for entity_type in list(entity_types):
+ ner.add_label(entity_type)
+ optimizer = nlp.initialize()
+ for i in range(20):
+ losses = {}
+ random.shuffle(train_data)
+ for example in train_data:
+ nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)
+
+
+@pytest.mark.issue(3209)
+def test_issue3209():
+ """Test issue that occurred in spaCy nightly where NER labels were being
+ mapped to classes incorrectly after loading the model, when the labels
+ were added using ner.add_label().
+ """
+ nlp = English()
+ ner = nlp.add_pipe("ner")
+ ner.add_label("ANIMAL")
+ nlp.initialize()
+ move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
+ assert ner.move_names == move_names
+ nlp2 = English()
+ ner2 = nlp2.add_pipe("ner")
+ model = ner2.model
+ model.attrs["resize_output"](model, ner.moves.n_moves)
+ nlp2.from_bytes(nlp.to_bytes())
+ assert ner2.move_names == move_names
+
+
+@pytest.mark.issue(4267)
+def test_issue4267():
+ """Test that running an entity_ruler after ner gives consistent results"""
+ nlp = English()
+ ner = nlp.add_pipe("ner")
+ ner.add_label("PEOPLE")
+ nlp.initialize()
+ assert "ner" in nlp.pipe_names
+ # assert that we have correct IOB annotations
+ doc1 = nlp("hi")
+ assert doc1.has_annotation("ENT_IOB")
+ for token in doc1:
+ assert token.ent_iob == 2
+ # add entity ruler and run again
+ patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
+ ruler = nlp.add_pipe("entity_ruler")
+ ruler.add_patterns(patterns)
+ assert "entity_ruler" in nlp.pipe_names
+ assert "ner" in nlp.pipe_names
+ # assert that we still have correct IOB annotations
+ doc2 = nlp("hi")
+ assert doc2.has_annotation("ENT_IOB")
+ for token in doc2:
+ assert token.ent_iob == 2
+
+
+@pytest.mark.issue(4313)
+def test_issue4313():
+ """This should not crash or exit with some strange error code"""
+ beam_width = 16
+ beam_density = 0.0001
+ nlp = English()
+ config = {
+ "beam_width": beam_width,
+ "beam_density": beam_density,
+ }
+ ner = nlp.add_pipe("beam_ner", config=config)
+ ner.add_label("SOME_LABEL")
+ nlp.initialize()
+ # add a new label to the doc
+ doc = nlp("What do you think about Apple ?")
+ assert len(ner.labels) == 1
+ assert "SOME_LABEL" in ner.labels
+ apple_ent = Span(doc, 5, 6, label="MY_ORG")
+ doc.ents = list(doc.ents) + [apple_ent]
+
+ # ensure the beam_parse still works with the new label
+ docs = [doc]
+ ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
+ assert len(ner.labels) == 2
+ assert "MY_ORG" in ner.labels
+
+
def test_get_oracle_moves(tsys, doc, entity_annots):
example = Example.from_dict(doc, {"entities": entity_annots})
act_classes = tsys.get_oracle_sequence(example, _debug=False)
@@ -329,8 +478,8 @@ def test_ner_constructor(en_vocab):
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
- ner_1 = EntityRecognizer(en_vocab, model, **config)
- ner_2 = EntityRecognizer(en_vocab, model)
+ EntityRecognizer(en_vocab, model, **config)
+ EntityRecognizer(en_vocab, model)
def test_ner_before_ruler():
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 1b0d9d256..7bbb30d8e 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,15 +1,17 @@
import pytest
from numpy.testing import assert_equal
-from spacy.attrs import DEP
+from thinc.api import Adam
+from spacy import registry, util
+from spacy.attrs import DEP, NORM
from spacy.lang.en import English
-from spacy.training import Example
from spacy.tokens import Doc
-from spacy import util, registry
+from spacy.training import Example
+from spacy.vocab import Vocab
-from ..util import apply_transition_sequence, make_tempdir
from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from ..util import apply_transition_sequence, make_tempdir
TRAIN_DATA = [
(
@@ -59,6 +61,94 @@ PARTIAL_DATA = [
eps = 0.1
+@pytest.fixture
+def vocab():
+ return Vocab(lex_attr_getters={NORM: lambda s: s})
+
+
+@pytest.fixture
+def parser(vocab):
+ vocab.strings.add("ROOT")
+ cfg = {"model": DEFAULT_PARSER_MODEL}
+ model = registry.resolve(cfg, validate=True)["model"]
+ parser = DependencyParser(vocab, model)
+ parser.cfg["token_vector_width"] = 4
+ parser.cfg["hidden_width"] = 32
+ # parser.add_label('right')
+ parser.add_label("left")
+ parser.initialize(lambda: [_parser_example(parser)])
+ sgd = Adam(0.001)
+
+ for i in range(10):
+ losses = {}
+ doc = Doc(vocab, words=["a", "b", "c", "d"])
+ example = Example.from_dict(
+ doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
+ )
+ parser.update([example], sgd=sgd, losses=losses)
+ return parser
+
+
+def _parser_example(parser):
+ doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+ gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
+ return Example.from_dict(doc, gold)
+
+
+@pytest.mark.issue(2772)
+def test_issue2772(en_vocab):
+ """Test that deprojectivization doesn't mess up sentence boundaries."""
+ # fmt: off
+ words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."]
+ # fmt: on
+ # A tree with a non-projective (i.e. crossing) arc
+ # The arcs (0, 4) and (2, 9) cross.
+ heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9]
+ deps = ["dep"] * len(heads)
+ doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+ assert doc[1].is_sent_start is False
+
+
+@pytest.mark.issue(3830)
+def test_issue3830_no_subtok():
+ """Test that the parser doesn't have subtok label if not learn_tokens"""
+ config = {
+ "learn_tokens": False,
+ }
+ model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
+ parser = DependencyParser(Vocab(), model, **config)
+ parser.add_label("nsubj")
+ assert "subtok" not in parser.labels
+ parser.initialize(lambda: [_parser_example(parser)])
+ assert "subtok" not in parser.labels
+
+
+@pytest.mark.issue(3830)
+def test_issue3830_with_subtok():
+ """Test that the parser does have subtok label if learn_tokens=True."""
+ config = {
+ "learn_tokens": True,
+ }
+ model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
+ parser = DependencyParser(Vocab(), model, **config)
+ parser.add_label("nsubj")
+ assert "subtok" not in parser.labels
+ parser.initialize(lambda: [_parser_example(parser)])
+ assert "subtok" in parser.labels
+
+
+@pytest.mark.issue(7716)
+@pytest.mark.xfail(reason="Not fixed yet")
+def test_partial_annotation(parser):
+ doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+ doc[2].is_sent_start = False
+ # Note that if the following line is used, then doc[2].is_sent_start == False
+ # doc[3].is_sent_start = False
+
+ doc = parser(doc)
+ assert doc[2].is_sent_start is False
+
+
def test_parser_root(en_vocab):
words = ["i", "do", "n't", "have", "other", "assistance"]
heads = [3, 3, 3, 3, 5, 3]
@@ -224,8 +314,8 @@ def test_parser_constructor(en_vocab):
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
- parser_1 = DependencyParser(en_vocab, model, **config)
- parser_2 = DependencyParser(en_vocab, model)
+ DependencyParser(en_vocab, model, **config)
+ DependencyParser(en_vocab, model)
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py
index c5288112d..869b8b874 100644
--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@@ -74,7 +74,7 @@ def test_annotates_on_update():
nlp.add_pipe("assert_sents")
# When the pipeline runs, annotations are set
- doc = nlp("This is a sentence.")
+ nlp("This is a sentence.")
examples = []
for text in ["a a", "b b", "c c"]:
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index 9c750ffd0..dab3ebf57 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -32,24 +32,6 @@ def pattern_dicts():
]
-@registry.misc("attribute_ruler_patterns")
-def attribute_ruler_patterns():
- return [
- {
- "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
- "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
- },
- # one pattern sets the lemma
- {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
- # another pattern sets the morphology
- {
- "patterns": [[{"ORTH": "test"}]],
- "attrs": {"MORPH": "Case=Nom|Number=Sing"},
- "index": 0,
- },
- ]
-
-
@pytest.fixture
def tag_map():
return {
@@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
+
# initialize with patterns from misc registry
+ @registry.misc("attribute_ruler_patterns")
+ def attribute_ruler_patterns():
+ return [
+ {
+ "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
+ "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
+ },
+ # one pattern sets the lemma
+ {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
+ # another pattern sets the morphology
+ {
+ "patterns": [[{"ORTH": "test"}]],
+ "attrs": {"MORPH": "Case=Nom|Number=Sing"},
+ "index": 0,
+ },
+ ]
+
nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"}
}
@@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set
assert scores["morph_acc"] is None
+ nlp.remove_pipe("attribute_ruler")
+
+ # test with custom scorer
+ @registry.misc("weird_scorer.v1")
+ def make_weird_scorer():
+ def weird_scorer(examples, weird_score, **kwargs):
+ return {"weird_score": weird_score}
+
+ return weird_scorer
+
+ ruler = nlp.add_pipe(
+ "attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
+ )
+ ruler.initialize(lambda: [], patterns=pattern_dicts)
+ scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
+ assert scores["weird_score"] == 0.12345
+ assert "token_acc" in scores
+ assert "lemma_acc" not in scores
+ scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
+ assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp):
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index b97795344..3740e430e 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,18 +1,20 @@
from typing import Callable, Iterable
+
import pytest
from numpy.testing import assert_equal
+
+from spacy import registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
-from spacy.kb import KnowledgeBase, get_candidates, Candidate
-from spacy.vocab import Vocab
-
-from spacy import util, registry
+from spacy.kb import Candidate, KnowledgeBase, get_candidates
+from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.scorer import Scorer
-from spacy.training import Example
-from spacy.lang.en import English
from spacy.tests.util import make_tempdir
from spacy.tokens import Span
+from spacy.training import Example
+from spacy.util import ensure_path
+from spacy.vocab import Vocab
@pytest.fixture
@@ -25,6 +27,198 @@ def assert_almost_equal(a, b):
assert a - delta <= b <= a + delta
+@pytest.mark.issue(4674)
+def test_issue4674():
+ """Test that setting entities with overlapping identifiers does not mess up IO"""
+ nlp = English()
+ kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ vector1 = [0.9, 1.1, 1.01]
+ vector2 = [1.8, 2.25, 2.01]
+ with pytest.warns(UserWarning):
+ kb.set_entities(
+ entity_list=["Q1", "Q1"],
+ freq_list=[32, 111],
+ vector_list=[vector1, vector2],
+ )
+ assert kb.get_size_entities() == 1
+ # dumping to file & loading back in
+ with make_tempdir() as d:
+ dir_path = ensure_path(d)
+ if not dir_path.exists():
+ dir_path.mkdir()
+ file_path = dir_path / "kb"
+ kb.to_disk(str(file_path))
+ kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb2.from_disk(str(file_path))
+ assert kb2.get_size_entities() == 1
+
+
+@pytest.mark.issue(6730)
+def test_issue6730(en_vocab):
+ """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
+ from spacy.kb import KnowledgeBase
+
+ kb = KnowledgeBase(en_vocab, entity_vector_length=3)
+ kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
+
+ with pytest.raises(ValueError):
+ kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
+ assert kb.contains_alias("") is False
+
+ kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
+ kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
+
+ with make_tempdir() as tmp_dir:
+ kb.to_disk(tmp_dir)
+ kb.from_disk(tmp_dir)
+ assert kb.get_size_aliases() == 2
+ assert set(kb.get_alias_strings()) == {"x", "y"}
+
+
+@pytest.mark.issue(7065)
+def test_issue7065():
+ text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
+ nlp = English()
+ nlp.add_pipe("sentencizer")
+ ruler = nlp.add_pipe("entity_ruler")
+ patterns = [
+ {
+ "label": "THING",
+ "pattern": [
+ {"LOWER": "symphony"},
+ {"LOWER": "no"},
+ {"LOWER": "."},
+ {"LOWER": "8"},
+ ],
+ }
+ ]
+ ruler.add_patterns(patterns)
+
+ doc = nlp(text)
+ sentences = [s for s in doc.sents]
+ assert len(sentences) == 2
+ sent0 = sentences[0]
+ ent = doc.ents[0]
+ assert ent.start < sent0.end < ent.end
+ assert sentences.index(ent.sent) == 0
+
+
+@pytest.mark.issue(7065)
+def test_issue7065_b():
+ # Test that the NEL doesn't crash when an entity crosses a sentence boundary
+ nlp = English()
+ vector_length = 3
+ nlp.add_pipe("sentencizer")
+ text = "Mahler 's Symphony No. 8 was beautiful."
+ entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
+ links = {
+ (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
+ (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
+ }
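+ # sent_starts: 1 marks a sentence start, -1 explicitly not a start, 0 leaves it unspecified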
+ sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+ doc = nlp(text)
+ example = Example.from_dict(
+ doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
+ )
+ train_examples = [example]
+
+ def create_kb(vocab):
+ # create artificial KB
+ mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
+ mykb.add_alias(
+ alias="No. 8",
+ entities=["Q270853"],
+ probabilities=[1.0],
+ )
+ mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
+ mykb.add_alias(
+ alias="Mahler",
+ entities=["Q7304"],
+ probabilities=[1.0],
+ )
+ return mykb
+
+ # Create the Entity Linker component and add it to the pipeline
+ entity_linker = nlp.add_pipe("entity_linker", last=True)
+ entity_linker.set_kb(create_kb)
+ # train the NEL pipe
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ for i in range(2):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+ # Add a custom rule-based component to mimic NER
+ patterns = [
+ {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
+ {
+ "label": "WORK",
+ "pattern": [
+ {"LOWER": "symphony"},
+ {"LOWER": "no"},
+ {"LOWER": "."},
+ {"LOWER": "8"},
+ ],
+ },
+ ]
+ ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+ ruler.add_patterns(patterns)
+ # test the trained model - this should not throw E148
+ doc = nlp(text)
+ assert doc
+
+
+def test_partial_links():
+ # Test that having some entities on the doc without gold links doesn't crash
+ TRAIN_DATA = [
+ (
+ "Russ Cochran his reprints include EC Comics.",
+ {
+ "links": {(0, 12): {"Q2146908": 1.0}},
+ "entities": [(0, 12, "PERSON")],
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
+ },
+ )
+ ]
+ nlp = English()
+ vector_length = 3
+ train_examples = []
+ for text, annotation in TRAIN_DATA:
+ doc = nlp(text)
+ train_examples.append(Example.from_dict(doc, annotation))
+
+ def create_kb(vocab):
+ # create artificial KB
+ mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+ mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
+ return mykb
+
+ # Create and train the Entity Linker
+ entity_linker = nlp.add_pipe("entity_linker", last=True)
+ entity_linker.set_kb(create_kb)
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ for i in range(2):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+ # adding additional components that are required for the entity_linker
+ nlp.add_pipe("sentencizer", first=True)
+ patterns = [
+ {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+ {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+ ]
+ ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+ ruler.add_patterns(patterns)
+
+ # this will run the pipeline on the examples and shouldn't crash
+ results = nlp.evaluate(train_examples)
+ assert "PERSON" in results["ents_per_type"]
+ assert "PERSON" in results["nel_f_per_type"]
+ assert "ORG" in results["ents_per_type"]
+ assert "ORG" not in results["nel_f_per_type"]
+
+
def test_kb_valid_entities(nlp):
"""Test the valid construction of a KB with 3 entities and two aliases"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
@@ -154,6 +348,40 @@ def test_kb_serialize(nlp):
mykb.from_disk(d / "unknown" / "kb")
+@pytest.mark.issue(9137)
+def test_kb_serialize_2(nlp):
+ v = [5, 6, 7, 8]
+ kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb1.set_entities(["E1"], [1], [v])
+ assert kb1.get_vector("E1") == v
+ with make_tempdir() as d:
+ kb1.to_disk(d / "kb")
+ kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb2.from_disk(d / "kb")
+ assert kb2.get_vector("E1") == v
+
+
+def test_kb_set_entities(nlp):
+ """Test that set_entities entirely overwrites the previous set of entities"""
+ v = [5, 6, 7, 8]
+ v1 = [1, 1, 1, 0]
+ v2 = [2, 2, 2, 3]
+ kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb1.set_entities(["E0"], [1], [v])
+ assert kb1.get_entity_strings() == ["E0"]
+ kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
+ assert set(kb1.get_entity_strings()) == {"E1", "E2"}
+ assert kb1.get_vector("E1") == v1
+ assert kb1.get_vector("E2") == v2
+ with make_tempdir() as d:
+ kb1.to_disk(d / "kb")
+ kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb2.from_disk(d / "kb")
+ assert set(kb2.get_entity_strings()) == {"E1", "E2"}
+ assert kb2.get_vector("E1") == v1
+ assert kb2.get_vector("E2") == v2
+
+
def test_kb_serialize_vocab(nlp):
"""Test serialization of the KB and custom strings"""
entity = "MyFunnyID"
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index dc0ca0301..f2031d0a9 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -1,10 +1,14 @@
import pytest
from spacy import registry
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
from spacy.language import Language
-from spacy.pipeline import EntityRuler
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.errors import MatchPatternError
+from spacy.tests.util import make_tempdir
+
from thinc.api import NumpyOps, get_current_ops
@@ -32,6 +36,117 @@ def add_ent_component(doc):
return doc
+@pytest.mark.issue(3345)
+def test_issue3345():
+ """Test case where preset entity crosses sentence boundary."""
+ nlp = English()
+ doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
+ doc[4].is_sent_start = True
+ ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
+ cfg = {"model": DEFAULT_NER_MODEL}
+ model = registry.resolve(cfg, validate=True)["model"]
+ ner = EntityRecognizer(doc.vocab, model)
+ # Add the OUT action. I wouldn't have thought this would be necessary...
+ ner.moves.add_action(5, "")
+ ner.add_label("GPE")
+ doc = ruler(doc)
+ # Get into the state just before "New"
+ state = ner.moves.init_batch([doc])[0]
+ ner.moves.apply_transition(state, "O")
+ ner.moves.apply_transition(state, "O")
+ ner.moves.apply_transition(state, "O")
+ # Check that B-GPE is valid.
+ assert ner.moves.is_valid(state, "B-GPE")
+
+
+@pytest.mark.issue(4849)
+def test_issue4849():
+ nlp = English()
+ patterns = [
+ {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
+ {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
+ ]
+ ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+ ruler.add_patterns(patterns)
+ text = """
+ The left is starting to take aim at Democratic front-runner Joe Biden.
+ Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
+ """
+ # USING 1 PROCESS
+ count_ents = 0
+ for doc in nlp.pipe([text], n_process=1):
+ count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+ assert count_ents == 2
+ # USING 2 PROCESSES
+ if isinstance(get_current_ops(), NumpyOps):
+ count_ents = 0
+ for doc in nlp.pipe([text], n_process=2):
+ count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+ assert count_ents == 2
+
+
+@pytest.mark.issue(5918)
+def test_issue5918():
+ # Test edge case when merging entities.
+ nlp = English()
+ ruler = nlp.add_pipe("entity_ruler")
+ patterns = [
+ {"label": "ORG", "pattern": "Digicon Inc"},
+ {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
+ {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"},
+ ]
+ ruler.add_patterns(patterns)
+
+ text = """
+ Digicon Inc said it has completed the previously-announced disposition
+ of its computer systems division to an investment group led by
+ Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.
+ """
+ doc = nlp(text)
+ assert len(doc.ents) == 3
+ # make it so that the third span's head is within the entity (ent_iob=I)
+ # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
+ # TODO: test for logging here
+ # with pytest.warns(UserWarning):
+ # doc[29].head = doc[33]
+ doc = merge_entities(doc)
+ assert len(doc.ents) == 3
+
+
+@pytest.mark.issue(8168)
+def test_issue8168():
+ nlp = English()
+ ruler = nlp.add_pipe("entity_ruler")
+ patterns = [
+ {"label": "ORG", "pattern": "Apple"},
+ {
+ "label": "GPE",
+ "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
+ "id": "san-francisco",
+ },
+ {
+ "label": "GPE",
+ "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}],
+ "id": "san-francisco",
+ },
+ ]
+ ruler.add_patterns(patterns)
+
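+ # both GPE patterns share the id "san-francisco", so the ruler keeps a single entry for it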
+ assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")}
+
+
+@pytest.mark.issue(8216)
+def test_entity_ruler_fix8216(nlp, patterns):
+ """Test that patterns don't get added excessively."""
+ ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
+ ruler.add_patterns(patterns)
+ pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+ assert pattern_count > 0
+ ruler.add_patterns([])
+ after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+ assert after_count == pattern_count
+
+
def test_entity_ruler_init(nlp, patterns):
ruler = EntityRuler(nlp, patterns=patterns)
assert len(ruler) == len(patterns)
@@ -238,3 +353,205 @@ def test_entity_ruler_multiprocessing(nlp, n_process):
for doc in nlp.pipe(texts, n_process=2):
for ent in doc.ents:
assert ent.ent_id_ == "1234"
+
+
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+ ruler = nlp.add_pipe("entity_ruler")
+ ruler.add_patterns(patterns)
+ with make_tempdir() as d:
+ ruler.to_disk(d / "test_ruler.jsonl")
+ ruler.from_disk(d / "test_ruler.jsonl") # read from an existing jsonl file
+ with pytest.raises(ValueError):
+ ruler.from_disk(d / "non_existing.jsonl") # read from a bad jsonl file
+
+
+def test_entity_ruler_serialize_dir(nlp, patterns):
+ ruler = nlp.add_pipe("entity_ruler")
+ ruler.add_patterns(patterns)
+ with make_tempdir() as d:
+ ruler.to_disk(d / "test_ruler")
+ ruler.from_disk(d / "test_ruler") # read from an existing directory
+ with pytest.raises(ValueError):
+ ruler.from_disk(d / "non_existing_dir") # read from a bad directory
+
+
+def test_entity_ruler_remove_basic(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [
+ {"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
+ {"label": "ORG", "pattern": "ACME", "id": "acme"},
+ {"label": "ORG", "pattern": "ACM"},
+ ]
+ ruler.add_patterns(patterns)
+ doc = ruler(nlp.make_doc("Duygu went to school"))
+ assert len(ruler.patterns) == 3
+ assert len(doc.ents) == 1
+ assert doc.ents[0].label_ == "PERSON"
+ assert doc.ents[0].text == "Duygu"
+ assert "PERSON||duygu" in ruler.phrase_matcher
+ ruler.remove("duygu")
+ doc = ruler(nlp.make_doc("Duygu went to school"))
+ assert len(doc.ents) == 0
+ assert "PERSON||duygu" not in ruler.phrase_matcher
+ assert len(ruler.patterns) == 2
+
+
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [
+ {"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
+ {"label": "ORG", "pattern": "DuyguCorp", "id": "duygu"},
+ {"label": "ORG", "pattern": "ACME", "id": "acme"},
+ ]
+ ruler.add_patterns(patterns)
+ doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME."))
+ assert len(ruler.patterns) == 3
+ assert "PERSON||duygu" in ruler.phrase_matcher
+ assert "ORG||duygu" in ruler.phrase_matcher
+ assert len(doc.ents) == 3
+ ruler.remove("duygu")
+ doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME."))
+ assert len(ruler.patterns) == 1
+ assert "PERSON||duygu" not in ruler.phrase_matcher
+ assert "ORG||duygu" not in ruler.phrase_matcher
+ assert len(doc.ents) == 1
+
+
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [
+ {"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
+ {"label": "ORG", "pattern": "ACME", "id": "acme"},
+ {"label": "ORG", "pattern": "ACM"},
+ ]
+ ruler.add_patterns(patterns)
+ assert len(ruler.patterns) == 3
+ with pytest.raises(ValueError):
+ ruler.remove("nepattern")
+ assert len(ruler.patterns) == 3
+
+
+def test_entity_ruler_remove_several_patterns(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [
+ {"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
+ {"label": "ORG", "pattern": "ACME", "id": "acme"},
+ {"label": "ORG", "pattern": "ACM"},
+ ]
+ ruler.add_patterns(patterns)
+ doc = ruler(nlp.make_doc("Duygu founded her company ACME."))
+ assert len(ruler.patterns) == 3
+ assert len(doc.ents) == 2
+ assert doc.ents[0].label_ == "PERSON"
+ assert doc.ents[0].text == "Duygu"
+ assert doc.ents[1].label_ == "ORG"
+ assert doc.ents[1].text == "ACME"
+ ruler.remove("duygu")
+ doc = ruler(nlp.make_doc("Duygu founded her company ACME"))
+ assert len(ruler.patterns) == 2
+ assert len(doc.ents) == 1
+ assert doc.ents[0].label_ == "ORG"
+ assert doc.ents[0].text == "ACME"
+ ruler.remove("acme")
+ doc = ruler(nlp.make_doc("Duygu founded her company ACME"))
+ assert len(ruler.patterns) == 1
+ assert len(doc.ents) == 0
+
+
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [
+ {"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
+ {"label": "ORG", "pattern": "ACME", "id": "acme"},
+ {"label": "DATE", "pattern": "her birthday", "id": "bday"},
+ {"label": "ORG", "pattern": "ACM"},
+ ]
+ ruler.add_patterns(patterns)
+ doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday"))
+ assert len(doc.ents) == 3
+ assert doc.ents[0].label_ == "PERSON"
+ assert doc.ents[0].text == "Duygu"
+ assert doc.ents[1].label_ == "ORG"
+ assert doc.ents[1].text == "ACME"
+ assert doc.ents[2].label_ == "DATE"
+ assert doc.ents[2].text == "her birthday"
+ ruler.remove("duygu")
+ ruler.remove("acme")
+ ruler.remove("bday")
+ doc = ruler(nlp.make_doc("Duygu went to school"))
+ assert len(doc.ents) == 0
+
+
+def test_entity_ruler_remove_all_patterns(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [
+ {"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
+ {"label": "ORG", "pattern": "ACME", "id": "acme"},
+ {"label": "DATE", "pattern": "her birthday", "id": "bday"},
+ ]
+ ruler.add_patterns(patterns)
+ assert len(ruler.patterns) == 3
+ ruler.remove("duygu")
+ assert len(ruler.patterns) == 2
+ ruler.remove("acme")
+ assert len(ruler.patterns) == 1
+ ruler.remove("bday")
+ assert len(ruler.patterns) == 0
+ with pytest.warns(UserWarning):
+ doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday"))
+ assert len(doc.ents) == 0
+
+
+def test_entity_ruler_remove_and_add(nlp):
+ ruler = EntityRuler(nlp)
+ patterns = [{"label": "DATE", "pattern": "last time"}]
+ ruler.add_patterns(patterns)
+ doc = ruler(
+ nlp.make_doc("I saw him last time we met, this time he brought some flowers")
+ )
+ assert len(ruler.patterns) == 1
+ assert len(doc.ents) == 1
+ assert doc.ents[0].label_ == "DATE"
+ assert doc.ents[0].text == "last time"
+ patterns1 = [{"label": "DATE", "pattern": "this time", "id": "ttime"}]
+ ruler.add_patterns(patterns1)
+ doc = ruler(
+ nlp.make_doc("I saw him last time we met, this time he brought some flowers")
+ )
+ assert len(ruler.patterns) == 2
+ assert len(doc.ents) == 2
+ assert doc.ents[0].label_ == "DATE"
+ assert doc.ents[0].text == "last time"
+ assert doc.ents[1].label_ == "DATE"
+ assert doc.ents[1].text == "this time"
+ ruler.remove("ttime")
+ doc = ruler(
+ nlp.make_doc("I saw him last time we met, this time he brought some flowers")
+ )
+ assert len(ruler.patterns) == 1
+ assert len(doc.ents) == 1
+ assert doc.ents[0].label_ == "DATE"
+ assert doc.ents[0].text == "last time"
+ ruler.add_patterns(patterns1)
+ doc = ruler(
+ nlp.make_doc("I saw him last time we met, this time he brought some flowers")
+ )
+ assert len(ruler.patterns) == 2
+ assert len(doc.ents) == 2
+ patterns2 = [{"label": "DATE", "pattern": "another time", "id": "ttime"}]
+ ruler.add_patterns(patterns2)
+ doc = ruler(
+ nlp.make_doc(
+ "I saw him last time we met, this time he brought some flowers, another time some chocolate."
+ )
+ )
+ assert len(ruler.patterns) == 3
+ assert len(doc.ents) == 3
+ ruler.remove("ttime")
+ doc = ruler(
+ nlp.make_doc(
+ "I saw him last time we met, this time he brought some flowers, another time some chocolate."
+ )
+ )
+ assert len(ruler.patterns) == 1
+ assert len(doc.ents) == 1
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
index 454d7b08b..e4adfe2fe 100644
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@@ -3,6 +3,8 @@ from spacy.pipeline.functions import merge_subtokens
from spacy.language import Language
from spacy.tokens import Span, Doc
+from ..doc.test_underscore import clean_underscore # noqa: F401
+
@pytest.fixture
def doc(en_vocab):
@@ -74,3 +76,26 @@ def test_token_splitter():
"i",
]
assert all(len(t.text) <= token_splitter.split_length for t in doc)
+
+
+@pytest.mark.usefixtures("clean_underscore")
+def test_factories_doc_cleaner():
+ nlp = Language()
+ nlp.add_pipe("doc_cleaner")
+ doc = nlp.make_doc("text")
+ doc.tensor = [1, 2, 3]
+ doc = nlp(doc)
+ assert doc.tensor is None
+
+ nlp = Language()
+ nlp.add_pipe("doc_cleaner", config={"silent": False})
+ with pytest.warns(UserWarning):
+ doc = nlp("text")
+
+ Doc.set_extension("test_attr", default=-1)
+ nlp = Language()
+ nlp.add_pipe("doc_cleaner", config={"attrs": {"_.test_attr": 0}})
+ doc = nlp.make_doc("text")
+ doc._.test_attr = 100
+ doc = nlp(doc)
+ assert doc._.test_attr == 0
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 1bec8696c..0d2d3d6e5 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -110,4 +110,4 @@ def test_lemmatizer_serialize(nlp):
assert doc2[0].lemma_ == "cope"
# Make sure that lemmatizer cache can be pickled
- b = pickle.dumps(lemmatizer2)
+ pickle.dumps(lemmatizer2)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 9680d70d2..11d6f0477 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.tests.util import make_tempdir
from spacy.morphology import Morphology
from spacy.attrs import MORPH
+from spacy.tokens import Doc
def test_label_types():
@@ -137,6 +138,41 @@ def test_overfitting_IO():
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags
+ # Test overwrite+extend settings
+ # (note that "" is unset, "_" is set and empty)
+ morphs = ["Feat=V", "Feat=N", "_"]
+ doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
+ orig_morphs = [str(t.morph) for t in doc]
+ orig_pos_tags = [t.pos_ for t in doc]
+ morphologizer = nlp.get_pipe("morphologizer")
+
+ # don't overwrite or extend
+ morphologizer.cfg["overwrite"] = False
+ doc = morphologizer(doc)
+ assert [str(t.morph) for t in doc] == orig_morphs
+ assert [t.pos_ for t in doc] == orig_pos_tags
+
+ # overwrite and extend
+ morphologizer.cfg["overwrite"] = True
+ morphologizer.cfg["extend"] = True
+ doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
+ doc = morphologizer(doc)
+ assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]
+
+ # extend without overwriting
+ morphologizer.cfg["overwrite"] = False
+ morphologizer.cfg["extend"] = True
+ doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
+ doc = morphologizer(doc)
+ assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]
+
+ # overwrite without extending
+ morphologizer.cfg["overwrite"] = True
+ morphologizer.cfg["extend"] = False
+ doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
+ doc = morphologizer(doc)
+ assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]
+
# Test with unset morph and partial POS
nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer")
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index f1f0c8a6e..4128e2a48 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -1,4 +1,6 @@
import pytest
+
+import spacy
from spacy.language import Language
from spacy.lang.en import English
from spacy.lang.de import German
@@ -11,6 +13,37 @@ from pydantic import StrictInt, StrictStr
from ..util import make_tempdir
+@pytest.mark.issue(5137)
+def test_issue5137():
+ factory_name = "test_issue5137"
+ pipe_name = "my_component"
+
+ @Language.factory(factory_name)
+ class MyComponent:
+ def __init__(self, nlp, name=pipe_name, categories="all_categories"):
+ self.nlp = nlp
+ self.categories = categories
+ self.name = name
+
+ def __call__(self, doc):
+ pass
+
+ def to_disk(self, path, **kwargs):
+ pass
+
+ def from_disk(self, path, **cfg):
+ pass
+
+ nlp = English()
+ my_component = nlp.add_pipe(factory_name, name=pipe_name)
+ assert my_component.categories == "all_categories"
+ with make_tempdir() as tmpdir:
+ nlp.to_disk(tmpdir)
+ overrides = {"components": {pipe_name: {"categories": "my_categories"}}}
+ nlp2 = spacy.load(tmpdir, config=overrides)
+ assert nlp2.get_pipe(pipe_name).categories == "my_categories"
+
+
def test_pipe_function_component():
name = "test_component"
@@ -135,8 +168,8 @@ def test_pipe_class_component_defaults():
self,
nlp: Language,
name: str,
- value1: StrictInt = 10,
- value2: StrictStr = "hello",
+ value1: StrictInt = StrictInt(10),
+ value2: StrictStr = StrictStr("hello"),
):
self.nlp = nlp
self.value1 = value1
@@ -196,7 +229,11 @@ def test_pipe_class_component_model_custom():
@Language.factory(name, default_config=default_config)
class Component:
def __init__(
- self, nlp: Language, model: Model, name: str, value1: StrictInt = 10
+ self,
+ nlp: Language,
+ model: Model,
+ name: str,
+ value1: StrictInt = StrictInt(10),
):
self.nlp = nlp
self.model = model
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index e530cb5c4..4b8fb8ebc 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,9 +1,17 @@
+import gc
+
+import numpy
import pytest
+from thinc.api import get_current_ops
+
+from spacy.lang.en import English
+from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.language import Language
from spacy.pipeline import TrainablePipe
+from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import SimpleFrozenList, get_arg_names
-from spacy.lang.en import English
+from spacy.vocab import Vocab
@pytest.fixture
@@ -21,6 +29,138 @@ def other_pipe(doc):
return doc
+@pytest.mark.issue(1506)
+def test_issue1506():
+ def string_generator():
+ for _ in range(10001):
+ yield "It's sentence produced by that bug."
+ for _ in range(10001):
+ yield "I erase some hbdsaj lemmas."
+ for _ in range(10001):
+ yield "I erase lemmas."
+ for _ in range(10001):
+ yield "It's sentence produced by that bug."
+ for _ in range(10001):
+ yield "It's sentence produced by that bug."
+
+ nlp = English()
+ for i, d in enumerate(nlp.pipe(string_generator())):
+ # We need to run cleanup more than once to actually clean up the data.
+ # On the first run, cleanup only marks strings as "not hit".
+ if i == 10000 or i == 20000 or i == 30000:
+ gc.collect()
+ for t in d:
+ str(t.lemma_)
+
+
+@pytest.mark.issue(1654)
+def test_issue1654():
+ nlp = Language(Vocab())
+ assert not nlp.pipeline
+
+ @Language.component("component")
+ def component(doc):
+ return doc
+
+ nlp.add_pipe("component", name="1")
+ nlp.add_pipe("component", name="2", after="1")
+ nlp.add_pipe("component", name="3", after="2")
+ assert nlp.pipe_names == ["1", "2", "3"]
+ nlp2 = Language(Vocab())
+ assert not nlp2.pipeline
+ nlp2.add_pipe("component", name="3")
+ nlp2.add_pipe("component", name="2", before="3")
+ nlp2.add_pipe("component", name="1", before="2")
+ assert nlp2.pipe_names == ["1", "2", "3"]
+
+
+@pytest.mark.issue(3880)
+def test_issue3880():
+ """Test that `nlp.pipe()` works when an empty string ends the batch.
+
+ Fixed in v7.0.5 of Thinc.
+ """
+ texts = ["hello", "world", "", ""]
+ nlp = English()
+ nlp.add_pipe("parser").add_label("dep")
+ nlp.add_pipe("ner").add_label("PERSON")
+ nlp.add_pipe("tagger").add_label("NN")
+ nlp.initialize()
+ for doc in nlp.pipe(texts):
+ pass
+
+
+@pytest.mark.issue(5082)
+def test_issue5082():
+ # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
+ nlp = English()
+ vocab = nlp.vocab
+ array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32)
+ array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32)
+ array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32)
+ array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32)
+ array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32)
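+ # array34 is the mean of array3 and array4: the expected vector for the merged "David Bowie" span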
+ vocab.set_vector("I", array1)
+ vocab.set_vector("like", array2)
+ vocab.set_vector("David", array3)
+ vocab.set_vector("Bowie", array4)
+ text = "I like David Bowie"
+ patterns = [
+ {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
+ ]
+ ruler = nlp.add_pipe("entity_ruler")
+ ruler.add_patterns(patterns)
+ parsed_vectors_1 = [t.vector for t in nlp(text)]
+ assert len(parsed_vectors_1) == 4
+ ops = get_current_ops()
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1)
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2)
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3)
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4)
+ nlp.add_pipe("merge_entities")
+ parsed_vectors_2 = [t.vector for t in nlp(text)]
+ assert len(parsed_vectors_2) == 3
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1)
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2)
+ numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)
+
+
+@pytest.mark.issue(5458)
+def test_issue5458():
+ # Test that the noun chunker does not generate overlapping spans
+ # fmt: off
+ words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+ vocab = Vocab(strings=words)
+ deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+ pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+ heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
+ # fmt: on
+ en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
+ en_doc.noun_chunks_iterator = noun_chunks
+
+ # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+ nlp = English()
+ merge_nps = nlp.create_pipe("merge_noun_chunks")
+ merge_nps(en_doc)
+
+
+def test_multiple_predictions():
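+ # a pipe whose predict() returns a tuple of outputs should still work when called on a doc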
+ class DummyPipe(TrainablePipe):
+ def __init__(self):
+ self.model = "dummy_model"
+
+ def predict(self, docs):
+ return ([1, 2, 3], [4, 5, 6])
+
+ def set_annotations(self, docs, scores):
+ return docs
+
+ nlp = Language()
+ doc = nlp.make_doc("foo")
+ dummy_pipe = DummyPipe()
+ dummy_pipe(doc)
+
+
def test_add_pipe_no_name(nlp):
nlp.add_pipe("new_pipe")
assert "new_pipe" in nlp.pipe_names
@@ -52,7 +192,7 @@ def test_cant_add_pipe_first_and_last(nlp):
nlp.add_pipe("new_pipe", first=True, last=True)
-@pytest.mark.parametrize("name", ["my_component"])
+@pytest.mark.parametrize("name", ["test_get_pipe"])
def test_get_pipe(nlp, name):
with pytest.raises(KeyError):
nlp.get_pipe(name)
@@ -62,7 +202,7 @@ def test_get_pipe(nlp, name):
@pytest.mark.parametrize(
"name,replacement,invalid_replacement",
- [("my_component", "other_pipe", lambda doc: doc)],
+ [("test_replace_pipe", "other_pipe", lambda doc: doc)],
)
def test_replace_pipe(nlp, name, replacement, invalid_replacement):
with pytest.raises(ValueError):
@@ -435,8 +575,8 @@ def test_update_with_annotates():
return component
- c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
- c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))
+ Language.component(f"{name}1", func=make_component(f"{name}1"))
+ Language.component(f"{name}2", func=make_component(f"{name}2"))
components = set([f"{name}1", f"{name}2"])
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 5345a4749..8060bc621 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,9 +1,17 @@
import pytest
-from numpy.testing import assert_equal
-from spacy.language import Language
-from spacy.training import Example
-from spacy.util import fix_random_seed, registry
+import numpy
+from numpy.testing import assert_array_equal, assert_almost_equal
+from thinc.api import get_current_ops, Ragged
+from spacy import util
+from spacy.lang.en import English
+from spacy.language import Language
+from spacy.tokens import SpanGroup
+from spacy.tokens._dict_proxies import SpanGroups
+from spacy.training import Example
+from spacy.util import fix_random_seed, registry, make_tempdir
+
+OPS = get_current_ops()
SPAN_KEY = "labeled_spans"
@@ -15,17 +23,22 @@ TRAIN_DATA = [
),
]
+TRAIN_DATA_OVERLAPPING = [
+ ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
+ (
+ "I like London and Berlin",
+ {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}},
+ ),
+ ("", {"spans": {SPAN_KEY: []}}),
+]
-def make_get_examples(nlp):
+
+def make_examples(nlp, data=TRAIN_DATA):
train_examples = []
- for t in TRAIN_DATA:
+ for t in data:
eg = Example.from_dict(nlp.make_doc(t[0]), t[1])
train_examples.append(eg)
-
- def get_examples():
- return train_examples
-
- return get_examples
+ return train_examples
def test_no_label():
@@ -52,9 +65,7 @@ def test_implicit_labels():
nlp = Language()
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
assert len(spancat.labels) == 0
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ train_examples = make_examples(nlp)
nlp.initialize(get_examples=lambda: train_examples)
assert spancat.labels == ("PERSON", "LOC")
@@ -69,24 +80,78 @@ def test_explicit_labels():
assert spancat.labels == ("PERSON", "LOC")
-def test_simple_train():
- fix_random_seed(0)
+# TODO figure out why this is flaky
+@pytest.mark.skip(reason="Test is unreliable for unknown reason")
+def test_doc_gc():
+ # If the Doc object is garbage collected, the spans won't be functional afterwards
nlp = Language()
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
- get_examples = make_get_examples(nlp)
- nlp.initialize(get_examples)
- sgd = nlp.create_optimizer()
- assert len(spancat.labels) != 0
- for i in range(40):
- losses = {}
- nlp.update(list(get_examples()), losses=losses, drop=0.1, sgd=sgd)
- doc = nlp("I like London and Berlin.")
- assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
- assert len(doc.spans[spancat.key]) == 2
- assert doc.spans[spancat.key][0].text == "London"
- scores = nlp.evaluate(get_examples())
- assert f"spans_{SPAN_KEY}_f" in scores
- assert scores[f"spans_{SPAN_KEY}_f"] == 1.0
+ spancat.add_label("PERSON")
+ nlp.initialize()
+ texts = [
+ "Just a sentence.",
+ "I like London and Berlin",
+ "I like Berlin",
+ "I eat ham.",
+ ]
+ all_spans = [doc.spans for doc in nlp.pipe(texts)]
+ for text, spangroups in zip(texts, all_spans):
+ assert isinstance(spangroups, SpanGroups)
+ for key, spangroup in spangroups.items():
+ assert isinstance(spangroup, SpanGroup)
+ # XXX This fails with length 0 sometimes
+ assert len(spangroup) > 0
+ with pytest.raises(RuntimeError):
+ span = spangroup[0]
+
+
+@pytest.mark.parametrize(
+ "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
+)
+def test_make_spangroup(max_positive, nr_results):
+ fix_random_seed(0)
+ nlp = Language()
+ spancat = nlp.add_pipe(
+ "spancat",
+ config={"spans_key": SPAN_KEY, "threshold": 0.5, "max_positive": max_positive},
+ )
+ doc = nlp.make_doc("Greater London")
+ ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
+ indices = ngram_suggester([doc])[0].dataXd
+ assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+ labels = ["Thing", "City", "Person", "GreatCity"]
+ scores = numpy.asarray(
+ [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
+ )
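+ # one row of scores per candidate span ([0, 1], [1, 2], [0, 2]) and one column per label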
+ spangroup = spancat._make_span_group(doc, indices, scores, labels)
+ assert len(spangroup) == nr_results
+
+ # first span is always the second token "London"
+ assert spangroup[0].text == "London"
+ assert spangroup[0].label_ == "City"
+ assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
+
+ # second span depends on the number of positives that were allowed
+ assert spangroup[1].text == "Greater London"
+ if max_positive == 1:
+ assert spangroup[1].label_ == "GreatCity"
+ assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
+ else:
+ assert spangroup[1].label_ == "Thing"
+ assert_almost_equal(0.8, spangroup.attrs["scores"][1], 5)
+
+ if nr_results > 2:
+ assert spangroup[2].text == "Greater London"
+ if max_positive == 2:
+ assert spangroup[2].label_ == "GreatCity"
+ assert_almost_equal(0.9, spangroup.attrs["scores"][2], 5)
+ else:
+ assert spangroup[2].label_ == "City"
+ assert_almost_equal(0.7, spangroup.attrs["scores"][2], 5)
+
+ assert spangroup[-1].text == "Greater London"
+ assert spangroup[-1].label_ == "GreatCity"
+ assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
def test_ngram_suggester(en_tokenizer):
@@ -116,12 +181,15 @@ def test_ngram_suggester(en_tokenizer):
for span in spans:
assert 0 <= span[0] < len(doc)
assert 0 < span[1] <= len(doc)
- spans_set.add((span[0], span[1]))
+ spans_set.add((int(span[0]), int(span[1])))
# spans are unique
assert spans.shape[0] == len(spans_set)
offset += ngrams.lengths[i]
# the number of spans is correct
- assert_equal(ngrams.lengths, [max(0, len(doc) - (size - 1)) for doc in docs])
+ assert_array_equal(
+ OPS.to_numpy(ngrams.lengths),
+ [max(0, len(doc) - (size - 1)) for doc in docs],
+ )
# test 1-3-gram suggestions
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3])
@@ -129,9 +197,9 @@ def test_ngram_suggester(en_tokenizer):
en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"]
]
ngrams = ngram_suggester(docs)
- assert_equal(ngrams.lengths, [1, 3, 6, 9, 12])
- assert_equal(
- ngrams.data,
+ assert_array_equal(OPS.to_numpy(ngrams.lengths), [1, 3, 6, 9, 12])
+ assert_array_equal(
+ OPS.to_numpy(ngrams.data),
[
# doc 0
[0, 1],
@@ -176,10 +244,156 @@ def test_ngram_suggester(en_tokenizer):
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1])
docs = [en_tokenizer(text) for text in ["", "a", ""]]
ngrams = ngram_suggester(docs)
- assert_equal(ngrams.lengths, [len(doc) for doc in docs])
+ assert_array_equal(OPS.to_numpy(ngrams.lengths), [len(doc) for doc in docs])
# test all empty docs
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1])
docs = [en_tokenizer(text) for text in ["", "", ""]]
ngrams = ngram_suggester(docs)
- assert_equal(ngrams.lengths, [len(doc) for doc in docs])
+ assert_array_equal(OPS.to_numpy(ngrams.lengths), [len(doc) for doc in docs])
+
+
+def test_ngram_sizes(en_tokenizer):
+    # test that the ngram range suggester produces the same spans as the fixed-size ngram suggester
+ size_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3])
+ suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1")
+ range_suggester = suggester_factory(min_size=1, max_size=3)
+ docs = [
+ en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"]
+ ]
+ ngrams_1 = size_suggester(docs)
+ ngrams_2 = range_suggester(docs)
+ assert_array_equal(OPS.to_numpy(ngrams_1.lengths), [1, 3, 6, 9, 12])
+ assert_array_equal(OPS.to_numpy(ngrams_1.lengths), OPS.to_numpy(ngrams_2.lengths))
+ assert_array_equal(OPS.to_numpy(ngrams_1.data), OPS.to_numpy(ngrams_2.data))
+
+    # one more variation: sizes 2-4 exclude single-token spans
+ suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1")
+ range_suggester = suggester_factory(min_size=2, max_size=4)
+ ngrams_3 = range_suggester(docs)
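+    # a doc with L tokens yields max(0, L - n + 1) spans of size n, hence [0, 1, 3, 6, 9] for docs of 1-5 tokens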
+ assert_array_equal(OPS.to_numpy(ngrams_3.lengths), [0, 1, 3, 6, 9])
+
+
+def test_overfitting_IO():
+    # Simple test to try to quickly overfit the spancat component, ensuring the ML models work correctly
+ fix_random_seed(0)
+ nlp = English()
+ spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+ train_examples = make_examples(nlp)
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ assert spancat.model.get_dim("nO") == 2
+ assert set(spancat.labels) == {"LOC", "PERSON"}
+
+ for i in range(50):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses)
+ assert losses["spancat"] < 0.01
+
+ # test the trained model
+ test_text = "I like London and Berlin"
+ doc = nlp(test_text)
+ assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
+ spans = doc.spans[SPAN_KEY]
+ assert len(spans) == 2
+ assert len(spans.attrs["scores"]) == 2
+ assert min(spans.attrs["scores"]) > 0.9
+ assert set([span.text for span in spans]) == {"London", "Berlin"}
+ assert set([span.label_ for span in spans]) == {"LOC"}
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ spans2 = doc2.spans[SPAN_KEY]
+ assert len(spans2) == 2
+ assert len(spans2.attrs["scores"]) == 2
+ assert min(spans2.attrs["scores"]) > 0.9
+ assert set([span.text for span in spans2]) == {"London", "Berlin"}
+ assert set([span.label_ for span in spans2]) == {"LOC"}
+
+ # Test scoring
+ scores = nlp.evaluate(train_examples)
+ assert f"spans_{SPAN_KEY}_f" in scores
+ assert scores[f"spans_{SPAN_KEY}_p"] == 1.0
+ assert scores[f"spans_{SPAN_KEY}_r"] == 1.0
+ assert scores[f"spans_{SPAN_KEY}_f"] == 1.0
+
+ # also test that the spancat works for just a single entity in a sentence
+ doc = nlp("London")
+ assert len(doc.spans[spancat.key]) == 1
+
+
+def test_overfitting_IO_overlapping():
+ # Test for overfitting on overlapping entities
+ fix_random_seed(0)
+ nlp = English()
+ spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+
+ train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING)
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ assert spancat.model.get_dim("nO") == 3
+ assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"}
+
+ for i in range(50):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses)
+ assert losses["spancat"] < 0.01
+
+ # test the trained model
+ test_text = "I like London and Berlin"
+ doc = nlp(test_text)
+ spans = doc.spans[SPAN_KEY]
+ assert len(spans) == 3
+ assert len(spans.attrs["scores"]) == 3
+ assert min(spans.attrs["scores"]) > 0.9
+ assert set([span.text for span in spans]) == {
+ "London",
+ "Berlin",
+ "London and Berlin",
+ }
+ assert set([span.label_ for span in spans]) == {"LOC", "DOUBLE_LOC"}
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ spans2 = doc2.spans[SPAN_KEY]
+ assert len(spans2) == 3
+ assert len(spans2.attrs["scores"]) == 3
+ assert min(spans2.attrs["scores"]) > 0.9
+ assert set([span.text for span in spans2]) == {
+ "London",
+ "Berlin",
+ "London and Berlin",
+ }
+ assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
+
+
+def test_zero_suggestions():
+ # Test with a suggester that returns 0 suggestions
+
+ @registry.misc("test_zero_suggester")
+ def make_zero_suggester():
+ def zero_suggester(docs, *, ops=None):
+ if ops is None:
+ ops = get_current_ops()
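+            # an empty Ragged: no (start, end) candidate rows and a length of 0 for every doc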
+ return Ragged(
+ ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
+ )
+
+ return zero_suggester
+
+ fix_random_seed(0)
+ nlp = English()
+ spancat = nlp.add_pipe(
+ "spancat",
+ config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+ )
+ train_examples = make_examples(nlp)
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ assert spancat.model.get_dim("nO") == 2
+ assert set(spancat.labels) == {"LOC", "PERSON"}
+
+ nlp.update(train_examples, sgd=optimizer)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 37895e7c8..96e75851e 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -6,10 +6,27 @@ from spacy import util
from spacy.training import Example
from spacy.lang.en import English
from spacy.language import Language
+from thinc.api import compounding
from ..util import make_tempdir
+@pytest.mark.issue(4348)
+def test_issue4348():
+ """Test that training the tagger with empty data, doesn't throw errors"""
+ nlp = English()
+ example = Example.from_dict(nlp.make_doc(""), {"tags": []})
+ TRAIN_DATA = [example, example]
+ tagger = nlp.add_pipe("tagger")
+ tagger.add_label("A")
+ optimizer = nlp.initialize()
+ for i in range(5):
+ losses = {}
+ batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+ for batch in batches:
+ nlp.update(batch, sgd=optimizer, losses=losses)
+
+
def test_label_types():
nlp = Language()
tagger = nlp.add_pipe("tagger")
@@ -182,6 +199,17 @@ def test_overfitting_IO():
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
+ # Try to unlearn the first 'N' tag with negative annotation
+ neg_ex = Example.from_dict(nlp.make_doc(test_text), {"tags": ["!N", "V", "J", "N"]})
+
+ for i in range(20):
+ losses = {}
+ nlp.update([neg_ex], sgd=optimizer, losses=losses)
+
+ # test the "untrained" tag
+ doc3 = nlp(test_text)
+ assert doc3[0].tag_ != "N"
+
def test_tagger_requires_labels():
nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index b134b8508..282789f2b 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,20 +1,31 @@
-import pytest
import random
+
import numpy.random
+import pytest
from numpy.testing import assert_almost_equal
-from thinc.api import fix_random_seed
+from thinc.api import Config, compounding, fix_random_seed, get_current_ops
+from wasabi import msg
+
+import spacy
from spacy import util
+from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TextCategorizer
-from spacy.tokens import Doc
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
+from spacy.tokens import Doc, DocBin
from spacy.training import Example
+from spacy.training.initialize import init_nlp
from ..util import make_tempdir
-
TRAIN_DATA_SINGLE_LABEL = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
@@ -48,6 +59,224 @@ def make_get_examples_multi_label(nlp):
return get_examples
+@pytest.mark.issue(3611)
+def test_issue3611():
+ """Test whether adding n-grams in the textcat works even when n > token length of some docs"""
+ unique_classes = ["offensive", "inoffensive"]
+ x_train = [
+ "This is an offensive text",
+ "This is the second offensive text",
+ "inoff",
+ ]
+ y_train = ["offensive", "offensive", "inoffensive"]
+ nlp = spacy.blank("en")
+ # preparing the data
+ train_data = []
+ for text, train_instance in zip(x_train, y_train):
+ cat_dict = {label: label == train_instance for label in unique_classes}
+ train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
+ # add a text categorizer component
+ model = {
+ "@architectures": "spacy.TextCatBOW.v1",
+ "exclusive_classes": True,
+ "ngram_size": 2,
+ "no_output_layer": False,
+ }
+ textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
+ for label in unique_classes:
+ textcat.add_label(label)
+ # training the network
+ with nlp.select_pipes(enable="textcat"):
+ optimizer = nlp.initialize()
+ for i in range(3):
+ losses = {}
+ batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+ for batch in batches:
+ nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
+
+
+@pytest.mark.issue(4030)
+def test_issue4030():
+ """Test whether textcat works fine with empty doc"""
+ unique_classes = ["offensive", "inoffensive"]
+ x_train = [
+ "This is an offensive text",
+ "This is the second offensive text",
+ "inoff",
+ ]
+ y_train = ["offensive", "offensive", "inoffensive"]
+ nlp = spacy.blank("en")
+ # preparing the data
+ train_data = []
+ for text, train_instance in zip(x_train, y_train):
+ cat_dict = {label: label == train_instance for label in unique_classes}
+ train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
+ # add a text categorizer component
+ model = {
+ "@architectures": "spacy.TextCatBOW.v1",
+ "exclusive_classes": True,
+ "ngram_size": 2,
+ "no_output_layer": False,
+ }
+ textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
+ for label in unique_classes:
+ textcat.add_label(label)
+ # training the network
+ with nlp.select_pipes(enable="textcat"):
+ optimizer = nlp.initialize()
+ for i in range(3):
+ losses = {}
+ batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+ for batch in batches:
+ nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
+ # processing of an empty doc should result in 0.0 for all categories
+ doc = nlp("")
+ assert doc.cats["offensive"] == 0.0
+ assert doc.cats["inoffensive"] == 0.0
+
+
+@pytest.mark.parametrize(
+ "textcat_config",
+ [
+ single_label_default_config,
+ single_label_bow_config,
+ single_label_cnn_config,
+ multi_label_default_config,
+ multi_label_bow_config,
+ multi_label_cnn_config,
+ ],
+)
+@pytest.mark.issue(5551)
+def test_issue5551(textcat_config):
+ """Test that after fixing the random seed, the results of the pipeline are truly identical"""
+ component = "textcat"
+
+ pipe_cfg = Config().from_str(textcat_config)
+ results = []
+ for i in range(3):
+ fix_random_seed(0)
+ nlp = English()
+ text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
+ annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
+ pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
+ for label in set(annots["cats"]):
+ pipe.add_label(label)
+ # Train
+ nlp.initialize()
+ doc = nlp.make_doc(text)
+ nlp.update([Example.from_dict(doc, annots)])
+ # Store the result of each iteration
+ result = pipe.model.predict([doc])
+ results.append(result[0])
+ # All results should be the same because of the fixed seed
+ assert len(results) == 3
+ ops = get_current_ops()
+ assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+ assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
+
+
+CONFIG_ISSUE_6908 = """
+[paths]
+train = "TRAIN_PLACEHOLDER"
+raw = null
+init_tok2vec = null
+vectors = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["textcat"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+
+[components]
+
+[components.textcat]
+factory = "TEXTCAT_PLACEHOLDER"
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+
+[training]
+train_corpus = "corpora.train"
+dev_corpus = "corpora.dev"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+frozen_components = []
+before_to_disk = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.components.textcat]
+labels = ['label1', 'label2']
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.mark.parametrize(
+ "component_name",
+ ["textcat", "textcat_multilabel"],
+)
+@pytest.mark.issue(6908)
+def test_issue6908(component_name):
+ """Test intializing textcat with labels in a list"""
+
+ def create_data(out_file):
+ nlp = spacy.blank("en")
+ doc = nlp.make_doc("Some text")
+ doc.cats = {"label1": 0, "label2": 1}
+ out_data = DocBin(docs=[doc]).to_bytes()
+ with out_file.open("wb") as file_:
+ file_.write(out_data)
+
+ with make_tempdir() as tmp_path:
+ train_path = tmp_path / "train.spacy"
+ create_data(train_path)
+ config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
+ config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
+ config = util.load_config_from_str(config_str)
+ init_nlp(config)
+
+
+@pytest.mark.issue(7019)
+def test_issue7019():
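+    # the CLI printing helpers should handle missing (None) scores without raising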
+ scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None}
+ print_textcats_auc_per_cat(msg, scores)
+ scores = {
+ "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932},
+ "LABEL_B": {"p": None, "r": None, "f": None},
+ }
+ print_prf_per_type(msg, scores, name="foo", type="bar")
+
+
@pytest.mark.skip(reason="Test is flakey when run with others")
def test_simple_train():
nlp = Language()
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
deleted file mode 100644
index 6bb71f6f4..000000000
--- a/spacy/tests/regression/test_issue1-1000.py
+++ /dev/null
@@ -1,453 +0,0 @@
-import pytest
-import random
-from spacy import util
-from spacy.training import Example
-from spacy.matcher import Matcher
-from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from spacy.vocab import Vocab
-from spacy.lang.en import English
-from spacy.lookups import Lookups
-from spacy.tokens import Doc, Span
-
-from ..util import make_tempdir
-
-
-@pytest.mark.parametrize(
- "patterns",
- [
- [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
- [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]],
- ],
-)
-def test_issue118(en_tokenizer, patterns):
- """Test a bug that arose from having overlapping matches"""
- text = (
- "how many points did lebron james score against the boston celtics last night"
- )
- doc = en_tokenizer(text)
- ORG = doc.vocab.strings["ORG"]
- matcher = Matcher(doc.vocab)
- matcher.add("BostonCeltics", patterns)
- assert len(list(doc.ents)) == 0
- matches = [(ORG, start, end) for _, start, end in matcher(doc)]
- assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
- doc.ents = matches[:1]
- ents = list(doc.ents)
- assert len(ents) == 1
- assert ents[0].label == ORG
- assert ents[0].start == 9
- assert ents[0].end == 11
-
-
-@pytest.mark.parametrize(
- "patterns",
- [
- [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
- [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]],
- ],
-)
-def test_issue118_prefix_reorder(en_tokenizer, patterns):
- """Test a bug that arose from having overlapping matches"""
- text = (
- "how many points did lebron james score against the boston celtics last night"
- )
- doc = en_tokenizer(text)
- ORG = doc.vocab.strings["ORG"]
- matcher = Matcher(doc.vocab)
- matcher.add("BostonCeltics", patterns)
- assert len(list(doc.ents)) == 0
- matches = [(ORG, start, end) for _, start, end in matcher(doc)]
- doc.ents += tuple(matches)[1:]
- assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
- ents = doc.ents
- assert len(ents) == 1
- assert ents[0].label == ORG
- assert ents[0].start == 9
- assert ents[0].end == 11
-
-
-def test_issue242(en_tokenizer):
- """Test overlapping multi-word phrases."""
- text = "There are different food safety standards in different countries."
- patterns = [
- [{"LOWER": "food"}, {"LOWER": "safety"}],
- [{"LOWER": "safety"}, {"LOWER": "standards"}],
- ]
- doc = en_tokenizer(text)
- matcher = Matcher(doc.vocab)
- matcher.add("FOOD", patterns)
- matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
- match1, match2 = matches
- assert match1[1] == 3
- assert match1[2] == 5
- assert match2[1] == 4
- assert match2[2] == 6
- with pytest.raises(ValueError):
- # One token can only be part of one entity, so test that the matches
- # can't be added as entities
- doc.ents += tuple(matches)
-
-
-def test_issue309(en_vocab):
- """Test Issue #309: SBD fails on empty string"""
- doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"])
- assert len(doc) == 1
- sents = list(doc.sents)
- assert len(sents) == 1
-
-
-def test_issue351(en_tokenizer):
- doc = en_tokenizer(" This is a cat.")
- assert doc[0].idx == 0
- assert len(doc[0]) == 3
- assert doc[1].idx == 3
-
-
-def test_issue360(en_tokenizer):
- """Test tokenization of big ellipsis"""
- tokens = en_tokenizer("$45...............Asking")
- assert len(tokens) > 2
-
-
-@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
-def test_issue361(en_vocab, text1, text2):
- """Test Issue #361: Equality of lexemes"""
- assert en_vocab[text1] == en_vocab[text1]
- assert en_vocab[text1] != en_vocab[text2]
-
-
-def test_issue587(en_tokenizer):
- """Test that Matcher doesn't segfault on particular input"""
- doc = en_tokenizer("a b; c")
- matcher = Matcher(doc.vocab)
- matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]])
- matches = matcher(doc)
- assert len(matches) == 1
- matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]])
- matches = matcher(doc)
- assert len(matches) == 2
- matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]])
- matches = matcher(doc)
- assert len(matches) == 2
-
-
-def test_issue588(en_vocab):
- matcher = Matcher(en_vocab)
- with pytest.raises(ValueError):
- matcher.add("TEST", [[]])
-
-
-def test_issue590(en_vocab):
- """Test overlapping matches"""
- doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
- matcher = Matcher(en_vocab)
- matcher.add(
- "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]]
- )
- matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]])
- matches = matcher(doc)
- assert len(matches) == 2
-
-
-@pytest.mark.skip(reason="Old vocab-based lemmatization")
-def test_issue595():
- """Test lemmatization of base forms"""
- words = ["Do", "n't", "feed", "the", "dog"]
- lookups = Lookups()
- lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
- lookups.add_table("lemma_index", {"verb": {}})
- lookups.add_table("lemma_exc", {"verb": {}})
- vocab = Vocab()
- doc = Doc(vocab, words=words)
- doc[2].tag_ = "VB"
- assert doc[2].text == "feed"
- assert doc[2].lemma_ == "feed"
-
-
-def test_issue599(en_vocab):
- doc = Doc(en_vocab)
- doc2 = Doc(doc.vocab)
- doc2.from_bytes(doc.to_bytes())
- assert doc2.has_annotation("DEP")
-
-
-def test_issue600():
- vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
- doc = Doc(vocab, words=["hello"])
- doc[0].tag_ = "NN"
-
-
-def test_issue615(en_tokenizer):
- def merge_phrases(matcher, doc, i, matches):
- """Merge a phrase. We have to be careful here because we'll change the
- token indices. To avoid problems, merge all the phrases once we're called
- on the last match."""
- if i != len(matches) - 1:
- return None
- spans = [Span(doc, start, end, label=label) for label, start, end in matches]
- with doc.retokenize() as retokenizer:
- for span in spans:
- tag = "NNP" if span.label_ else span.root.tag_
- attrs = {"tag": tag, "lemma": span.text}
- retokenizer.merge(span, attrs=attrs)
- doc.ents = doc.ents + (span,)
-
- text = "The golf club is broken"
- pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
- label = "Sport_Equipment"
- doc = en_tokenizer(text)
- matcher = Matcher(doc.vocab)
- matcher.add(label, [pattern], on_match=merge_phrases)
- matcher(doc)
- entities = list(doc.ents)
- assert entities != []
- assert entities[0].label != 0
-
-
-@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
-def test_issue736(en_tokenizer, text, number):
- """Test that times like "7am" are tokenized correctly and that numbers are
- converted to string."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 2
- assert tokens[0].text == number
-
-
-@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
-def test_issue740(en_tokenizer, text):
- """Test that dates are not split and kept as one token. This behaviour is
- currently inconsistent, since dates separated by hyphens are still split.
- This will be hard to prevent without causing clashes with numeric ranges."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 1
-
-
-def test_issue743():
- doc = Doc(Vocab(), ["hello", "world"])
- token = doc[0]
- s = set([token])
- items = list(s)
- assert items[0] is token
-
-
-@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
-def test_issue744(en_tokenizer, text):
- """Test that 'were' and 'Were' are excluded from the contractions
- generated by the English tokenizer exceptions."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 3
- assert tokens[1].text.lower() == "were"
-
-
-@pytest.mark.parametrize(
- "text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
-)
-def test_issue759(en_tokenizer, text, is_num):
- tokens = en_tokenizer(text)
- assert tokens[0].like_num == is_num
-
-
-@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
-def test_issue775(en_tokenizer, text):
- """Test that 'Shell' and 'shell' are excluded from the contractions
- generated by the English tokenizer exceptions."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 1
- assert tokens[0].text == text
-
-
-@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
-def test_issue792(en_tokenizer, text):
- """Test for Issue #792: Trailing whitespace is removed after tokenization."""
- doc = en_tokenizer(text)
- assert "".join([token.text_with_ws for token in doc]) == text
-
-
-@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
-def test_control_issue792(en_tokenizer, text):
- """Test base case for Issue #792: Non-trailing whitespace"""
- doc = en_tokenizer(text)
- assert "".join([token.text_with_ws for token in doc]) == text
-
-
-@pytest.mark.skip(
- reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218"
-)
-@pytest.mark.parametrize(
- "text,tokens",
- [
- ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
- ("exception;--exclusive", ["exception", ";--", "exclusive"]),
- ("day.--Is", ["day", ".--", "Is"]),
- ("refinement:--just", ["refinement", ":--", "just"]),
- ("memories?--To", ["memories", "?--", "To"]),
- ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
- ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
- ],
-)
-def test_issue801(en_tokenizer, text, tokens):
- """Test that special characters + hyphens are split correctly."""
- doc = en_tokenizer(text)
- assert len(doc) == len(tokens)
- assert [t.text for t in doc] == tokens
-
-
-@pytest.mark.parametrize(
- "text,expected_tokens",
- [
- (
- "Smörsåsen används bl.a. till fisk",
- ["Smörsåsen", "används", "bl.a.", "till", "fisk"],
- ),
- (
- "Jag kommer först kl. 13 p.g.a. diverse förseningar",
- ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
- ),
- ],
-)
-def test_issue805(sv_tokenizer, text, expected_tokens):
- tokens = sv_tokenizer(text)
- token_list = [token.text for token in tokens if not token.is_space]
- assert expected_tokens == token_list
-
-
-def test_issue850():
- """The variable-length pattern matches the succeeding token. Check we
- handle the ambiguity correctly."""
- vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
- matcher = Matcher(vocab)
- pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
- matcher.add("FarAway", [pattern])
- doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
- match = matcher(doc)
- assert len(match) == 1
- ent_id, start, end = match[0]
- assert start == 0
- assert end == 4
-
-
-def test_issue850_basic():
- """Test Matcher matches with '*' operator and Boolean flag"""
- vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
- matcher = Matcher(vocab)
- pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
- matcher.add("FarAway", [pattern])
- doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
- match = matcher(doc)
- assert len(match) == 1
- ent_id, start, end = match[0]
- assert start == 0
- assert end == 4
-
-
-@pytest.mark.skip(
- reason="French exception list is not enabled in the default tokenizer anymore"
-)
-@pytest.mark.parametrize(
- "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
-)
-def test_issue852(fr_tokenizer, text):
- """Test that French tokenizer exceptions are imported correctly."""
- tokens = fr_tokenizer(text)
- assert len(tokens) == 1
-
-
-@pytest.mark.parametrize(
- "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
-)
-def test_issue859(en_tokenizer, text):
- """Test that no extra space is added in doc.text method."""
- doc = en_tokenizer(text)
- assert doc.text == text
-
-
-@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
-def test_issue886(en_tokenizer, text):
- """Test that token.idx matches the original text index for texts with newlines."""
- doc = en_tokenizer(text)
- for token in doc:
- assert len(token.text) == len(token.text_with_ws)
- assert text[token.idx] == token.text[0]
-
-
-@pytest.mark.parametrize("text", ["want/need"])
-def test_issue891(en_tokenizer, text):
- """Test that / infixes are split correctly."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 3
- assert tokens[1].text == "/"
-
-
-@pytest.mark.skip(reason="Old vocab-based lemmatization")
-@pytest.mark.parametrize(
- "text,tag,lemma",
- [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],
-)
-def test_issue912(en_vocab, text, tag, lemma):
- """Test base-forms are preserved."""
- doc = Doc(en_vocab, words=[text])
- doc[0].tag_ = tag
- assert doc[0].lemma_ == lemma
-
-
-@pytest.mark.slow
-def test_issue957(en_tokenizer):
- """Test that spaCy doesn't hang on many punctuation characters.
- If this test hangs, check (new) regular expressions for conflicting greedy operators
- """
- # Skip test if pytest-timeout is not installed
- pytest.importorskip("pytest_timeout")
- for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]:
- string = "0"
- for i in range(1, 100):
- string += punct + str(i)
- doc = en_tokenizer(string)
- assert doc
-
-
-def test_issue999():
- """Test that adding entities and resuming training works passably OK.
- There are two issues here:
- 1) We have to re-add labels. This isn't very nice.
- 2) There's no way to set the learning rate for the weight update, so we
- end up out-of-scale, causing it to learn too fast.
- """
- TRAIN_DATA = [
- ["hey", []],
- ["howdy", []],
- ["hey there", []],
- ["hello", []],
- ["hi", []],
- ["i'm looking for a place to eat", []],
- ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
- ["show me chinese restaurants", [(8, 15, "CUISINE")]],
- ["show me chines restaurants", [(8, 14, "CUISINE")]],
- ]
- nlp = English()
- ner = nlp.add_pipe("ner")
- for _, offsets in TRAIN_DATA:
- for start, end, label in offsets:
- ner.add_label(label)
- nlp.initialize()
- for itn in range(20):
- random.shuffle(TRAIN_DATA)
- for raw_text, entity_offsets in TRAIN_DATA:
- example = Example.from_dict(
- nlp.make_doc(raw_text), {"entities": entity_offsets}
- )
- nlp.update([example])
-
- with make_tempdir() as model_dir:
- nlp.to_disk(model_dir)
- nlp2 = util.load_model_from_path(model_dir)
-
- for raw_text, entity_offsets in TRAIN_DATA:
- doc = nlp2(raw_text)
- ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
- for start, end, label in entity_offsets:
- if (start, end) in ents:
- assert ents[(start, end)] == label
- break
- else:
- if entity_offsets:
- raise Exception(ents)
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
deleted file mode 100644
index d6a4600e3..000000000
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import pytest
-import re
-from spacy.tokens import Doc
-from spacy.vocab import Vocab
-from spacy.lang.en import English
-from spacy.lang.lex_attrs import LEX_ATTRS
-from spacy.matcher import Matcher
-from spacy.tokenizer import Tokenizer
-from spacy.symbols import ORTH, LEMMA, POS
-
-
-def test_issue1061():
- """Test special-case works after tokenizing. Was caching problem."""
- text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
- tokenizer = English().tokenizer
- doc = tokenizer(text)
- assert "MATH" in [w.text for w in doc]
- assert "_MATH_" not in [w.text for w in doc]
-
- tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
- doc = tokenizer(text)
- assert "_MATH_" in [w.text for w in doc]
- assert "MATH" not in [w.text for w in doc]
-
- # For sanity, check it works when pipeline is clean.
- tokenizer = English().tokenizer
- tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
- doc = tokenizer(text)
- assert "_MATH_" in [w.text for w in doc]
- assert "MATH" not in [w.text for w in doc]
-
-
-@pytest.mark.skip(
- reason="Can not be fixed without variable-width look-behind (which we don't want)"
-)
-def test_issue1235():
- """Test that g is not split of if preceded by a number and a letter"""
- nlp = English()
- testwords = "e2g 2g 52g"
- doc = nlp(testwords)
- assert len(doc) == 5
- assert doc[0].text == "e2g"
- assert doc[1].text == "2"
- assert doc[2].text == "g"
- assert doc[3].text == "52"
- assert doc[4].text == "g"
-
-
-def test_issue1242():
- nlp = English()
- doc = nlp("")
- assert len(doc) == 0
- docs = list(nlp.pipe(["", "hello"]))
- assert len(docs[0]) == 0
- assert len(docs[1]) == 1
-
-
-@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
-def test_issue1250():
- """Test cached special cases."""
- special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
- nlp = English()
- nlp.tokenizer.add_special_case("reimbur", special_case)
- lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
- assert lemmas == ["reimburse", ",", "reimburse", "..."]
- lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
- assert lemmas == ["reimburse", ",", "reimburse", "..."]
-
-
-def test_issue1257():
- """Test that tokens compare correctly."""
- doc1 = Doc(Vocab(), words=["a", "b", "c"])
- doc2 = Doc(Vocab(), words=["a", "c", "e"])
- assert doc1[0] != doc2[0]
- assert not doc1[0] == doc2[0]
-
-
-def test_issue1375():
- """Test that token.nbor() raises IndexError for out-of-bounds access."""
- doc = Doc(Vocab(), words=["0", "1", "2"])
- with pytest.raises(IndexError):
- assert doc[0].nbor(-1)
- assert doc[1].nbor(-1).text == "0"
- with pytest.raises(IndexError):
- assert doc[2].nbor(1)
- assert doc[1].nbor(1).text == "2"
-
-
-def test_issue1434():
- """Test matches occur when optional element at end of short doc."""
- pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
- vocab = Vocab(lex_attr_getters=LEX_ATTRS)
- hello_world = Doc(vocab, words=["Hello", "World"])
- hello = Doc(vocab, words=["Hello"])
- matcher = Matcher(vocab)
- matcher.add("MyMatcher", [pattern])
- matches = matcher(hello_world)
- assert matches
- matches = matcher(hello)
- assert matches
-
-
-@pytest.mark.parametrize(
- "string,start,end",
- [
- ("a", 0, 1),
- ("a b", 0, 2),
- ("a c", 0, 1),
- ("a b c", 0, 2),
- ("a b b c", 0, 3),
- ("a b b", 0, 3),
- ],
-)
-def test_issue1450(string, start, end):
- """Test matcher works when patterns end with * operator."""
- pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
- matcher = Matcher(Vocab())
- matcher.add("TSTEND", [pattern])
- doc = Doc(Vocab(), words=string.split())
- matches = matcher(doc)
- if start is None or end is None:
- assert matches == []
- assert matches[-1][1] == start
- assert matches[-1][2] == end
-
-
-def test_issue1488():
- prefix_re = re.compile(r"""[\[\("']""")
- suffix_re = re.compile(r"""[\]\)"']""")
- infix_re = re.compile(r"""[-~\.]""")
- simple_url_re = re.compile(r"""^https?://""")
-
- def my_tokenizer(nlp):
- return Tokenizer(
- nlp.vocab,
- {},
- prefix_search=prefix_re.search,
- suffix_search=suffix_re.search,
- infix_finditer=infix_re.finditer,
- token_match=simple_url_re.match,
- )
-
- nlp = English()
- nlp.tokenizer = my_tokenizer(nlp)
- doc = nlp("This is a test.")
- for token in doc:
- assert token.text
-
-
-def test_issue1494():
- infix_re = re.compile(r"""[^a-z]""")
- test_cases = [
- ("token 123test", ["token", "1", "2", "3", "test"]),
- ("token 1test", ["token", "1test"]),
- ("hello...test", ["hello", ".", ".", ".", "test"]),
- ]
-
- def new_tokenizer(nlp):
- return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
-
- nlp = English()
- nlp.tokenizer = new_tokenizer(nlp)
- for text, expected in test_cases:
- assert [token.text for token in nlp(text)] == expected
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
deleted file mode 100644
index f85ec70e1..000000000
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ /dev/null
@@ -1,351 +0,0 @@
-import pytest
-import gc
-import numpy
-import copy
-
-from spacy.training import Example
-from spacy.lang.en import English
-from spacy.lang.en.stop_words import STOP_WORDS
-from spacy.lang.lex_attrs import is_stop
-from spacy.vectors import Vectors
-from spacy.vocab import Vocab
-from spacy.language import Language
-from spacy.tokens import Doc, Span, Token
-from spacy.attrs import HEAD, DEP
-from spacy.matcher import Matcher
-
-from ..util import make_tempdir
-
-
-def test_issue1506():
- def string_generator():
- for _ in range(10001):
- yield "It's sentence produced by that bug."
- for _ in range(10001):
- yield "I erase some hbdsaj lemmas."
- for _ in range(10001):
- yield "I erase lemmas."
- for _ in range(10001):
- yield "It's sentence produced by that bug."
- for _ in range(10001):
- yield "It's sentence produced by that bug."
-
- nlp = English()
- for i, d in enumerate(nlp.pipe(string_generator())):
- # We should run cleanup more than one time to actually cleanup data.
- # In first run — clean up only mark strings as «not hitted».
- if i == 10000 or i == 20000 or i == 30000:
- gc.collect()
- for t in d:
- str(t.lemma_)
-
-
-def test_issue1518():
- """Test vectors.resize() works."""
- vectors = Vectors(shape=(10, 10))
- vectors.add("hello", row=2)
- vectors.resize((5, 9))
-
-
-def test_issue1537():
- """Test that Span.as_doc() doesn't segfault."""
- string = "The sky is blue . The man is pink . The dog is purple ."
- doc = Doc(Vocab(), words=string.split())
- doc[0].sent_start = True
- for word in doc[1:]:
- if word.nbor(-1).text == ".":
- word.sent_start = True
- else:
- word.sent_start = False
- sents = list(doc.sents)
- sent0 = sents[0].as_doc()
- sent1 = sents[1].as_doc()
- assert isinstance(sent0, Doc)
- assert isinstance(sent1, Doc)
-
-
-# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
-# def test_issue1537_model():
-# nlp = load_spacy('en')
-# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
-# sents = [s.as_doc() for s in doc.sents]
-# print(list(sents[0].noun_chunks))
-# print(list(sents[1].noun_chunks))
-
-
-def test_issue1539():
- """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
- v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
- v.resize((100, 100))
-
-
-def test_issue1547():
- """Test that entity labels still match after merging tokens."""
- words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
- doc = Doc(Vocab(), words=words)
- doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
- with doc.retokenize() as retokenizer:
- retokenizer.merge(doc[5:7])
- assert [ent.text for ent in doc.ents]
-
-
-def test_issue1612(en_tokenizer):
- doc = en_tokenizer("The black cat purrs.")
- span = doc[1:3]
- assert span.orth_ == span.text
-
-
-def test_issue1654():
- nlp = Language(Vocab())
- assert not nlp.pipeline
-
- @Language.component("component")
- def component(doc):
- return doc
-
- nlp.add_pipe("component", name="1")
- nlp.add_pipe("component", name="2", after="1")
- nlp.add_pipe("component", name="3", after="2")
- assert nlp.pipe_names == ["1", "2", "3"]
- nlp2 = Language(Vocab())
- assert not nlp2.pipeline
- nlp2.add_pipe("component", name="3")
- nlp2.add_pipe("component", name="2", before="3")
- nlp2.add_pipe("component", name="1", before="2")
- assert nlp2.pipe_names == ["1", "2", "3"]
-
-
-@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
-def test_issue1698(en_tokenizer, text):
- doc = en_tokenizer(text)
- assert len(doc) == 1
- assert not doc[0].like_url
-
-
-def test_issue1727():
- """Test that models with no pretrained vectors can be deserialized
- correctly after vectors are added."""
- nlp = Language(Vocab())
- data = numpy.ones((3, 300), dtype="f")
- vectors = Vectors(data=data, keys=["I", "am", "Matt"])
- tagger = nlp.create_pipe("tagger")
- tagger.add_label("PRP")
- assert tagger.cfg.get("pretrained_dims", 0) == 0
- tagger.vocab.vectors = vectors
- with make_tempdir() as path:
- tagger.to_disk(path)
- tagger = nlp.create_pipe("tagger").from_disk(path)
- assert tagger.cfg.get("pretrained_dims", 0) == 0
-
-
-def test_issue1757():
- """Test comparison against None doesn't cause segfault."""
- doc = Doc(Vocab(), words=["a", "b", "c"])
- assert not doc[0] < None
- assert not doc[0] is None
- assert doc[0] >= None
- assert not doc[:2] < None
- assert not doc[:2] is None
- assert doc[:2] >= None
- assert not doc.vocab["a"] is None
- assert not doc.vocab["a"] < None
-
-
-def test_issue1758(en_tokenizer):
- """Test that "would've" is handled by the English tokenizer exceptions."""
- tokens = en_tokenizer("would've")
- assert len(tokens) == 2
-
-
-def test_issue1773(en_tokenizer):
- """Test that spaces don't receive a POS but no TAG. This is the root cause
- of the serialization issue reported in #1773."""
- doc = en_tokenizer("\n")
- if doc[0].pos_ == "SPACE":
- assert doc[0].tag_ != ""
-
-
-def test_issue1799():
- """Test sentence boundaries are deserialized correctly, even for
- non-projective sentences."""
- heads_deps = numpy.asarray(
- [
- [1, 397],
- [4, 436],
- [2, 426],
- [1, 402],
- [0, 8206900633647566924],
- [18446744073709551615, 440],
- [18446744073709551614, 442],
- ],
- dtype="uint64",
- )
- doc = Doc(Vocab(), words="Just what I was looking for .".split())
- doc.vocab.strings.add("ROOT")
- doc = doc.from_array([HEAD, DEP], heads_deps)
- assert len(list(doc.sents)) == 1
-
-
-def test_issue1807():
- """Test vocab.set_vector also adds the word to the vocab."""
- vocab = Vocab(vectors_name="test_issue1807")
- assert "hello" not in vocab
- vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
- assert "hello" in vocab
-
-
-def test_issue1834():
- """Test that sentence boundaries & parse/tag flags are not lost
- during serialization."""
- words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
- doc = Doc(Vocab(), words=words)
- doc[6].is_sent_start = True
- new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
- assert new_doc[6].sent_start
- assert not new_doc.has_annotation("DEP")
- assert not new_doc.has_annotation("TAG")
- doc = Doc(
- Vocab(),
- words=words,
- tags=["TAG"] * len(words),
- heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
- deps=["dep"] * len(words),
- )
- new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
- assert new_doc[6].sent_start
- assert new_doc.has_annotation("DEP")
- assert new_doc.has_annotation("TAG")
-
-
-def test_issue1868():
- """Test Vocab.__contains__ works with int keys."""
- vocab = Vocab()
- lex = vocab["hello"]
- assert lex.orth in vocab
- assert lex.orth_ in vocab
- assert "some string" not in vocab
- int_id = vocab.strings.add("some string")
- assert int_id not in vocab
-
-
-def test_issue1883():
- matcher = Matcher(Vocab())
- matcher.add("pat1", [[{"orth": "hello"}]])
- doc = Doc(matcher.vocab, words=["hello"])
- assert len(matcher(doc)) == 1
- new_matcher = copy.deepcopy(matcher)
- new_doc = Doc(new_matcher.vocab, words=["hello"])
- assert len(new_matcher(new_doc)) == 1
-
-
-@pytest.mark.parametrize("word", ["the"])
-def test_issue1889(word):
- assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
-
-
-@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
-def test_issue1915():
- cfg = {"hidden_depth": 2} # should error out
- nlp = Language()
- ner = nlp.add_pipe("ner")
- ner.add_label("answer")
- with pytest.raises(ValueError):
- nlp.initialize(**cfg)
-
-
-def test_issue1945():
- """Test regression in Matcher introduced in v2.0.6."""
- matcher = Matcher(Vocab())
- matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]])
- doc = Doc(matcher.vocab, words=["a", "a", "a"])
- matches = matcher(doc) # we should see two overlapping matches here
- assert len(matches) == 2
- assert matches[0][1:] == (0, 2)
- assert matches[1][1:] == (1, 3)
-
-
-def test_issue1963(en_tokenizer):
- """Test that doc.merge() resizes doc.tensor"""
- doc = en_tokenizer("a b c d")
- doc.tensor = numpy.ones((len(doc), 128), dtype="f")
- with doc.retokenize() as retokenizer:
- retokenizer.merge(doc[0:2])
- assert len(doc) == 3
- assert doc.tensor.shape == (3, 128)
-
-
-@pytest.mark.parametrize("label", ["U-JOB-NAME"])
-def test_issue1967(label):
- nlp = Language()
- config = {}
- ner = nlp.create_pipe("ner", config=config)
- example = Example.from_dict(
- Doc(ner.vocab, words=["word"]),
- {
- "ids": [0],
- "words": ["word"],
- "tags": ["tag"],
- "heads": [0],
- "deps": ["dep"],
- "entities": [label],
- },
- )
- assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
-
-
-def test_issue1971(en_vocab):
- # Possibly related to #2675 and #2671?
- matcher = Matcher(en_vocab)
- pattern = [
- {"ORTH": "Doe"},
- {"ORTH": "!", "OP": "?"},
- {"_": {"optional": True}, "OP": "?"},
- {"ORTH": "!", "OP": "?"},
- ]
- Token.set_extension("optional", default=False)
- matcher.add("TEST", [pattern])
- doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
- # We could also assert length 1 here, but this is more conclusive, because
- # the real problem here is that it returns a duplicate match for a match_id
- # that's not actually in the vocab!
- matches = matcher(doc)
- assert all([match_id in en_vocab.strings for match_id, start, end in matches])
-
-
-def test_issue_1971_2(en_vocab):
- matcher = Matcher(en_vocab)
- pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
- pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}]
- doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
- matcher.add("TEST1", [pattern1, pattern2])
- matches = matcher(doc)
- assert len(matches) == 2
-
-
-def test_issue_1971_3(en_vocab):
- """Test that pattern matches correctly for multiple extension attributes."""
- Token.set_extension("a", default=1, force=True)
- Token.set_extension("b", default=2, force=True)
- doc = Doc(en_vocab, words=["hello", "world"])
- matcher = Matcher(en_vocab)
- matcher.add("A", [[{"_": {"a": 1}}]])
- matcher.add("B", [[{"_": {"b": 2}}]])
- matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
- assert len(matches) == 4
- assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
-
-
-def test_issue_1971_4(en_vocab):
- """Test that pattern matches correctly with multiple extension attribute
- values on a single token.
- """
- Token.set_extension("ext_a", default="str_a", force=True)
- Token.set_extension("ext_b", default="str_b", force=True)
- matcher = Matcher(en_vocab)
- doc = Doc(en_vocab, words=["this", "is", "text"])
- pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
- matcher.add("TEST", [pattern])
- matches = matcher(doc)
- # Uncommenting this caused a segmentation fault
- assert len(matches) == 1
- assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
deleted file mode 100644
index 09baab4d8..000000000
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import pytest
-import numpy
-from spacy.tokens import Doc
-from spacy.matcher import Matcher
-from spacy.displacy import render
-from spacy.training import iob_to_biluo
-from spacy.lang.it import Italian
-from spacy.lang.en import English
-
-from ..util import add_vecs_to_vocab
-
-
-@pytest.mark.skip(
- reason="Can not be fixed without iterative looping between prefix/suffix and infix"
-)
-def test_issue2070():
- """Test that checks that a dot followed by a quote is handled
- appropriately.
- """
- # Problem: The dot is now properly split off, but the prefix/suffix rules
- # are not applied again afterwards. This means that the quote will still be
- # attached to the remaining token.
- nlp = English()
- doc = nlp('First sentence."A quoted sentence" he said ...')
- assert len(doc) == 11
-
-
-def test_issue2179():
- """Test that spurious 'extra_labels' aren't created when initializing NER."""
- nlp = Italian()
- ner = nlp.add_pipe("ner")
- ner.add_label("CITIZENSHIP")
- nlp.initialize()
- nlp2 = Italian()
- nlp2.add_pipe("ner")
- assert len(nlp2.get_pipe("ner").labels) == 0
- model = nlp2.get_pipe("ner").model
- model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
- nlp2.from_bytes(nlp.to_bytes())
- assert "extra_labels" not in nlp2.get_pipe("ner").cfg
- assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
-
-
-def test_issue2203(en_vocab):
- """Test that lemmas are set correctly in doc.from_array."""
- words = ["I", "'ll", "survive"]
- tags = ["PRP", "MD", "VB"]
- lemmas = ["-PRON-", "will", "survive"]
- tag_ids = [en_vocab.strings.add(tag) for tag in tags]
- lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
- doc = Doc(en_vocab, words=words)
- # Work around lemma corruption problem and set lemmas after tags
- doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
- doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
- assert [t.tag_ for t in doc] == tags
- assert [t.lemma_ for t in doc] == lemmas
- # We need to serialize both tag and lemma, since this is what causes the bug
- doc_array = doc.to_array(["TAG", "LEMMA"])
- new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array)
- assert [t.tag_ for t in new_doc] == tags
- assert [t.lemma_ for t in new_doc] == lemmas
-
-
-def test_issue2219(en_vocab):
- vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
- add_vecs_to_vocab(en_vocab, vectors)
- [(word1, vec1), (word2, vec2)] = vectors
- doc = Doc(en_vocab, words=[word1, word2])
- assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
-
-
-def test_issue2361(de_vocab):
- chars = ("<", ">", "&", """)
- words = ["<", ">", "&", '"']
- doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
- html = render(doc)
- for char in chars:
- assert char in html
-
-
-def test_issue2385():
- """Test that IOB tags are correctly converted to BILUO tags."""
- # fix bug in labels with a 'b' character
- tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
- assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
- # maintain support for iob1 format
- tags2 = ("I-ORG", "I-ORG", "B-ORG")
- assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
- # maintain support for iob2 format
- tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
- assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
-
-
-@pytest.mark.parametrize(
- "tags",
- [
- ("B-ORG", "L-ORG"),
- ("B-PERSON", "I-PERSON", "L-PERSON"),
- ("U-BRAWLER", "U-BRAWLER"),
- ],
-)
-def test_issue2385_biluo(tags):
- """Test that BILUO-compatible tags aren't modified."""
- assert iob_to_biluo(tags) == list(tags)
-
-
-def test_issue2396(en_vocab):
- words = ["She", "created", "a", "test", "for", "spacy"]
- heads = [1, 1, 3, 1, 3, 4]
- deps = ["dep"] * len(heads)
- matrix = numpy.array(
- [
- [0, 1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1, 1],
- [1, 1, 2, 3, 3, 3],
- [1, 1, 3, 3, 3, 3],
- [1, 1, 3, 3, 4, 4],
- [1, 1, 3, 3, 4, 5],
- ],
- dtype=numpy.int32,
- )
- doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
- span = doc[:]
- assert (doc.get_lca_matrix() == matrix).all()
- assert (span.get_lca_matrix() == matrix).all()
-
-
-def test_issue2464(en_vocab):
- """Test problem with successive ?. This is the same bug, so putting it here."""
- matcher = Matcher(en_vocab)
- doc = Doc(en_vocab, words=["a", "b"])
- matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]])
- matches = matcher(doc)
- assert len(matches) == 3
-
-
-def test_issue2482():
- """Test we can serialize and deserialize a blank NER or parser model."""
- nlp = Italian()
- nlp.add_pipe("ner")
- b = nlp.to_bytes()
- Italian().from_bytes(b)
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
deleted file mode 100644
index 4952a545d..000000000
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ /dev/null
@@ -1,223 +0,0 @@
-import pytest
-from spacy import displacy
-from spacy.training import Example
-from spacy.lang.en import English
-from spacy.lang.ja import Japanese
-from spacy.lang.xx import MultiLanguage
-from spacy.language import Language
-from spacy.matcher import Matcher
-from spacy.tokens import Doc, Span
-from spacy.vocab import Vocab
-from spacy.compat import pickle
-import numpy
-import random
-
-
-def test_issue2564():
- """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
- nlp = Language()
- tagger = nlp.add_pipe("tagger")
- tagger.add_label("A")
- nlp.initialize()
- doc = nlp("hello world")
- assert doc.has_annotation("TAG")
- docs = nlp.pipe(["hello", "world"])
- piped_doc = next(docs)
- assert piped_doc.has_annotation("TAG")
-
-
-def test_issue2569(en_tokenizer):
- """Test that operator + is greedy."""
- doc = en_tokenizer("It is May 15, 1993.")
- doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
- matcher = Matcher(doc.vocab)
- matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]])
- matched = [doc[start:end] for _, start, end in matcher(doc)]
- matched = sorted(matched, key=len, reverse=True)
- assert len(matched) == 10
- assert len(matched[0]) == 4
- assert matched[0].text == "May 15, 1993"
-
-
-@pytest.mark.parametrize(
- "text",
- [
- "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume",
- "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
- ],
-)
-def test_issue2626_2835(en_tokenizer, text):
- """Check that sentence doesn't cause an infinite loop in the tokenizer."""
- doc = en_tokenizer(text)
- assert doc
-
-
-def test_issue2656(en_tokenizer):
- """Test that tokenizer correctly splits off punctuation after numbers with
- decimal points.
- """
- doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
- assert len(doc) == 11
- assert doc[0].text == "I"
- assert doc[1].text == "went"
- assert doc[2].text == "for"
- assert doc[3].text == "40.3"
- assert doc[4].text == ","
- assert doc[5].text == "and"
- assert doc[6].text == "got"
- assert doc[7].text == "home"
- assert doc[8].text == "by"
- assert doc[9].text == "10.0"
- assert doc[10].text == "."
-
-
-def test_issue2671():
- """Ensure the correct entity ID is returned for matches with quantifiers.
- See also #2675
- """
- nlp = English()
- matcher = Matcher(nlp.vocab)
- pattern_id = "test_pattern"
- pattern = [
- {"LOWER": "high"},
- {"IS_PUNCT": True, "OP": "?"},
- {"LOWER": "adrenaline"},
- ]
- matcher.add(pattern_id, [pattern])
- doc1 = nlp("This is a high-adrenaline situation.")
- doc2 = nlp("This is a high adrenaline situation.")
- matches1 = matcher(doc1)
- for match_id, start, end in matches1:
- assert nlp.vocab.strings[match_id] == pattern_id
- matches2 = matcher(doc2)
- for match_id, start, end in matches2:
- assert nlp.vocab.strings[match_id] == pattern_id
-
-
-def test_issue2728(en_vocab):
- """Test that displaCy ENT visualizer escapes HTML correctly."""
- doc = Doc(en_vocab, words=["test", "", "test"])
- doc.ents = [Span(doc, 0, 1, label="TEST")]
- html = displacy.render(doc, style="ent")
- assert "<RELEASE>" in html
- doc.ents = [Span(doc, 1, 2, label="TEST")]
- html = displacy.render(doc, style="ent")
- assert "<RELEASE>" in html
-
-
-def test_issue2754(en_tokenizer):
- """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
- a = en_tokenizer("a")
- assert a[0].norm_ == "a"
- am = en_tokenizer("am")
- assert am[0].norm_ == "am"
-
-
-def test_issue2772(en_vocab):
- """Test that deprojectivization doesn't mess up sentence boundaries."""
- # fmt: off
- words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."]
- # fmt: on
- # A tree with a non-projective (i.e. crossing) arc
- # The arcs (0, 4) and (2, 9) cross.
- heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9]
- deps = ["dep"] * len(heads)
- doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
- assert doc[1].is_sent_start is False
-
-
-@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
-@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
-def test_issue2782(text, lang_cls):
- """Check that like_num handles + and - before number."""
- nlp = lang_cls()
- doc = nlp(text)
- assert len(doc) == 1
- assert doc[0].like_num
-
-
-def test_issue2800():
- """Test issue that arises when too many labels are added to NER model.
- Used to cause segfault.
- """
- nlp = English()
- train_data = []
- train_data.extend(
- [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
- )
- entity_types = [str(i) for i in range(1000)]
- ner = nlp.add_pipe("ner")
- for entity_type in list(entity_types):
- ner.add_label(entity_type)
- optimizer = nlp.initialize()
- for i in range(20):
- losses = {}
- random.shuffle(train_data)
- for example in train_data:
- nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)
-
-
-def test_issue2822(it_tokenizer):
- """Test that the abbreviation of poco is kept as one word."""
- doc = it_tokenizer("Vuoi un po' di zucchero?")
- assert len(doc) == 6
- assert doc[0].text == "Vuoi"
- assert doc[1].text == "un"
- assert doc[2].text == "po'"
- assert doc[3].text == "di"
- assert doc[4].text == "zucchero"
- assert doc[5].text == "?"
-
-
-def test_issue2833(en_vocab):
- """Test that a custom error is raised if a token or span is pickled."""
- doc = Doc(en_vocab, words=["Hello", "world"])
- with pytest.raises(NotImplementedError):
- pickle.dumps(doc[0])
- with pytest.raises(NotImplementedError):
- pickle.dumps(doc[0:2])
-
-
-def test_issue2871():
- """Test that vectors recover the correct key for spaCy reserved words."""
- words = ["dog", "cat", "SUFFIX"]
- vocab = Vocab(vectors_name="test_issue2871")
- vocab.vectors.resize(shape=(3, 10))
- vector_data = numpy.zeros((3, 10), dtype="f")
- for word in words:
- _ = vocab[word] # noqa: F841
- vocab.set_vector(word, vector_data[0])
- vocab.vectors.name = "dummy_vectors"
- assert vocab["dog"].rank == 0
- assert vocab["cat"].rank == 1
- assert vocab["SUFFIX"].rank == 2
- assert vocab.vectors.find(key="dog") == 0
- assert vocab.vectors.find(key="cat") == 1
- assert vocab.vectors.find(key="SUFFIX") == 2
-
-
-def test_issue2901():
- """Test that `nlp` doesn't fail."""
- try:
- nlp = Japanese()
- except ImportError:
- pytest.skip()
-
- doc = nlp("pythonが大好きです")
- assert doc
-
-
-def test_issue2926(fr_tokenizer):
- """Test that the tokenizer correctly splits tokens separated by a slash (/)
- ending in a digit.
- """
- doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
- assert len(doc) == 8
- assert doc[0].text == "Learn"
- assert doc[1].text == "html5"
- assert doc[2].text == "/"
- assert doc[3].text == "css3"
- assert doc[4].text == "/"
- assert doc[5].text == "javascript"
- assert doc[6].text == "/"
- assert doc[7].text == "jquery"
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
deleted file mode 100644
index e123d2df9..000000000
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ /dev/null
@@ -1,255 +0,0 @@
-import pytest
-from spacy import registry
-from spacy.lang.en import English
-from spacy.lang.de import German
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
-from spacy.pipeline import EntityRuler, EntityRecognizer
-from spacy.matcher import Matcher, PhraseMatcher
-from spacy.tokens import Doc
-from spacy.vocab import Vocab
-from spacy.attrs import ENT_IOB, ENT_TYPE
-from spacy.compat import pickle
-from spacy import displacy
-from spacy.vectors import Vectors
-import numpy
-
-
-def test_issue3002():
- """Test that the tokenizer doesn't hang on a long list of dots"""
- nlp = German()
- doc = nlp(
- "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
- )
- assert len(doc) == 5
-
-
-def test_issue3009(en_vocab):
- """Test problem with matcher quantifiers"""
- patterns = [
- [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
- [
- {"ORTH": "has"},
- {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
- {"LOWER": "to"},
- {"LOWER": "do"},
- {"TAG": "IN"},
- ],
- [
- {"ORTH": "has"},
- {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
- {"LOWER": "to"},
- {"LOWER": "do"},
- {"TAG": "IN"},
- ],
- ]
- words = ["also", "has", "to", "do", "with"]
- tags = ["RB", "VBZ", "TO", "VB", "IN"]
- pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
- doc = Doc(en_vocab, words=words, tags=tags, pos=pos)
- matcher = Matcher(en_vocab)
- for i, pattern in enumerate(patterns):
- matcher.add(str(i), [pattern])
- matches = matcher(doc)
- assert matches
-
-
-def test_issue3012(en_vocab):
- """Test that the is_tagged attribute doesn't get overwritten when we from_array
- without tag information."""
- words = ["This", "is", "10", "%", "."]
- tags = ["DT", "VBZ", "CD", "NN", "."]
- pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
- ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
- doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
- assert doc.has_annotation("TAG")
- expected = ("10", "NUM", "CD", "PERCENT")
- assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
- header = [ENT_IOB, ENT_TYPE]
- ent_array = doc.to_array(header)
- doc.from_array(header, ent_array)
- assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
- # Serializing then deserializing
- doc_bytes = doc.to_bytes()
- doc2 = Doc(en_vocab).from_bytes(doc_bytes)
- assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
-
-
-def test_issue3199():
- """Test that Span.noun_chunks works correctly if no noun chunks iterator
- is available. To make this test future-proof, we're constructing a Doc
- with a new Vocab here and a parse tree to make sure the noun chunks run.
- """
- words = ["This", "is", "a", "sentence"]
- doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
- with pytest.raises(NotImplementedError):
- list(doc[0:3].noun_chunks)
-
-
-def test_issue3209():
- """Test issue that occurred in spaCy nightly where NER labels were being
- mapped to classes incorrectly after loading the model, when the labels
- were added using ner.add_label().
- """
- nlp = English()
- ner = nlp.add_pipe("ner")
- ner.add_label("ANIMAL")
- nlp.initialize()
- move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
- assert ner.move_names == move_names
- nlp2 = English()
- ner2 = nlp2.add_pipe("ner")
- model = ner2.model
- model.attrs["resize_output"](model, ner.moves.n_moves)
- nlp2.from_bytes(nlp.to_bytes())
- assert ner2.move_names == move_names
-
-
-def test_issue3248_1():
- """Test that the PhraseMatcher correctly reports its number of rules, not
- total number of patterns."""
- nlp = English()
- matcher = PhraseMatcher(nlp.vocab)
- matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
- matcher.add("TEST2", [nlp("d")])
- assert len(matcher) == 2
-
-
-def test_issue3248_2():
- """Test that the PhraseMatcher can be pickled correctly."""
- nlp = English()
- matcher = PhraseMatcher(nlp.vocab)
- matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
- matcher.add("TEST2", [nlp("d")])
- data = pickle.dumps(matcher)
- new_matcher = pickle.loads(data)
- assert len(new_matcher) == len(matcher)
-
-
-def test_issue3277(es_tokenizer):
- """Test that hyphens are split correctly as prefixes."""
- doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
- assert len(doc) == 14
- assert doc[0].text == "\u2014"
- assert doc[5].text == "\u2013"
- assert doc[9].text == "\u2013"
-
-
-def test_issue3288(en_vocab):
- """Test that retokenization works correctly via displaCy when punctuation
- is merged onto the preceding token and the tensor is resized."""
- words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
- heads = [1, 1, 1, 4, 4, 6, 4, 4]
- deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
- doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
- doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
- displacy.render(doc)
-
-
-def test_issue3289():
- """Test that Language.to_bytes handles serializing a pipeline component
- with an uninitialized model."""
- nlp = English()
- nlp.add_pipe("textcat")
- bytes_data = nlp.to_bytes()
- new_nlp = English()
- new_nlp.add_pipe("textcat")
- new_nlp.from_bytes(bytes_data)
-
-
-def test_issue3328(en_vocab):
- doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
- matcher = Matcher(en_vocab)
- patterns = [
- [{"LOWER": {"IN": ["hello", "how"]}}],
- [{"LOWER": {"IN": ["you", "doing"]}}],
- ]
- matcher.add("TEST", patterns)
- matches = matcher(doc)
- assert len(matches) == 4
- matched_texts = [doc[start:end].text for _, start, end in matches]
- assert matched_texts == ["Hello", "how", "you", "doing"]
-
-
-def test_issue3331(en_vocab):
- """Test that duplicate patterns for different rules result in multiple
- matches, one per rule.
- """
- matcher = PhraseMatcher(en_vocab)
- matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
- matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
- doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
- matches = matcher(doc)
- assert len(matches) == 2
- match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
- assert sorted(match_ids) == ["A", "B"]
-
-
-def test_issue3345():
- """Test case where preset entity crosses sentence boundary."""
- nlp = English()
- doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
- doc[4].is_sent_start = True
- ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
- cfg = {"model": DEFAULT_NER_MODEL}
- model = registry.resolve(cfg, validate=True)["model"]
- ner = EntityRecognizer(doc.vocab, model)
- # Add the OUT action. I wouldn't have thought this would be necessary...
- ner.moves.add_action(5, "")
- ner.add_label("GPE")
- doc = ruler(doc)
- # Get into the state just before "New"
- state = ner.moves.init_batch([doc])[0]
- ner.moves.apply_transition(state, "O")
- ner.moves.apply_transition(state, "O")
- ner.moves.apply_transition(state, "O")
- # Check that B-GPE is valid.
- assert ner.moves.is_valid(state, "B-GPE")
-
-
-def test_issue3412():
- data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
- vectors = Vectors(data=data, keys=["A", "B", "C"])
- keys, best_rows, scores = vectors.most_similar(
- numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
- )
- assert best_rows[0] == 2
-
-
-@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
-def test_issue3449():
- nlp = English()
- nlp.add_pipe("sentencizer")
- text1 = "He gave the ball to I. Do you want to go to the movies with I?"
- text2 = "He gave the ball to I. Do you want to go to the movies with I?"
- text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
- t1 = nlp(text1)
- t2 = nlp(text2)
- t3 = nlp(text3)
- assert t1[5].text == "I"
- assert t2[5].text == "I"
- assert t3[5].text == "I"
-
-
-def test_issue3456():
- # this crashed because of a padding error in layer.ops.unflatten in thinc
- nlp = English()
- tagger = nlp.add_pipe("tagger")
- tagger.add_label("A")
- nlp.initialize()
- list(nlp.pipe(["hi", ""]))
-
-
-def test_issue3468():
- """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
- be restored after serialization."""
- nlp = English()
- nlp.add_pipe("sentencizer")
- doc = nlp("Hello world")
- assert doc[0].is_sent_start
- assert doc.has_annotation("SENT_START")
- assert len(list(doc.sents)) == 1
- doc_bytes = doc.to_bytes()
- new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
- assert new_doc[0].is_sent_start
- assert new_doc.has_annotation("SENT_START")
- assert len(list(new_doc.sents)) == 1
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
deleted file mode 100644
index 71c3768dd..000000000
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ /dev/null
@@ -1,472 +0,0 @@
-import pytest
-from spacy.language import Language
-from spacy.vocab import Vocab
-from spacy.pipeline import EntityRuler, DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy import displacy, load
-from spacy.displacy import parse_deps
-from spacy.tokens import Doc, Token
-from spacy.matcher import Matcher, PhraseMatcher
-from spacy.errors import MatchPatternError
-from spacy.util import minibatch
-from spacy.training import Example
-from spacy.lang.hi import Hindi
-from spacy.lang.es import Spanish
-from spacy.lang.en import English
-from spacy.attrs import IS_ALPHA
-from spacy import registry
-from thinc.api import compounding
-import spacy
-import srsly
-import numpy
-
-from ..util import make_tempdir
-
-
-@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
-def test_issue3521(en_tokenizer, word):
- tok = en_tokenizer(word)[1]
- # 'not' and 'would' should be stopwords, also in their abbreviated forms
- assert tok.is_stop
-
-
-def test_issue_3526_1(en_vocab):
- patterns = [
- {"label": "HELLO", "pattern": "hello world"},
- {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
- {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
- {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
- {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
- ]
- nlp = Language(vocab=en_vocab)
- ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
- ruler_bytes = ruler.to_bytes()
- assert len(ruler) == len(patterns)
- assert len(ruler.labels) == 4
- assert ruler.overwrite
- new_ruler = EntityRuler(nlp)
- new_ruler = new_ruler.from_bytes(ruler_bytes)
- assert len(new_ruler) == len(ruler)
- assert len(new_ruler.labels) == 4
- assert new_ruler.overwrite == ruler.overwrite
- assert new_ruler.ent_id_sep == ruler.ent_id_sep
-
-
-def test_issue_3526_2(en_vocab):
- patterns = [
- {"label": "HELLO", "pattern": "hello world"},
- {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
- {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
- {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
- {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
- ]
- nlp = Language(vocab=en_vocab)
- ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
- bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
- new_ruler = EntityRuler(nlp)
- new_ruler = new_ruler.from_bytes(bytes_old_style)
- assert len(new_ruler) == len(ruler)
- for pattern in ruler.patterns:
- assert pattern in new_ruler.patterns
- assert new_ruler.overwrite is not ruler.overwrite
-
-
-def test_issue_3526_3(en_vocab):
- patterns = [
- {"label": "HELLO", "pattern": "hello world"},
- {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
- {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
- {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
- {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
- ]
- nlp = Language(vocab=en_vocab)
- ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
- with make_tempdir() as tmpdir:
- out_file = tmpdir / "entity_ruler"
- srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
- new_ruler = EntityRuler(nlp).from_disk(out_file)
- for pattern in ruler.patterns:
- assert pattern in new_ruler.patterns
- assert len(new_ruler) == len(ruler)
- assert new_ruler.overwrite is not ruler.overwrite
-
-
-def test_issue_3526_4(en_vocab):
- nlp = Language(vocab=en_vocab)
- patterns = [{"label": "ORG", "pattern": "Apple"}]
- config = {"overwrite_ents": True}
- ruler = nlp.add_pipe("entity_ruler", config=config)
- ruler.add_patterns(patterns)
- with make_tempdir() as tmpdir:
- nlp.to_disk(tmpdir)
- ruler = nlp.get_pipe("entity_ruler")
- assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
- assert ruler.overwrite is True
- nlp2 = load(tmpdir)
- new_ruler = nlp2.get_pipe("entity_ruler")
- assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
- assert new_ruler.overwrite is True
-
-
-def test_issue3531():
- """Test that displaCy renderer doesn't require "settings" key."""
- example_dep = {
- "words": [
- {"text": "But", "tag": "CCONJ"},
- {"text": "Google", "tag": "PROPN"},
- {"text": "is", "tag": "VERB"},
- {"text": "starting", "tag": "VERB"},
- {"text": "from", "tag": "ADP"},
- {"text": "behind.", "tag": "ADV"},
- ],
- "arcs": [
- {"start": 0, "end": 3, "label": "cc", "dir": "left"},
- {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
- {"start": 2, "end": 3, "label": "aux", "dir": "left"},
- {"start": 3, "end": 4, "label": "prep", "dir": "right"},
- {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
- ],
- }
- example_ent = {
- "text": "But Google is starting from behind.",
- "ents": [{"start": 4, "end": 10, "label": "ORG"}],
- }
- dep_html = displacy.render(example_dep, style="dep", manual=True)
- assert dep_html
- ent_html = displacy.render(example_ent, style="ent", manual=True)
- assert ent_html
-
-
-def test_issue3540(en_vocab):
- words = ["I", "live", "in", "NewYork", "right", "now"]
- tensor = numpy.asarray(
- [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
- dtype="f",
- )
- doc = Doc(en_vocab, words=words)
- doc.tensor = tensor
- gold_text = ["I", "live", "in", "NewYork", "right", "now"]
- assert [token.text for token in doc] == gold_text
- gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
- for i, lemma in enumerate(gold_lemma):
- doc[i].lemma_ = lemma
- assert [token.lemma_ for token in doc] == gold_lemma
- vectors_1 = [token.vector for token in doc]
- assert len(vectors_1) == len(doc)
-
- with doc.retokenize() as retokenizer:
- heads = [(doc[3], 1), doc[2]]
- attrs = {
- "POS": ["PROPN", "PROPN"],
- "LEMMA": ["New", "York"],
- "DEP": ["pobj", "compound"],
- }
- retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
-
- gold_text = ["I", "live", "in", "New", "York", "right", "now"]
- assert [token.text for token in doc] == gold_text
- gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
- assert [token.lemma_ for token in doc] == gold_lemma
- vectors_2 = [token.vector for token in doc]
- assert len(vectors_2) == len(doc)
- assert vectors_1[0].tolist() == vectors_2[0].tolist()
- assert vectors_1[1].tolist() == vectors_2[1].tolist()
- assert vectors_1[2].tolist() == vectors_2[2].tolist()
- assert vectors_1[4].tolist() == vectors_2[5].tolist()
- assert vectors_1[5].tolist() == vectors_2[6].tolist()
-
-
-def test_issue3549(en_vocab):
- """Test that match pattern validation doesn't raise on empty errors."""
- matcher = Matcher(en_vocab, validate=True)
- pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
- matcher.add("GOOD", [pattern])
- with pytest.raises(MatchPatternError):
- matcher.add("BAD", [[{"X": "Y"}]])
-
-
-@pytest.mark.skip("Matching currently only works on strings and integers")
-def test_issue3555(en_vocab):
- """Test that custom extensions with default None don't break matcher."""
- Token.set_extension("issue3555", default=None)
- matcher = Matcher(en_vocab)
- pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
- matcher.add("TEST", [pattern])
- doc = Doc(en_vocab, words=["have", "apple"])
- matcher(doc)
-
-
-def test_issue3611():
- """Test whether adding n-grams in the textcat works even when n > token length of some docs"""
- unique_classes = ["offensive", "inoffensive"]
- x_train = [
- "This is an offensive text",
- "This is the second offensive text",
- "inoff",
- ]
- y_train = ["offensive", "offensive", "inoffensive"]
- nlp = spacy.blank("en")
- # preparing the data
- train_data = []
- for text, train_instance in zip(x_train, y_train):
- cat_dict = {label: label == train_instance for label in unique_classes}
- train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
- # add a text categorizer component
- model = {
- "@architectures": "spacy.TextCatBOW.v1",
- "exclusive_classes": True,
- "ngram_size": 2,
- "no_output_layer": False,
- }
- textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
- for label in unique_classes:
- textcat.add_label(label)
- # training the network
- with nlp.select_pipes(enable="textcat"):
- optimizer = nlp.initialize()
- for i in range(3):
- losses = {}
- batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
-
- for batch in batches:
- nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
-
-
-def test_issue3625():
- """Test that default punctuation rules applies to hindi unicode characters"""
- nlp = Hindi()
- doc = nlp("hi. how हुए. होटल, होटल")
- expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
- assert [token.text for token in doc] == expected
-
-
-def test_issue3803():
- """Test that spanish num-like tokens have True for like_num attribute."""
- nlp = Spanish()
- text = "2 dos 1000 mil 12 doce"
- doc = nlp(text)
-
- assert [t.like_num for t in doc] == [True, True, True, True, True, True]
-
-
-def _parser_example(parser):
- doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
- gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
- return Example.from_dict(doc, gold)
-
-
-def test_issue3830_no_subtok():
- """Test that the parser doesn't have subtok label if not learn_tokens"""
- config = {
- "learn_tokens": False,
- }
- model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
- parser = DependencyParser(Vocab(), model, **config)
- parser.add_label("nsubj")
- assert "subtok" not in parser.labels
- parser.initialize(lambda: [_parser_example(parser)])
- assert "subtok" not in parser.labels
-
-
-def test_issue3830_with_subtok():
- """Test that the parser does have subtok label if learn_tokens=True."""
- config = {
- "learn_tokens": True,
- }
- model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
- parser = DependencyParser(Vocab(), model, **config)
- parser.add_label("nsubj")
- assert "subtok" not in parser.labels
- parser.initialize(lambda: [_parser_example(parser)])
- assert "subtok" in parser.labels
-
-
-def test_issue3839(en_vocab):
- """Test that match IDs returned by the matcher are correct, are in the string"""
- doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
- matcher = Matcher(en_vocab)
- match_id = "PATTERN"
- pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
- pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
- matcher.add(match_id, [pattern1])
- matches = matcher(doc)
- assert matches[0][0] == en_vocab.strings[match_id]
- matcher = Matcher(en_vocab)
- matcher.add(match_id, [pattern2])
- matches = matcher(doc)
- assert matches[0][0] == en_vocab.strings[match_id]
-
-
-@pytest.mark.parametrize(
- "sentence",
- [
- "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
- "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
- "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
- "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
- "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
- ],
-)
-def test_issue3869(sentence):
- """Test that the Doc's count_by function works consistently"""
- nlp = English()
- doc = nlp(sentence)
- count = 0
- for token in doc:
- count += token.is_alpha
- assert count == doc.count_by(IS_ALPHA).get(1, 0)
-
-
-def test_issue3879(en_vocab):
- doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
- assert len(doc) == 5
- pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
- matcher = Matcher(en_vocab)
- matcher.add("TEST", [pattern])
- assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
-
-
-def test_issue3880():
- """Test that `nlp.pipe()` works when an empty string ends the batch.
-
- Fixed in v7.0.5 of Thinc.
- """
- texts = ["hello", "world", "", ""]
- nlp = English()
- nlp.add_pipe("parser").add_label("dep")
- nlp.add_pipe("ner").add_label("PERSON")
- nlp.add_pipe("tagger").add_label("NN")
- nlp.initialize()
- for doc in nlp.pipe(texts):
- pass
-
-
-def test_issue3882(en_vocab):
- """Test that displaCy doesn't serialize the doc.user_data when making a
- copy of the Doc.
- """
- doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
- doc.user_data["test"] = set()
- parse_deps(doc)
-
-
-def test_issue3951(en_vocab):
- """Test that combinations of optional rules are matched correctly."""
- matcher = Matcher(en_vocab)
- pattern = [
- {"LOWER": "hello"},
- {"LOWER": "this", "OP": "?"},
- {"OP": "?"},
- {"LOWER": "world"},
- ]
- matcher.add("TEST", [pattern])
- doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
- matches = matcher(doc)
- assert len(matches) == 0
-
-
-def test_issue3959():
- """Ensure that a modified pos attribute is serialized correctly."""
- nlp = English()
- doc = nlp(
- "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
- )
- assert doc[0].pos_ == ""
- doc[0].pos_ = "NOUN"
- assert doc[0].pos_ == "NOUN"
- # usually this is already True when starting from proper models instead of blank English
- with make_tempdir() as tmp_dir:
- file_path = tmp_dir / "my_doc"
- doc.to_disk(file_path)
- doc2 = nlp("")
- doc2.from_disk(file_path)
- assert doc2[0].pos_ == "NOUN"
-
-
-def test_issue3962(en_vocab):
- """Ensure that as_doc does not result in out-of-bound access of tokens.
- This is achieved by setting the head to itself if it would lie out of the span otherwise."""
- # fmt: off
- words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
- heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
- deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
- # fmt: on
- doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
- span2 = doc[1:5] # "jests at scars ,"
- doc2 = span2.as_doc()
- doc2_json = doc2.to_json()
- assert doc2_json
- # head set to itself, being the new artificial root
- assert doc2[0].head.text == "jests"
- assert doc2[0].dep_ == "dep"
- assert doc2[1].head.text == "jests"
- assert doc2[1].dep_ == "prep"
- assert doc2[2].head.text == "at"
- assert doc2[2].dep_ == "pobj"
- assert doc2[3].head.text == "jests" # head set to the new artificial root
- assert doc2[3].dep_ == "dep"
- # We should still have 1 sentence
- assert len(list(doc2.sents)) == 1
- span3 = doc[6:9] # "never felt a"
- doc3 = span3.as_doc()
- doc3_json = doc3.to_json()
- assert doc3_json
- assert doc3[0].head.text == "felt"
- assert doc3[0].dep_ == "neg"
- assert doc3[1].head.text == "felt"
- assert doc3[1].dep_ == "ROOT"
- assert doc3[2].head.text == "felt" # head set to ancestor
- assert doc3[2].dep_ == "dep"
- # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
- assert len(list(doc3.sents)) == 1
-
-
-def test_issue3962_long(en_vocab):
- """Ensure that as_doc does not result in out-of-bound access of tokens.
- This is achieved by setting the head to itself if it would lie out of the span otherwise."""
- # fmt: off
- words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
- heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
- deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
- # fmt: on
- two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
- span2 = two_sent_doc[1:7] # "jests at scars. They never"
- doc2 = span2.as_doc()
- doc2_json = doc2.to_json()
- assert doc2_json
- # head set to itself, being the new artificial root (in sentence 1)
- assert doc2[0].head.text == "jests"
- assert doc2[0].dep_ == "ROOT"
- assert doc2[1].head.text == "jests"
- assert doc2[1].dep_ == "prep"
- assert doc2[2].head.text == "at"
- assert doc2[2].dep_ == "pobj"
- assert doc2[3].head.text == "jests"
- assert doc2[3].dep_ == "punct"
- # head set to itself, being the new artificial root (in sentence 2)
- assert doc2[4].head.text == "They"
- assert doc2[4].dep_ == "dep"
- # head set to the new artificial head (in sentence 2)
- assert doc2[4].head.text == "They"
- assert doc2[4].dep_ == "dep"
- # We should still have 2 sentences
- sents = list(doc2.sents)
- assert len(sents) == 2
- assert sents[0].text == "jests at scars ."
- assert sents[1].text == "They never"
-
-
-def test_issue3972(en_vocab):
- """Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
- matcher = PhraseMatcher(en_vocab)
- matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
- matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
- doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
- matches = matcher(doc)
-
- assert len(matches) == 2
-
- # We should have a match for each of the two rules
- found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
- assert "A" in found_ids
- assert "B" in found_ids
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
deleted file mode 100644
index 4410e6236..000000000
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ /dev/null
@@ -1,432 +0,0 @@
-import pytest
-from spacy.pipeline import TrainablePipe
-from spacy.matcher import PhraseMatcher, Matcher
-from spacy.tokens import Doc, Span, DocBin
-from spacy.training import Example, Corpus
-from spacy.training.converters import json_to_docs
-from spacy.vocab import Vocab
-from spacy.lang.en import English
-from spacy.util import minibatch, ensure_path, load_model
-from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
-from spacy.tokenizer import Tokenizer
-from spacy.lang.el import Greek
-from spacy.language import Language
-import spacy
-from thinc.api import compounding
-
-from ..util import make_tempdir
-
-
-def test_issue4002(en_vocab):
- """Test that the PhraseMatcher can match on overwritten NORM attributes."""
- matcher = PhraseMatcher(en_vocab, attr="NORM")
- pattern1 = Doc(en_vocab, words=["c", "d"])
- assert [t.norm_ for t in pattern1] == ["c", "d"]
- matcher.add("TEST", [pattern1])
- doc = Doc(en_vocab, words=["a", "b", "c", "d"])
- assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
- matches = matcher(doc)
- assert len(matches) == 1
- matcher = PhraseMatcher(en_vocab, attr="NORM")
- pattern2 = Doc(en_vocab, words=["1", "2"])
- pattern2[0].norm_ = "c"
- pattern2[1].norm_ = "d"
- assert [t.norm_ for t in pattern2] == ["c", "d"]
- matcher.add("TEST", [pattern2])
- matches = matcher(doc)
- assert len(matches) == 1
-
-
-def test_issue4030():
- """Test whether textcat works fine with empty doc"""
- unique_classes = ["offensive", "inoffensive"]
- x_train = [
- "This is an offensive text",
- "This is the second offensive text",
- "inoff",
- ]
- y_train = ["offensive", "offensive", "inoffensive"]
- nlp = spacy.blank("en")
- # preparing the data
- train_data = []
- for text, train_instance in zip(x_train, y_train):
- cat_dict = {label: label == train_instance for label in unique_classes}
- train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
- # add a text categorizer component
- model = {
- "@architectures": "spacy.TextCatBOW.v1",
- "exclusive_classes": True,
- "ngram_size": 2,
- "no_output_layer": False,
- }
- textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
- for label in unique_classes:
- textcat.add_label(label)
- # training the network
- with nlp.select_pipes(enable="textcat"):
- optimizer = nlp.initialize()
- for i in range(3):
- losses = {}
- batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
-
- for batch in batches:
- nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
- # processing of an empty doc should result in 0.0 for all categories
- doc = nlp("")
- assert doc.cats["offensive"] == 0.0
- assert doc.cats["inoffensive"] == 0.0
-
-
-def test_issue4042():
- """Test that serialization of an EntityRuler before NER works fine."""
- nlp = English()
- # add ner pipe
- ner = nlp.add_pipe("ner")
- ner.add_label("SOME_LABEL")
- nlp.initialize()
- # Add entity ruler
- patterns = [
- {"label": "MY_ORG", "pattern": "Apple"},
- {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
- ]
- # works fine with "after"
- ruler = nlp.add_pipe("entity_ruler", before="ner")
- ruler.add_patterns(patterns)
- doc1 = nlp("What do you think about Apple ?")
- assert doc1.ents[0].label_ == "MY_ORG"
-
- with make_tempdir() as d:
- output_dir = ensure_path(d)
- if not output_dir.exists():
- output_dir.mkdir()
- nlp.to_disk(output_dir)
- nlp2 = load_model(output_dir)
- doc2 = nlp2("What do you think about Apple ?")
- assert doc2.ents[0].label_ == "MY_ORG"
-
-
-def test_issue4042_bug2():
- """
- Test that serialization of an NER works fine when new labels were added.
- This is the second bug of two bugs underlying the issue 4042.
- """
- nlp1 = English()
- # add ner pipe
- ner1 = nlp1.add_pipe("ner")
- ner1.add_label("SOME_LABEL")
- nlp1.initialize()
- # add a new label to the doc
- doc1 = nlp1("What do you think about Apple ?")
- assert len(ner1.labels) == 1
- assert "SOME_LABEL" in ner1.labels
- apple_ent = Span(doc1, 5, 6, label="MY_ORG")
- doc1.ents = list(doc1.ents) + [apple_ent]
- # Add the label explicitly. Previously we didn't require this.
- ner1.add_label("MY_ORG")
- ner1(doc1)
- assert len(ner1.labels) == 2
- assert "SOME_LABEL" in ner1.labels
- assert "MY_ORG" in ner1.labels
- with make_tempdir() as d:
- # assert IO goes fine
- output_dir = ensure_path(d)
- if not output_dir.exists():
- output_dir.mkdir()
- ner1.to_disk(output_dir)
- config = {}
- ner2 = nlp1.create_pipe("ner", config=config)
- ner2.from_disk(output_dir)
- assert len(ner2.labels) == 2
-
-
-def test_issue4054(en_vocab):
- """Test that a new blank model can be made with a vocab from file,
- and that serialization does not drop the language at any point."""
- nlp1 = English()
- vocab1 = nlp1.vocab
- with make_tempdir() as d:
- vocab_dir = ensure_path(d / "vocab")
- if not vocab_dir.exists():
- vocab_dir.mkdir()
- vocab1.to_disk(vocab_dir)
- vocab2 = Vocab().from_disk(vocab_dir)
- nlp2 = spacy.blank("en", vocab=vocab2)
- nlp_dir = ensure_path(d / "nlp")
- if not nlp_dir.exists():
- nlp_dir.mkdir()
- nlp2.to_disk(nlp_dir)
- nlp3 = load_model(nlp_dir)
- assert nlp3.lang == "en"
-
-
-def test_issue4120(en_vocab):
- """Test that matches without a final {OP: ?} token are returned."""
- matcher = Matcher(en_vocab)
- matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
- doc1 = Doc(en_vocab, words=["a"])
- assert len(matcher(doc1)) == 1 # works
- doc2 = Doc(en_vocab, words=["a", "b", "c"])
- assert len(matcher(doc2)) == 2 # fixed
- matcher = Matcher(en_vocab)
- matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
- doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
- assert len(matcher(doc3)) == 2 # works
- matcher = Matcher(en_vocab)
- matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
- doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
- assert len(matcher(doc4)) == 3 # fixed
-
-
-def test_issue4133(en_vocab):
- nlp = English()
- vocab_bytes = nlp.vocab.to_bytes()
- words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
- pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
- doc = Doc(en_vocab, words=words)
- for i, token in enumerate(doc):
- token.pos_ = pos[i]
- # usually this is already True when starting from proper models instead of blank English
- doc_bytes = doc.to_bytes()
- vocab = Vocab()
- vocab = vocab.from_bytes(vocab_bytes)
- doc = Doc(vocab).from_bytes(doc_bytes)
- actual = []
- for token in doc:
- actual.append(token.pos_)
- assert actual == pos
-
-
-def test_issue4190():
- def customize_tokenizer(nlp):
- prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
- suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
- infix_re = compile_infix_regex(nlp.Defaults.infixes)
- # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
- exceptions = {
- k: v
- for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
- if not (len(k) == 2 and k[1] == ".")
- }
- new_tokenizer = Tokenizer(
- nlp.vocab,
- exceptions,
- prefix_search=prefix_re.search,
- suffix_search=suffix_re.search,
- infix_finditer=infix_re.finditer,
- token_match=nlp.tokenizer.token_match,
- )
- nlp.tokenizer = new_tokenizer
-
- test_string = "Test c."
- # Load default language
- nlp_1 = English()
- doc_1a = nlp_1(test_string)
- result_1a = [token.text for token in doc_1a] # noqa: F841
- # Modify tokenizer
- customize_tokenizer(nlp_1)
- doc_1b = nlp_1(test_string)
- result_1b = [token.text for token in doc_1b]
- # Save and Reload
- with make_tempdir() as model_dir:
- nlp_1.to_disk(model_dir)
- nlp_2 = load_model(model_dir)
- # This should be the modified tokenizer
- doc_2 = nlp_2(test_string)
- result_2 = [token.text for token in doc_2]
- assert result_1b == result_2
-
-
-def test_issue4267():
- """Test that running an entity_ruler after ner gives consistent results"""
- nlp = English()
- ner = nlp.add_pipe("ner")
- ner.add_label("PEOPLE")
- nlp.initialize()
- assert "ner" in nlp.pipe_names
- # assert that we have correct IOB annotations
- doc1 = nlp("hi")
- assert doc1.has_annotation("ENT_IOB")
- for token in doc1:
- assert token.ent_iob == 2
- # add entity ruler and run again
- patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
- ruler = nlp.add_pipe("entity_ruler")
- ruler.add_patterns(patterns)
- assert "entity_ruler" in nlp.pipe_names
- assert "ner" in nlp.pipe_names
- # assert that we still have correct IOB annotations
- doc2 = nlp("hi")
- assert doc2.has_annotation("ENT_IOB")
- for token in doc2:
- assert token.ent_iob == 2
-
-
-@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
-def test_issue4272():
- """Test that lookup table can be accessed from Token.lemma if no POS tags
- are available."""
- nlp = Greek()
- doc = nlp("Χθες")
- assert doc[0].lemma_
-
-
-def test_multiple_predictions():
- class DummyPipe(TrainablePipe):
- def __init__(self):
- self.model = "dummy_model"
-
- def predict(self, docs):
- return ([1, 2, 3], [4, 5, 6])
-
- def set_annotations(self, docs, scores):
- return docs
-
- nlp = Language()
- doc = nlp.make_doc("foo")
- dummy_pipe = DummyPipe()
- dummy_pipe(doc)
-
-
-def test_issue4313():
- """This should not crash or exit with some strange error code"""
- beam_width = 16
- beam_density = 0.0001
- nlp = English()
- config = {
- "beam_width": beam_width,
- "beam_density": beam_density,
- }
- ner = nlp.add_pipe("beam_ner", config=config)
- ner.add_label("SOME_LABEL")
- nlp.initialize()
- # add a new label to the doc
- doc = nlp("What do you think about Apple ?")
- assert len(ner.labels) == 1
- assert "SOME_LABEL" in ner.labels
- apple_ent = Span(doc, 5, 6, label="MY_ORG")
- doc.ents = list(doc.ents) + [apple_ent]
-
- # ensure the beam_parse still works with the new label
- docs = [doc]
- ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
- assert len(ner.labels) == 2
- assert "MY_ORG" in ner.labels
-
-
-def test_issue4348():
- """Test that training the tagger with empty data, doesn't throw errors"""
- nlp = English()
- example = Example.from_dict(nlp.make_doc(""), {"tags": []})
- TRAIN_DATA = [example, example]
- tagger = nlp.add_pipe("tagger")
- tagger.add_label("A")
- optimizer = nlp.initialize()
- for i in range(5):
- losses = {}
- batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
- for batch in batches:
- nlp.update(batch, sgd=optimizer, losses=losses)
-
-
-def test_issue4367():
- """Test that docbin init goes well"""
- DocBin()
- DocBin(attrs=["LEMMA"])
- DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
-
-
-def test_issue4373():
- """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
- matcher = Matcher(Vocab())
- assert isinstance(matcher.vocab, Vocab)
- matcher = PhraseMatcher(Vocab())
- assert isinstance(matcher.vocab, Vocab)
-
-
-def test_issue4402():
- json_data = {
- "id": 0,
- "paragraphs": [
- {
- "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
- "sentences": [
- {
- "tokens": [
- {"id": 0, "orth": "How", "ner": "O"},
- {"id": 1, "orth": "should", "ner": "O"},
- {"id": 2, "orth": "I", "ner": "O"},
- {"id": 3, "orth": "cook", "ner": "O"},
- {"id": 4, "orth": "bacon", "ner": "O"},
- {"id": 5, "orth": "in", "ner": "O"},
- {"id": 6, "orth": "an", "ner": "O"},
- {"id": 7, "orth": "oven", "ner": "O"},
- {"id": 8, "orth": "?", "ner": "O"},
- ],
- "brackets": [],
- },
- {
- "tokens": [
- {"id": 9, "orth": "\n", "ner": "O"},
- {"id": 10, "orth": "I", "ner": "O"},
- {"id": 11, "orth": "'ve", "ner": "O"},
- {"id": 12, "orth": "heard", "ner": "O"},
- {"id": 13, "orth": "of", "ner": "O"},
- {"id": 14, "orth": "people", "ner": "O"},
- {"id": 15, "orth": "cooking", "ner": "O"},
- {"id": 16, "orth": "bacon", "ner": "O"},
- {"id": 17, "orth": "in", "ner": "O"},
- {"id": 18, "orth": "an", "ner": "O"},
- {"id": 19, "orth": "oven", "ner": "O"},
- {"id": 20, "orth": ".", "ner": "O"},
- ],
- "brackets": [],
- },
- ],
- "cats": [
- {"label": "baking", "value": 1.0},
- {"label": "not_baking", "value": 0.0},
- ],
- },
- {
- "raw": "What is the difference between white and brown eggs?\n",
- "sentences": [
- {
- "tokens": [
- {"id": 0, "orth": "What", "ner": "O"},
- {"id": 1, "orth": "is", "ner": "O"},
- {"id": 2, "orth": "the", "ner": "O"},
- {"id": 3, "orth": "difference", "ner": "O"},
- {"id": 4, "orth": "between", "ner": "O"},
- {"id": 5, "orth": "white", "ner": "O"},
- {"id": 6, "orth": "and", "ner": "O"},
- {"id": 7, "orth": "brown", "ner": "O"},
- {"id": 8, "orth": "eggs", "ner": "O"},
- {"id": 9, "orth": "?", "ner": "O"},
- ],
- "brackets": [],
- },
- {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
- ],
- "cats": [
- {"label": "baking", "value": 0.0},
- {"label": "not_baking", "value": 1.0},
- ],
- },
- ],
- }
- nlp = English()
- attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
- with make_tempdir() as tmpdir:
- output_file = tmpdir / "test4402.spacy"
- docs = json_to_docs([json_data])
- data = DocBin(docs=docs, attrs=attrs).to_bytes()
- with output_file.open("wb") as file_:
- file_.write(data)
- reader = Corpus(output_file)
- train_data = list(reader(nlp))
- assert len(train_data) == 2
-
- split_train_data = []
- for eg in train_data:
- split_train_data.extend(eg.split_sents())
- assert len(split_train_data) == 4
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
deleted file mode 100644
index effd67306..000000000
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ /dev/null
@@ -1,255 +0,0 @@
-import pytest
-from spacy.tokens import Doc, Span, DocBin
-from spacy.training import Example
-from spacy.training.converters.conllu_to_docs import conllu_to_docs
-from spacy.lang.en import English
-from spacy.kb import KnowledgeBase
-from spacy.vocab import Vocab
-from spacy.language import Language
-from spacy.util import ensure_path, load_model_from_path
-import numpy
-import pickle
-from thinc.api import NumpyOps, get_current_ops
-
-from ..util import make_tempdir
-
-
-def test_issue4528(en_vocab):
- """Test that user_data is correctly serialized in DocBin."""
- doc = Doc(en_vocab, words=["hello", "world"])
- doc.user_data["foo"] = "bar"
- # This is how extension attribute values are stored in the user data
- doc.user_data[("._.", "foo", None, None)] = "bar"
- doc_bin = DocBin(store_user_data=True)
- doc_bin.add(doc)
- doc_bin_bytes = doc_bin.to_bytes()
- new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
- new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
- assert new_doc.user_data["foo"] == "bar"
- assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
-
-
-@pytest.mark.parametrize(
- "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
-)
-def test_gold_misaligned(en_tokenizer, text, words):
- doc = en_tokenizer(text)
- Example.from_dict(doc, {"words": words})
-
-
-def test_issue4651_with_phrase_matcher_attr():
- """Test that the EntityRuler PhraseMatcher is deserialized correctly using
- the method from_disk when the EntityRuler argument phrase_matcher_attr is
- specified.
- """
- text = "Spacy is a python library for nlp"
- nlp = English()
- patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
- ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
- ruler.add_patterns(patterns)
- doc = nlp(text)
- res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
- nlp_reloaded = English()
- with make_tempdir() as d:
- file_path = d / "entityruler"
- ruler.to_disk(file_path)
- nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
- doc_reloaded = nlp_reloaded(text)
- res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
- assert res == res_reloaded
-
-
-def test_issue4651_without_phrase_matcher_attr():
- """Test that the EntityRuler PhraseMatcher is deserialized correctly using
- the method from_disk when the EntityRuler argument phrase_matcher_attr is
- not specified.
- """
- text = "Spacy is a python library for nlp"
- nlp = English()
- patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
- ruler = nlp.add_pipe("entity_ruler")
- ruler.add_patterns(patterns)
- doc = nlp(text)
- res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
- nlp_reloaded = English()
- with make_tempdir() as d:
- file_path = d / "entityruler"
- ruler.to_disk(file_path)
- nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
- doc_reloaded = nlp_reloaded(text)
- res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
- assert res == res_reloaded
-
-
-def test_issue4665():
- """
- conllu_to_docs should not raise an exception if the HEAD column contains an
- underscore
- """
- input_data = """
-1 [ _ PUNCT -LRB- _ _ punct _ _
-2 This _ DET DT _ _ det _ _
-3 killing _ NOUN NN _ _ nsubj _ _
-4 of _ ADP IN _ _ case _ _
-5 a _ DET DT _ _ det _ _
-6 respected _ ADJ JJ _ _ amod _ _
-7 cleric _ NOUN NN _ _ nmod _ _
-8 will _ AUX MD _ _ aux _ _
-9 be _ AUX VB _ _ aux _ _
-10 causing _ VERB VBG _ _ root _ _
-11 us _ PRON PRP _ _ iobj _ _
-12 trouble _ NOUN NN _ _ dobj _ _
-13 for _ ADP IN _ _ case _ _
-14 years _ NOUN NNS _ _ nmod _ _
-15 to _ PART TO _ _ mark _ _
-16 come _ VERB VB _ _ acl _ _
-17 . _ PUNCT . _ _ punct _ _
-18 ] _ PUNCT -RRB- _ _ punct _ _
-"""
- conllu_to_docs(input_data)
-
-
-def test_issue4674():
- """Test that setting entities with overlapping identifiers does not mess up IO"""
- nlp = English()
- kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
- vector1 = [0.9, 1.1, 1.01]
- vector2 = [1.8, 2.25, 2.01]
- with pytest.warns(UserWarning):
- kb.set_entities(
- entity_list=["Q1", "Q1"],
- freq_list=[32, 111],
- vector_list=[vector1, vector2],
- )
- assert kb.get_size_entities() == 1
- # dumping to file & loading back in
- with make_tempdir() as d:
- dir_path = ensure_path(d)
- if not dir_path.exists():
- dir_path.mkdir()
- file_path = dir_path / "kb"
- kb.to_disk(str(file_path))
- kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
- kb2.from_disk(str(file_path))
- assert kb2.get_size_entities() == 1
-
-
-@pytest.mark.skip(reason="API change: disable just disables, new exclude arg")
-def test_issue4707():
- """Tests that disabled component names are also excluded from nlp.from_disk
- by default when loading a model.
- """
- nlp = English()
- nlp.add_pipe("sentencizer")
- nlp.add_pipe("entity_ruler")
- assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
- exclude = ["tokenizer", "sentencizer"]
- with make_tempdir() as tmpdir:
- nlp.to_disk(tmpdir, exclude=exclude)
- new_nlp = load_model_from_path(tmpdir, disable=exclude)
- assert "sentencizer" not in new_nlp.pipe_names
- assert "entity_ruler" in new_nlp.pipe_names
-
-
-def test_issue4725_1():
- """Ensure the pickling of the NER goes well"""
- vocab = Vocab(vectors_name="test_vocab_add_vector")
- nlp = English(vocab=vocab)
- config = {
- "update_with_oracle_cut_size": 111,
- }
- ner = nlp.create_pipe("ner", config=config)
- with make_tempdir() as tmp_path:
- with (tmp_path / "ner.pkl").open("wb") as file_:
- pickle.dump(ner, file_)
- assert ner.cfg["update_with_oracle_cut_size"] == 111
-
- with (tmp_path / "ner.pkl").open("rb") as file_:
- ner2 = pickle.load(file_)
- assert ner2.cfg["update_with_oracle_cut_size"] == 111
-
-
-def test_issue4725_2():
- if isinstance(get_current_ops(), NumpyOps):
- # ensures that this runs correctly and doesn't hang or crash because of the global vectors
- # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
- # or because of issues with pickling the NER (cf test_issue4725_1)
- vocab = Vocab(vectors_name="test_vocab_add_vector")
- data = numpy.ndarray((5, 3), dtype="f")
- data[0] = 1.0
- data[1] = 2.0
- vocab.set_vector("cat", data[0])
- vocab.set_vector("dog", data[1])
- nlp = English(vocab=vocab)
- nlp.add_pipe("ner")
- nlp.initialize()
- docs = ["Kurt is in London."] * 10
- for _ in nlp.pipe(docs, batch_size=2, n_process=2):
- pass
-
-
-def test_issue4849():
- nlp = English()
- patterns = [
- {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
- {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
- ]
- ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
- ruler.add_patterns(patterns)
- text = """
- The left is starting to take aim at Democratic front-runner Joe Biden.
- Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
- """
- # USING 1 PROCESS
- count_ents = 0
- for doc in nlp.pipe([text], n_process=1):
- count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
- assert count_ents == 2
- # USING 2 PROCESSES
- if isinstance(get_current_ops(), NumpyOps):
- count_ents = 0
- for doc in nlp.pipe([text], n_process=2):
- count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
- assert count_ents == 2
-
-
-@Language.factory("my_pipe")
-class CustomPipe:
- def __init__(self, nlp, name="my_pipe"):
- self.name = name
- Span.set_extension("my_ext", getter=self._get_my_ext)
- Doc.set_extension("my_ext", default=None)
-
- def __call__(self, doc):
- gathered_ext = []
- for sent in doc.sents:
- sent_ext = self._get_my_ext(sent)
- sent._.set("my_ext", sent_ext)
- gathered_ext.append(sent_ext)
-
- doc._.set("my_ext", "\n".join(gathered_ext))
- return doc
-
- @staticmethod
- def _get_my_ext(span):
- return str(span.end)
-
-
-def test_issue4903():
- """Ensure that this runs correctly and doesn't hang or crash on Windows /
- macOS."""
- nlp = English()
- nlp.add_pipe("sentencizer")
- nlp.add_pipe("my_pipe", after="sentencizer")
- text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
- if isinstance(get_current_ops(), NumpyOps):
- docs = list(nlp.pipe(text, n_process=2))
- assert docs[0].text == "I like bananas."
- assert docs[1].text == "Do you like them?"
- assert docs[2].text == "No, I prefer wasabi."
-
-
-def test_issue4924():
- nlp = Language()
- example = Example.from_dict(nlp.make_doc(""), {})
- nlp.evaluate([example])
diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py
deleted file mode 100644
index 9eefef2e5..000000000
--- a/spacy/tests/regression/test_issue5001-5500.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import numpy
-from spacy.tokens import Doc, DocBin
-from spacy.attrs import DEP, POS, TAG
-from spacy.lang.en import English
-from spacy.language import Language
-from spacy.lang.en.syntax_iterators import noun_chunks
-from spacy.vocab import Vocab
-import spacy
-from thinc.api import get_current_ops
-import pytest
-
-from ...util import make_tempdir
-
-
-def test_issue5048(en_vocab):
- words = ["This", "is", "a", "sentence"]
- pos_s = ["DET", "VERB", "DET", "NOUN"]
- spaces = [" ", " ", " ", ""]
- deps_s = ["dep", "adj", "nn", "atm"]
- tags_s = ["DT", "VBZ", "DT", "NN"]
- strings = en_vocab.strings
- for w in words:
- strings.add(w)
- deps = [strings.add(d) for d in deps_s]
- pos = [strings.add(p) for p in pos_s]
- tags = [strings.add(t) for t in tags_s]
- attrs = [POS, DEP, TAG]
- array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
- doc = Doc(en_vocab, words=words, spaces=spaces)
- doc.from_array(attrs, array)
- v1 = [(token.text, token.pos_, token.tag_) for token in doc]
- doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
- v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
- assert v1 == v2
-
-
-def test_issue5082():
- # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
- nlp = English()
- vocab = nlp.vocab
- array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32)
- array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32)
- array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32)
- array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32)
- array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32)
- vocab.set_vector("I", array1)
- vocab.set_vector("like", array2)
- vocab.set_vector("David", array3)
- vocab.set_vector("Bowie", array4)
- text = "I like David Bowie"
- patterns = [
- {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
- ]
- ruler = nlp.add_pipe("entity_ruler")
- ruler.add_patterns(patterns)
- parsed_vectors_1 = [t.vector for t in nlp(text)]
- assert len(parsed_vectors_1) == 4
- ops = get_current_ops()
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1)
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2)
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3)
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4)
- nlp.add_pipe("merge_entities")
- parsed_vectors_2 = [t.vector for t in nlp(text)]
- assert len(parsed_vectors_2) == 3
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1)
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2)
- numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)
-
-
-def test_issue5137():
- @Language.factory("my_component")
- class MyComponent:
- def __init__(self, nlp, name="my_component", categories="all_categories"):
- self.nlp = nlp
- self.categories = categories
- self.name = name
-
- def __call__(self, doc):
- pass
-
- def to_disk(self, path, **kwargs):
- pass
-
- def from_disk(self, path, **cfg):
- pass
-
- nlp = English()
- my_component = nlp.add_pipe("my_component")
- assert my_component.categories == "all_categories"
- with make_tempdir() as tmpdir:
- nlp.to_disk(tmpdir)
- overrides = {"components": {"my_component": {"categories": "my_categories"}}}
- nlp2 = spacy.load(tmpdir, config=overrides)
- assert nlp2.get_pipe("my_component").categories == "my_categories"
-
-
-def test_issue5141(en_vocab):
- """Ensure an empty DocBin does not crash on serialization"""
- doc_bin = DocBin(attrs=["DEP", "HEAD"])
- assert list(doc_bin.get_docs(en_vocab)) == []
- doc_bin_bytes = doc_bin.to_bytes()
- doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
- assert list(doc_bin_2.get_docs(en_vocab)) == []
-
-
-def test_issue5152():
- # Test that comparing a Span and a Token works correctly
- # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
- nlp = English()
- text = nlp("Talk about being boring!")
- text_var = nlp("Talk of being boring!")
- y = nlp("Let")
- span = text[0:3] # Talk about being
- span_2 = text[0:3] # Talk about being
- span_3 = text_var[0:3] # Talk of being
- token = y[0] # Let
- with pytest.warns(UserWarning):
- assert span.similarity(token) == 0.0
- assert span.similarity(span_2) == 1.0
- with pytest.warns(UserWarning):
- assert span_2.similarity(span_3) < 1.0
-
-
-def test_issue5458():
- # Test that the noun chunker does not generate overlapping spans
- # fmt: off
- words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
- vocab = Vocab(strings=words)
- deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
- pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
- heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
- # fmt: on
- en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
- en_doc.noun_chunks_iterator = noun_chunks
-
- # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
- nlp = English()
- merge_nps = nlp.create_pipe("merge_noun_chunks")
- merge_nps(en_doc)
diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py
deleted file mode 100644
index a35de92fa..000000000
--- a/spacy/tests/regression/test_issue5501-6000.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import pytest
-from numpy.testing import assert_almost_equal
-from thinc.api import Config, fix_random_seed, get_current_ops
-
-from spacy.lang.en import English
-from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config
-from spacy.pipeline.textcat import single_label_cnn_config
-from spacy.pipeline.textcat_multilabel import multi_label_default_config
-from spacy.pipeline.textcat_multilabel import multi_label_bow_config
-from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
-from spacy.tokens import Span
-from spacy import displacy
-from spacy.pipeline import merge_entities
-from spacy.training import Example
-
-
-@pytest.mark.parametrize(
- "textcat_config",
- [
- single_label_default_config,
- single_label_bow_config,
- single_label_cnn_config,
- multi_label_default_config,
- multi_label_bow_config,
- multi_label_cnn_config,
- ],
-)
-def test_issue5551(textcat_config):
- """Test that after fixing the random seed, the results of the pipeline are truly identical"""
- component = "textcat"
-
- pipe_cfg = Config().from_str(textcat_config)
- results = []
- for i in range(3):
- fix_random_seed(0)
- nlp = English()
- text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
- annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
- pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
- for label in set(annots["cats"]):
- pipe.add_label(label)
- # Train
- nlp.initialize()
- doc = nlp.make_doc(text)
- nlp.update([Example.from_dict(doc, annots)])
- # Store the result of each iteration
- result = pipe.model.predict([doc])
- results.append(result[0])
- # All results should be the same because of the fixed seed
- assert len(results) == 3
- ops = get_current_ops()
- assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
- assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
-
-
-def test_issue5838():
- # Displacy's EntityRenderer break line
- # not working after last entity
- sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n"
- nlp = English()
- doc = nlp(sample_text)
- doc.ents = [Span(doc, 7, 8, label="test")]
- html = displacy.render(doc, style="ent")
- found = html.count("")
- assert found == 4
-
-
-def test_issue5918():
- # Test edge case when merging entities.
- nlp = English()
- ruler = nlp.add_pipe("entity_ruler")
- patterns = [
- {"label": "ORG", "pattern": "Digicon Inc"},
- {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
- {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"},
- ]
- ruler.add_patterns(patterns)
-
- text = """
- Digicon Inc said it has completed the previously-announced disposition
- of its computer systems division to an investment group led by
- Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.
- """
- doc = nlp(text)
- assert len(doc.ents) == 3
- # make it so that the third span's head is within the entity (ent_iob=I)
- # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
- # TODO: test for logging here
- # with pytest.warns(UserWarning):
- # doc[29].head = doc[33]
- doc = merge_entities(doc)
- assert len(doc.ents) == 3
diff --git a/spacy/tests/regression/test_issue6001-6500.py b/spacy/tests/regression/test_issue6001-6500.py
deleted file mode 100644
index 470b2f388..000000000
--- a/spacy/tests/regression/test_issue6001-6500.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from spacy.util import filter_spans
-from pydantic import ValidationError
-from spacy.schemas import TokenPattern, TokenPatternSchema
-import pytest
-
-
-def test_issue6207(en_tokenizer):
- doc = en_tokenizer("zero one two three four five six")
-
- # Make spans
- s1 = doc[:4]
- s2 = doc[3:6] # overlaps with s1
- s3 = doc[5:7] # overlaps with s2, not s1
-
- result = filter_spans((s1, s2, s3))
- assert s1 in result
- assert s2 not in result
- assert s3 in result
-
-
-def test_issue6258():
- """Test that the non-empty constraint pattern field is respected"""
- # These one is valid
- TokenPatternSchema(pattern=[TokenPattern()])
- # But an empty pattern list should fail to validate
- # based on the schema's constraint
- with pytest.raises(ValidationError):
- TokenPatternSchema(pattern=[])
diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py
deleted file mode 100644
index f57e4085c..000000000
--- a/spacy/tests/regression/test_issue6501-7000.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import pytest
-from spacy.lang.en import English
-import numpy as np
-import spacy
-from spacy.tokens import Doc
-from spacy.matcher import PhraseMatcher
-from spacy.tokens import DocBin
-from spacy.util import load_config_from_str
-from spacy.training import Example
-from spacy.training.initialize import init_nlp
-import pickle
-
-from ..util import make_tempdir
-
-
-def test_issue6730(en_vocab):
- """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
- from spacy.kb import KnowledgeBase
-
- kb = KnowledgeBase(en_vocab, entity_vector_length=3)
- kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
-
- with pytest.raises(ValueError):
- kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
- assert kb.contains_alias("") is False
-
- kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
- kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
-
- with make_tempdir() as tmp_dir:
- kb.to_disk(tmp_dir)
- kb.from_disk(tmp_dir)
- assert kb.get_size_aliases() == 2
- assert set(kb.get_alias_strings()) == {"x", "y"}
-
-
-def test_issue6755(en_tokenizer):
- doc = en_tokenizer("This is a magnificent sentence.")
- span = doc[:0]
- assert span.text_with_ws == ""
- assert span.text == ""
-
-
-@pytest.mark.parametrize(
- "sentence, start_idx,end_idx,label",
- [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
-)
-def test_issue6815_1(sentence, start_idx, end_idx, label):
- nlp = English()
- doc = nlp(sentence)
- span = doc[:].char_span(start_idx, end_idx, label=label)
- assert span.label_ == label
-
-
-@pytest.mark.parametrize(
- "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
-)
-def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
- nlp = English()
- doc = nlp(sentence)
- span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
- assert span.kb_id == kb_id
-
-
-@pytest.mark.parametrize(
- "sentence, start_idx,end_idx,vector",
- [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
-)
-def test_issue6815_3(sentence, start_idx, end_idx, vector):
- nlp = English()
- doc = nlp(sentence)
- span = doc[:].char_span(start_idx, end_idx, vector=vector)
- assert (span.vector == vector).all()
-
-
-def test_issue6839(en_vocab):
- """Ensure that PhraseMatcher accepts Span as input"""
- # fmt: off
- words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
- # fmt: on
- doc = Doc(en_vocab, words=words)
- span = doc[:8]
- pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
- matcher = PhraseMatcher(en_vocab)
- matcher.add("SPACY", [pattern])
- matches = matcher(span)
- assert matches
-
-
-CONFIG_ISSUE_6908 = """
-[paths]
-train = "TRAIN_PLACEHOLDER"
-raw = null
-init_tok2vec = null
-vectors = null
-
-[system]
-seed = 0
-gpu_allocator = null
-
-[nlp]
-lang = "en"
-pipeline = ["textcat"]
-tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-disabled = []
-before_creation = null
-after_creation = null
-after_pipeline_creation = null
-batch_size = 1000
-
-[components]
-
-[components.textcat]
-factory = "TEXTCAT_PLACEHOLDER"
-
-[corpora]
-
-[corpora.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-
-[corpora.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-
-
-[training]
-train_corpus = "corpora.train"
-dev_corpus = "corpora.dev"
-seed = ${system.seed}
-gpu_allocator = ${system.gpu_allocator}
-frozen_components = []
-before_to_disk = null
-
-[pretraining]
-
-[initialize]
-vectors = ${paths.vectors}
-init_tok2vec = ${paths.init_tok2vec}
-vocab_data = null
-lookups = null
-before_init = null
-after_init = null
-
-[initialize.components]
-
-[initialize.components.textcat]
-labels = ['label1', 'label2']
-
-[initialize.tokenizer]
-"""
-
-
-@pytest.mark.parametrize(
- "component_name",
- ["textcat", "textcat_multilabel"],
-)
-def test_issue6908(component_name):
- """Test intializing textcat with labels in a list"""
-
- def create_data(out_file):
- nlp = spacy.blank("en")
- doc = nlp.make_doc("Some text")
- doc.cats = {"label1": 0, "label2": 1}
- out_data = DocBin(docs=[doc]).to_bytes()
- with out_file.open("wb") as file_:
- file_.write(out_data)
-
- with make_tempdir() as tmp_path:
- train_path = tmp_path / "train.spacy"
- create_data(train_path)
- config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
- config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
- config = load_config_from_str(config_str)
- init_nlp(config)
-
-
-CONFIG_ISSUE_6950 = """
-[nlp]
-lang = "en"
-pipeline = ["tok2vec", "tagger"]
-
-[components]
-
-[components.tok2vec]
-factory = "tok2vec"
-
-[components.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
-attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
-rows = [5000,2500,2500,2500]
-include_static_vectors = false
-
-[components.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = 96
-depth = 4
-window_size = 1
-maxout_pieces = 3
-
-[components.ner]
-factory = "ner"
-
-[components.tagger]
-factory = "tagger"
-
-[components.tagger.model]
-@architectures = "spacy.Tagger.v1"
-nO = null
-
-[components.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-upstream = "*"
-"""
-
-
-def test_issue6950():
- """Test that the nlp object with initialized tok2vec with listeners pickles
- correctly (and doesn't have lambdas).
- """
- nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
- nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
- pickle.dumps(nlp)
- nlp("hello")
- pickle.dumps(nlp)
diff --git a/spacy/tests/regression/test_issue7019.py b/spacy/tests/regression/test_issue7019.py
deleted file mode 100644
index 53958b594..000000000
--- a/spacy/tests/regression/test_issue7019.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type
-from wasabi import msg
-
-
-def test_issue7019():
- scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None}
- print_textcats_auc_per_cat(msg, scores)
- scores = {
- "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932},
- "LABEL_B": {"p": None, "r": None, "f": None},
- }
- print_prf_per_type(msg, scores, name="foo", type="bar")
diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py
deleted file mode 100644
index 8435b32e1..000000000
--- a/spacy/tests/regression/test_issue7029.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from spacy.lang.en import English
-from spacy.training import Example
-from spacy.util import load_config_from_str
-
-
-CONFIG = """
-[nlp]
-lang = "en"
-pipeline = ["tok2vec", "tagger"]
-
-[components]
-
-[components.tok2vec]
-factory = "tok2vec"
-
-[components.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
-attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
-rows = [5000,2500,2500,2500]
-include_static_vectors = false
-
-[components.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = 96
-depth = 4
-window_size = 1
-maxout_pieces = 3
-
-[components.tagger]
-factory = "tagger"
-
-[components.tagger.model]
-@architectures = "spacy.Tagger.v1"
-nO = null
-
-[components.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-upstream = "*"
-"""
-
-
-TRAIN_DATA = [
- ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
- ("Eat blue ham", {"tags": ["V", "J", "N"]}),
-]
-
-
-def test_issue7029():
- """Test that an empty document doesn't mess up an entire batch."""
- nlp = English.from_config(load_config_from_str(CONFIG))
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
- optimizer = nlp.initialize(get_examples=lambda: train_examples)
- for i in range(50):
- losses = {}
- nlp.update(train_examples, sgd=optimizer, losses=losses)
- texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
- docs1 = list(nlp.pipe(texts, batch_size=1))
- docs2 = list(nlp.pipe(texts, batch_size=4))
- assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
diff --git a/spacy/tests/regression/test_issue7055.py b/spacy/tests/regression/test_issue7055.py
deleted file mode 100644
index c7ddb0a75..000000000
--- a/spacy/tests/regression/test_issue7055.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from spacy.cli.init_config import fill_config
-from spacy.util import load_config
-from spacy.lang.en import English
-from thinc.api import Config
-
-from ..util import make_tempdir
-
-
-def test_issue7055():
- """Test that fill-config doesn't turn sourced components into factories."""
- source_cfg = {
- "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]},
- "components": {
- "tok2vec": {"factory": "tok2vec"},
- "tagger": {"factory": "tagger"},
- },
- }
- source_nlp = English.from_config(source_cfg)
- with make_tempdir() as dir_path:
- # We need to create a loadable source pipeline
- source_path = dir_path / "test_model"
- source_nlp.to_disk(source_path)
- base_cfg = {
- "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
- "components": {
- "tok2vec": {"source": str(source_path)},
- "tagger": {"source": str(source_path)},
- "ner": {"factory": "ner"},
- },
- }
- base_cfg = Config(base_cfg)
- base_path = dir_path / "base.cfg"
- base_cfg.to_disk(base_path)
- output_path = dir_path / "config.cfg"
- fill_config(output_path, base_path, silent=True)
- filled_cfg = load_config(output_path)
- assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path)
- assert filled_cfg["components"]["tagger"]["source"] == str(source_path)
- assert filled_cfg["components"]["ner"]["factory"] == "ner"
- assert "model" in filled_cfg["components"]["ner"]
diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py
deleted file mode 100644
index e94a975d4..000000000
--- a/spacy/tests/regression/test_issue7056.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from spacy.tokens.doc import Doc
-from spacy.vocab import Vocab
-from spacy.pipeline._parser_internals.arc_eager import ArcEager
-
-
-def test_issue7056():
- """Test that the Unshift transition works properly, and doesn't cause
- sentence segmentation errors."""
- vocab = Vocab()
- ae = ArcEager(
- vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
- )
- doc = Doc(vocab, words="Severe pain , after trauma".split())
- state = ae.init_batch([doc])[0]
- ae.apply_transition(state, "S")
- ae.apply_transition(state, "L-amod")
- ae.apply_transition(state, "S")
- ae.apply_transition(state, "S")
- ae.apply_transition(state, "S")
- ae.apply_transition(state, "R-pobj")
- ae.apply_transition(state, "D")
- ae.apply_transition(state, "D")
- ae.apply_transition(state, "D")
- assert not state.eol()
diff --git a/spacy/tests/regression/test_issue7062.py b/spacy/tests/regression/test_issue7062.py
deleted file mode 100644
index 66bf09523..000000000
--- a/spacy/tests/regression/test_issue7062.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from spacy.kb import KnowledgeBase
-from spacy.training import Example
-from spacy.lang.en import English
-
-
-# fmt: off
-TRAIN_DATA = [
- ("Russ Cochran his reprints include EC Comics.",
- {"links": {(0, 12): {"Q2146908": 1.0}},
- "entities": [(0, 12, "PERSON")],
- "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]})
-]
-# fmt: on
-
-
-def test_partial_links():
- # Test that having some entities on the doc without gold links, doesn't crash
- nlp = English()
- vector_length = 3
- train_examples = []
- for text, annotation in TRAIN_DATA:
- doc = nlp(text)
- train_examples.append(Example.from_dict(doc, annotation))
-
- def create_kb(vocab):
- # create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
- mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
- mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
- return mykb
-
- # Create and train the Entity Linker
- entity_linker = nlp.add_pipe("entity_linker", last=True)
- entity_linker.set_kb(create_kb)
- optimizer = nlp.initialize(get_examples=lambda: train_examples)
- for i in range(2):
- losses = {}
- nlp.update(train_examples, sgd=optimizer, losses=losses)
-
- # adding additional components that are required for the entity_linker
- nlp.add_pipe("sentencizer", first=True)
- patterns = [
- {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
- {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
- ]
- ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
- ruler.add_patterns(patterns)
-
- # this will run the pipeline on the examples and shouldn't crash
- results = nlp.evaluate(train_examples)
- assert "PERSON" in results["ents_per_type"]
- assert "PERSON" in results["nel_f_per_type"]
- assert "ORG" in results["ents_per_type"]
- assert "ORG" not in results["nel_f_per_type"]
diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py
deleted file mode 100644
index d40763c63..000000000
--- a/spacy/tests/regression/test_issue7065.py
+++ /dev/null
@@ -1,97 +0,0 @@
-from spacy.kb import KnowledgeBase
-from spacy.lang.en import English
-from spacy.training import Example
-
-
-def test_issue7065():
- text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
- nlp = English()
- nlp.add_pipe("sentencizer")
- ruler = nlp.add_pipe("entity_ruler")
- patterns = [
- {
- "label": "THING",
- "pattern": [
- {"LOWER": "symphony"},
- {"LOWER": "no"},
- {"LOWER": "."},
- {"LOWER": "8"},
- ],
- }
- ]
- ruler.add_patterns(patterns)
-
- doc = nlp(text)
- sentences = [s for s in doc.sents]
- assert len(sentences) == 2
- sent0 = sentences[0]
- ent = doc.ents[0]
- assert ent.start < sent0.end < ent.end
- assert sentences.index(ent.sent) == 0
-
-
-def test_issue7065_b():
- # Test that the NEL doesn't crash when an entity crosses a sentence boundary
- nlp = English()
- vector_length = 3
- nlp.add_pipe("sentencizer")
-
- text = "Mahler 's Symphony No. 8 was beautiful."
- entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
- links = {
- (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
- (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
- }
- sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
- doc = nlp(text)
- example = Example.from_dict(
- doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
- )
- train_examples = [example]
-
- def create_kb(vocab):
- # create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
- mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
- mykb.add_alias(
- alias="No. 8",
- entities=["Q270853"],
- probabilities=[1.0],
- )
- mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
- mykb.add_alias(
- alias="Mahler",
- entities=["Q7304"],
- probabilities=[1.0],
- )
- return mykb
-
- # Create the Entity Linker component and add it to the pipeline
- entity_linker = nlp.add_pipe("entity_linker", last=True)
- entity_linker.set_kb(create_kb)
-
- # train the NEL pipe
- optimizer = nlp.initialize(get_examples=lambda: train_examples)
- for i in range(2):
- losses = {}
- nlp.update(train_examples, sgd=optimizer, losses=losses)
-
- # Add a custom rule-based component to mimick NER
- patterns = [
- {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
- {
- "label": "WORK",
- "pattern": [
- {"LOWER": "symphony"},
- {"LOWER": "no"},
- {"LOWER": "."},
- {"LOWER": "8"},
- ],
- },
- ]
- ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
- ruler.add_patterns(patterns)
-
- # test the trained model - this should not throw E148
- doc = nlp(text)
- assert doc
diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py
deleted file mode 100644
index fbddf643c..000000000
--- a/spacy/tests/regression/test_issue8168.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from spacy.lang.en import English
-
-
-def test_issue8168():
- nlp = English()
- ruler = nlp.add_pipe("entity_ruler")
- patterns = [
- {"label": "ORG", "pattern": "Apple"},
- {
- "label": "GPE",
- "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
- "id": "san-francisco",
- },
- {
- "label": "GPE",
- "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}],
- "id": "san-francisco",
- },
- ]
- ruler.add_patterns(patterns)
-
- assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")}
diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py
deleted file mode 100644
index 6ddbe53e0..000000000
--- a/spacy/tests/regression/test_issue8190.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import spacy
-from spacy.lang.en import English
-from ..util import make_tempdir
-
-
-def test_issue8190():
- """Test that config overrides are not lost after load is complete."""
- source_cfg = {
- "nlp": {
- "lang": "en",
- },
- "custom": {"key": "value"},
- }
- source_nlp = English.from_config(source_cfg)
- with make_tempdir() as dir_path:
- # We need to create a loadable source pipeline
- source_path = dir_path / "test_model"
- source_nlp.to_disk(source_path)
- nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}})
-
- assert nlp.config["custom"]["key"] == "updated_value"
diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py
deleted file mode 100644
index 00cd6da3b..000000000
--- a/spacy/tests/regression/test_issue8216.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import pytest
-
-from spacy import registry
-from spacy.language import Language
-
-
-@pytest.fixture
-def nlp():
- return Language()
-
-
-@pytest.fixture
-@registry.misc("entity_ruler_patterns")
-def patterns():
- return [
- {"label": "HELLO", "pattern": "hello world"},
- {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
- {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
- {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
- {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
- {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
- ]
-
-
-def test_entity_ruler_fix8216(nlp, patterns):
- """Test that patterns don't get added excessively."""
- ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
- ruler.add_patterns(patterns)
- pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
- assert pattern_count > 0
- ruler.add_patterns([])
- after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
- assert after_count == pattern_count
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 102989705..1d50fd1d1 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -1,20 +1,17 @@
import pytest
-from thinc.api import Config, ConfigValidationError
-import spacy
-from spacy.lang.en import English
-from spacy.lang.de import German
-from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.util import (
- registry,
- load_model_from_config,
- load_config,
- load_config_from_str,
-)
-from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
-from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
-from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from catalogue import RegistryError
+from thinc.api import Config, ConfigValidationError
+import spacy
+from spacy.lang.de import German
+from spacy.lang.en import English
+from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.language import Language
+from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
+from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from spacy.util import load_config, load_config_from_str
+from spacy.util import load_model_from_config, registry
from ..util import make_tempdir
@@ -187,6 +184,25 @@ def my_parser():
return parser
+@pytest.mark.issue(8190)
+def test_issue8190():
+ """Test that config overrides are not lost after load is complete."""
+ source_cfg = {
+ "nlp": {
+ "lang": "en",
+ },
+ "custom": {"key": "value"},
+ }
+ source_nlp = English.from_config(source_cfg)
+ with make_tempdir() as dir_path:
+ # We need to create a loadable source pipeline
+ source_path = dir_path / "test_model"
+ source_nlp.to_disk(source_path)
+ nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}})
+
+ assert nlp.config["custom"]["key"] == "updated_value"
+
+
def test_create_nlp_from_config():
config = Config().from_str(nlp_config_string)
with pytest.raises(ConfigValidationError):
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index e51c7f45b..15bf67bfd 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -1,13 +1,168 @@
-import pytest
-from spacy.tokens.doc import Underscore
+import copy
+import pickle
-import spacy
+import numpy
+import pytest
+
+from spacy.attrs import DEP, HEAD
from spacy.lang.en import English
-from spacy.tokens import Doc, DocBin
+from spacy.language import Language
+from spacy.matcher import Matcher, PhraseMatcher
+from spacy.tokens import Doc
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
from ..util import make_tempdir
+@pytest.mark.issue(1727)
+def test_issue1727():
+ """Test that models with no pretrained vectors can be deserialized
+ correctly after vectors are added."""
+ nlp = Language(Vocab())
+ data = numpy.ones((3, 300), dtype="f")
+ vectors = Vectors(data=data, keys=["I", "am", "Matt"])
+ tagger = nlp.create_pipe("tagger")
+ tagger.add_label("PRP")
+ assert tagger.cfg.get("pretrained_dims", 0) == 0
+ tagger.vocab.vectors = vectors
+ with make_tempdir() as path:
+ tagger.to_disk(path)
+ tagger = nlp.create_pipe("tagger").from_disk(path)
+ assert tagger.cfg.get("pretrained_dims", 0) == 0
+
+
+@pytest.mark.issue(1799)
+def test_issue1799():
+ """Test sentence boundaries are deserialized correctly, even for
+ non-projective sentences."""
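+    # Each row is [HEAD, DEP]: HEAD is the relative head offset stored as uint64
+    # (very large values such as 18446744073709551615 are negative offsets that
+    # wrapped around), and DEP holds string-store hashes (presumably "ROOT" for
+    # the long value in the fifth row).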
+ heads_deps = numpy.asarray(
+ [
+ [1, 397],
+ [4, 436],
+ [2, 426],
+ [1, 402],
+ [0, 8206900633647566924],
+ [18446744073709551615, 440],
+ [18446744073709551614, 442],
+ ],
+ dtype="uint64",
+ )
+ doc = Doc(Vocab(), words="Just what I was looking for .".split())
+ doc.vocab.strings.add("ROOT")
+ doc = doc.from_array([HEAD, DEP], heads_deps)
+ assert len(list(doc.sents)) == 1
+
+
+@pytest.mark.issue(1834)
+def test_issue1834():
+ """Test that sentence boundaries & parse/tag flags are not lost
+ during serialization."""
+ words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
+ doc = Doc(Vocab(), words=words)
+ doc[6].is_sent_start = True
+ new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
+ assert new_doc[6].sent_start
+ assert not new_doc.has_annotation("DEP")
+ assert not new_doc.has_annotation("TAG")
+ doc = Doc(
+ Vocab(),
+ words=words,
+ tags=["TAG"] * len(words),
+ heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
+ deps=["dep"] * len(words),
+ )
+ new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
+ assert new_doc[6].sent_start
+ assert new_doc.has_annotation("DEP")
+ assert new_doc.has_annotation("TAG")
+
+
+@pytest.mark.issue(1883)
+def test_issue1883():
+ matcher = Matcher(Vocab())
+ matcher.add("pat1", [[{"orth": "hello"}]])
+ doc = Doc(matcher.vocab, words=["hello"])
+ assert len(matcher(doc)) == 1
+ new_matcher = copy.deepcopy(matcher)
+ new_doc = Doc(new_matcher.vocab, words=["hello"])
+ assert len(new_matcher(new_doc)) == 1
+
+
+@pytest.mark.issue(2564)
+def test_issue2564():
+ """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
+ nlp = Language()
+ tagger = nlp.add_pipe("tagger")
+ tagger.add_label("A")
+ nlp.initialize()
+ doc = nlp("hello world")
+ assert doc.has_annotation("TAG")
+ docs = nlp.pipe(["hello", "world"])
+ piped_doc = next(docs)
+ assert piped_doc.has_annotation("TAG")
+
+
+@pytest.mark.issue(3248)
+def test_issue3248_2():
+ """Test that the PhraseMatcher can be pickled correctly."""
+ nlp = English()
+ matcher = PhraseMatcher(nlp.vocab)
+ matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
+ matcher.add("TEST2", [nlp("d")])
+ data = pickle.dumps(matcher)
+ new_matcher = pickle.loads(data)
+ assert len(new_matcher) == len(matcher)
+
+
+@pytest.mark.issue(3289)
+def test_issue3289():
+ """Test that Language.to_bytes handles serializing a pipeline component
+ with an uninitialized model."""
+ nlp = English()
+ nlp.add_pipe("textcat")
+ bytes_data = nlp.to_bytes()
+ new_nlp = English()
+ new_nlp.add_pipe("textcat")
+ new_nlp.from_bytes(bytes_data)
+
+
+@pytest.mark.issue(3468)
+def test_issue3468():
+ """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
+ be restored after serialization."""
+ nlp = English()
+ nlp.add_pipe("sentencizer")
+ doc = nlp("Hello world")
+ assert doc[0].is_sent_start
+ assert doc.has_annotation("SENT_START")
+ assert len(list(doc.sents)) == 1
+ doc_bytes = doc.to_bytes()
+ new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
+ assert new_doc[0].is_sent_start
+ assert new_doc.has_annotation("SENT_START")
+ assert len(list(new_doc.sents)) == 1
+
+
+@pytest.mark.issue(3959)
+def test_issue3959():
+ """Ensure that a modified pos attribute is serialized correctly."""
+ nlp = English()
+ doc = nlp(
+ "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
+ )
+ assert doc[0].pos_ == ""
+ doc[0].pos_ = "NOUN"
+ assert doc[0].pos_ == "NOUN"
+    # with a trained pipeline (rather than blank English) the POS annotation is usually already set
+ with make_tempdir() as tmp_dir:
+ file_path = tmp_dir / "my_doc"
+ doc.to_disk(file_path)
+ doc2 = nlp("")
+ doc2.from_disk(file_path)
+ assert doc2[0].pos_ == "NOUN"
+
+
def test_serialize_empty_doc(en_vocab):
doc = Doc(en_vocab)
data = doc.to_bytes()
@@ -61,69 +216,3 @@ def test_serialize_doc_span_groups(en_vocab):
doc.spans["content"] = [doc[0:2]]
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
assert len(new_doc.spans["content"]) == 1
-
-
-def test_serialize_doc_bin():
- doc_bin = DocBin(
- attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True
- )
- texts = ["Some text", "Lots of texts...", "..."]
- cats = {"A": 0.5}
- nlp = English()
- for doc in nlp.pipe(texts):
- doc.cats = cats
- doc.spans["start"] = [doc[0:2]]
- doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
- doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
- doc_bin.add(doc)
- bytes_data = doc_bin.to_bytes()
-
- # Deserialize later, e.g. in a new process
- nlp = spacy.blank("en")
- doc_bin = DocBin().from_bytes(bytes_data)
- reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
- for i, doc in enumerate(reloaded_docs):
- assert doc.text == texts[i]
- assert doc.cats == cats
- assert len(doc.spans) == 1
- assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
- assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
-
-
-def test_serialize_doc_bin_unknown_spaces(en_vocab):
- doc1 = Doc(en_vocab, words=["that", "'s"])
- assert doc1.has_unknown_spaces
- assert doc1.text == "that 's "
- doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
- assert not doc2.has_unknown_spaces
- assert doc2.text == "that's"
-
- doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
- re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
- assert re_doc1.has_unknown_spaces
- assert re_doc1.text == "that 's "
- assert not re_doc2.has_unknown_spaces
- assert re_doc2.text == "that's"
-
-
-@pytest.mark.parametrize(
- "writer_flag,reader_flag,reader_value",
- [
- (True, True, "bar"),
- (True, False, "bar"),
- (False, True, "nothing"),
- (False, False, "nothing"),
- ],
-)
-def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
- """Test that custom extensions are correctly serialized in DocBin."""
- Doc.set_extension("foo", default="nothing")
- doc = Doc(en_vocab, words=["hello", "world"])
- doc._.foo = "bar"
- doc_bin_1 = DocBin(store_user_data=writer_flag)
- doc_bin_1.add(doc)
- doc_bin_bytes = doc_bin_1.to_bytes()
- doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
- doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
- assert doc_2._.foo == reader_value
- Underscore.doc_extensions = {}
diff --git a/spacy/tests/serialize/test_serialize_docbin.py b/spacy/tests/serialize/test_serialize_docbin.py
new file mode 100644
index 000000000..9f8e5e06b
--- /dev/null
+++ b/spacy/tests/serialize/test_serialize_docbin.py
@@ -0,0 +1,106 @@
+import pytest
+
+import spacy
+from spacy.lang.en import English
+from spacy.tokens import Doc, DocBin
+from spacy.tokens.underscore import Underscore
+
+
+@pytest.mark.issue(4367)
+def test_issue4367():
+ """Test that docbin init goes well"""
+ DocBin()
+ DocBin(attrs=["LEMMA"])
+ DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
+
+
+@pytest.mark.issue(4528)
+def test_issue4528(en_vocab):
+ """Test that user_data is correctly serialized in DocBin."""
+ doc = Doc(en_vocab, words=["hello", "world"])
+ doc.user_data["foo"] = "bar"
+ # This is how extension attribute values are stored in the user data
+ doc.user_data[("._.", "foo", None, None)] = "bar"
+ doc_bin = DocBin(store_user_data=True)
+ doc_bin.add(doc)
+ doc_bin_bytes = doc_bin.to_bytes()
+ new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
+ new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
+ assert new_doc.user_data["foo"] == "bar"
+ assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
+
+
+@pytest.mark.issue(5141)
+def test_issue5141(en_vocab):
+ """Ensure an empty DocBin does not crash on serialization"""
+ doc_bin = DocBin(attrs=["DEP", "HEAD"])
+ assert list(doc_bin.get_docs(en_vocab)) == []
+ doc_bin_bytes = doc_bin.to_bytes()
+ doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
+ assert list(doc_bin_2.get_docs(en_vocab)) == []
+
+
+def test_serialize_doc_bin():
+ doc_bin = DocBin(
+ attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True
+ )
+ texts = ["Some text", "Lots of texts...", "..."]
+ cats = {"A": 0.5}
+ nlp = English()
+ for doc in nlp.pipe(texts):
+ doc.cats = cats
+ doc.spans["start"] = [doc[0:2]]
+ doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
+ doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
+ doc_bin.add(doc)
+ bytes_data = doc_bin.to_bytes()
+
+ # Deserialize later, e.g. in a new process
+ nlp = spacy.blank("en")
+ doc_bin = DocBin().from_bytes(bytes_data)
+ reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
+ for i, doc in enumerate(reloaded_docs):
+ assert doc.text == texts[i]
+ assert doc.cats == cats
+ assert len(doc.spans) == 1
+ assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
+ assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+ doc1 = Doc(en_vocab, words=["that", "'s"])
+ assert doc1.has_unknown_spaces
+ assert doc1.text == "that 's "
+ doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+ assert not doc2.has_unknown_spaces
+ assert doc2.text == "that's"
+
+ doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+ re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+ assert re_doc1.has_unknown_spaces
+ assert re_doc1.text == "that 's "
+ assert not re_doc2.has_unknown_spaces
+ assert re_doc2.text == "that's"
+
+
+@pytest.mark.parametrize(
+ "writer_flag,reader_flag,reader_value",
+ [
+ (True, True, "bar"),
+ (True, False, "bar"),
+ (False, True, "nothing"),
+ (False, False, "nothing"),
+ ],
+)
+def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
+ """Test that custom extensions are correctly serialized in DocBin."""
+ Doc.set_extension("foo", default="nothing")
+ doc = Doc(en_vocab, words=["hello", "world"])
+ doc._.foo = "bar"
+ doc_bin_1 = DocBin(store_user_data=writer_flag)
+ doc_bin_1.add(doc)
+ doc_bin_bytes = doc_bin_1.to_bytes()
+ doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
+ doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
+ assert doc_2._.foo == reader_value
+ Underscore.doc_extensions = {}
diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index 05529f9d1..6e7fa0e4e 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -1,8 +1,14 @@
-import pytest
import re
+import pickle
+
+import pytest
from spacy.language import Language
+from spacy.lang.it import Italian
+from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
+from spacy.training import Example
+from spacy.util import load_config_from_str
from ..util import make_tempdir
@@ -21,6 +27,71 @@ def meta_data():
}
+@pytest.mark.issue(2482)
+def test_issue2482():
+ """Test we can serialize and deserialize a blank NER or parser model."""
+ nlp = Italian()
+ nlp.add_pipe("ner")
+ b = nlp.to_bytes()
+ Italian().from_bytes(b)
+
+
+CONFIG_ISSUE_6950 = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.ner]
+factory = "ner"
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+upstream = "*"
+"""
+
+
+@pytest.mark.issue(6950)
+def test_issue6950():
+ """Test that the nlp object with initialized tok2vec with listeners pickles
+ correctly (and doesn't have lambdas).
+ """
+ nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
+ nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
+ pickle.dumps(nlp)
+ nlp("hello")
+ pickle.dumps(nlp)
+
+
def test_serialize_language_meta_disk(meta_data):
language = Language(meta=meta_data)
with make_tempdir() as d:
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 35cc22d24..9fcf18e2d 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,18 +1,25 @@
+import pickle
+
import pytest
-from spacy import registry, Vocab
-from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
+import srsly
+from thinc.api import Linear
+
+import spacy
+from spacy import Vocab, load, registry
+from spacy.lang.en import English
+from spacy.language import Language
+from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler
+from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
+from spacy.pipeline import TrainablePipe
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
-from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
-from spacy.lang.en import English
-from thinc.api import Linear
-import spacy
+from spacy.util import ensure_path, load_model
+from spacy.tokens import Span
from ..util import make_tempdir
-
test_parsers = [DependencyParser, EntityRecognizer]
@@ -58,14 +65,183 @@ def taggers(en_vocab):
return tagger1, tagger2
+@pytest.mark.issue(3456)
+def test_issue3456():
+    # This used to crash because of a padding error in layer.ops.unflatten in thinc
+ nlp = English()
+ tagger = nlp.add_pipe("tagger")
+ tagger.add_label("A")
+ nlp.initialize()
+ list(nlp.pipe(["hi", ""]))
+
+
+@pytest.mark.issue(3526)
+def test_issue_3526_1(en_vocab):
+ patterns = [
+ {"label": "HELLO", "pattern": "hello world"},
+ {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+ {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+ {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+ {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+ ]
+ nlp = Language(vocab=en_vocab)
+ ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+ ruler_bytes = ruler.to_bytes()
+ assert len(ruler) == len(patterns)
+ assert len(ruler.labels) == 4
+ assert ruler.overwrite
+ new_ruler = EntityRuler(nlp)
+ new_ruler = new_ruler.from_bytes(ruler_bytes)
+ assert len(new_ruler) == len(ruler)
+ assert len(new_ruler.labels) == 4
+ assert new_ruler.overwrite == ruler.overwrite
+ assert new_ruler.ent_id_sep == ruler.ent_id_sep
+
+
+@pytest.mark.issue(3526)
+def test_issue_3526_2(en_vocab):
+ patterns = [
+ {"label": "HELLO", "pattern": "hello world"},
+ {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+ {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+ {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+ {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+ ]
+ nlp = Language(vocab=en_vocab)
+ ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
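+    # Simulate the older serialization format: a raw msgpack dump of just the patterns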
+ bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
+ new_ruler = EntityRuler(nlp)
+ new_ruler = new_ruler.from_bytes(bytes_old_style)
+ assert len(new_ruler) == len(ruler)
+ for pattern in ruler.patterns:
+ assert pattern in new_ruler.patterns
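+    # Only the patterns were serialized, so the overwrite flag isn't restored and
+    # falls back to the default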
+ assert new_ruler.overwrite is not ruler.overwrite
+
+
+@pytest.mark.issue(3526)
+def test_issue_3526_3(en_vocab):
+ patterns = [
+ {"label": "HELLO", "pattern": "hello world"},
+ {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+ {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+ {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+ {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+ ]
+ nlp = Language(vocab=en_vocab)
+ ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+ with make_tempdir() as tmpdir:
+ out_file = tmpdir / "entity_ruler"
+ srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
+ new_ruler = EntityRuler(nlp).from_disk(out_file)
+ for pattern in ruler.patterns:
+ assert pattern in new_ruler.patterns
+ assert len(new_ruler) == len(ruler)
+ assert new_ruler.overwrite is not ruler.overwrite
+
+
+@pytest.mark.issue(3526)
+def test_issue_3526_4(en_vocab):
+ nlp = Language(vocab=en_vocab)
+ patterns = [{"label": "ORG", "pattern": "Apple"}]
+ config = {"overwrite_ents": True}
+ ruler = nlp.add_pipe("entity_ruler", config=config)
+ ruler.add_patterns(patterns)
+ with make_tempdir() as tmpdir:
+ nlp.to_disk(tmpdir)
+ ruler = nlp.get_pipe("entity_ruler")
+ assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
+ assert ruler.overwrite is True
+ nlp2 = load(tmpdir)
+ new_ruler = nlp2.get_pipe("entity_ruler")
+ assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
+ assert new_ruler.overwrite is True
+
+
+@pytest.mark.issue(4042)
+def test_issue4042():
+ """Test that serialization of an EntityRuler before NER works fine."""
+ nlp = English()
+ # add ner pipe
+ ner = nlp.add_pipe("ner")
+ ner.add_label("SOME_LABEL")
+ nlp.initialize()
+ # Add entity ruler
+ patterns = [
+ {"label": "MY_ORG", "pattern": "Apple"},
+ {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
+ ]
+    # adding the ruler with "after" worked fine; the bug only appeared with before="ner"
+ ruler = nlp.add_pipe("entity_ruler", before="ner")
+ ruler.add_patterns(patterns)
+ doc1 = nlp("What do you think about Apple ?")
+ assert doc1.ents[0].label_ == "MY_ORG"
+
+ with make_tempdir() as d:
+ output_dir = ensure_path(d)
+ if not output_dir.exists():
+ output_dir.mkdir()
+ nlp.to_disk(output_dir)
+ nlp2 = load_model(output_dir)
+ doc2 = nlp2("What do you think about Apple ?")
+ assert doc2.ents[0].label_ == "MY_ORG"
+
+
+@pytest.mark.issue(4042)
+def test_issue4042_bug2():
+ """
+    Test that serialization of the NER component works fine when new labels were added.
+    This is the second of two bugs underlying issue 4042.
+ """
+ nlp1 = English()
+ # add ner pipe
+ ner1 = nlp1.add_pipe("ner")
+ ner1.add_label("SOME_LABEL")
+ nlp1.initialize()
+ # add a new label to the doc
+ doc1 = nlp1("What do you think about Apple ?")
+ assert len(ner1.labels) == 1
+ assert "SOME_LABEL" in ner1.labels
+ apple_ent = Span(doc1, 5, 6, label="MY_ORG")
+ doc1.ents = list(doc1.ents) + [apple_ent]
+ # Add the label explicitly. Previously we didn't require this.
+ ner1.add_label("MY_ORG")
+ ner1(doc1)
+ assert len(ner1.labels) == 2
+ assert "SOME_LABEL" in ner1.labels
+ assert "MY_ORG" in ner1.labels
+ with make_tempdir() as d:
+ # assert IO goes fine
+ output_dir = ensure_path(d)
+ if not output_dir.exists():
+ output_dir.mkdir()
+ ner1.to_disk(output_dir)
+ config = {}
+ ner2 = nlp1.create_pipe("ner", config=config)
+ ner2.from_disk(output_dir)
+ assert len(ner2.labels) == 2
+
+
+@pytest.mark.issue(4725)
+def test_issue4725_1():
+ """Ensure the pickling of the NER goes well"""
+ vocab = Vocab(vectors_name="test_vocab_add_vector")
+ nlp = English(vocab=vocab)
+ config = {
+ "update_with_oracle_cut_size": 111,
+ }
+ ner = nlp.create_pipe("ner", config=config)
+ with make_tempdir() as tmp_path:
+ with (tmp_path / "ner.pkl").open("wb") as file_:
+ pickle.dump(ner, file_)
+ assert ner.cfg["update_with_oracle_cut_size"] == 111
+
+ with (tmp_path / "ner.pkl").open("rb") as file_:
+ ner2 = pickle.load(file_)
+ assert ner2.cfg["update_with_oracle_cut_size"] == 111
+
+
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
- config = {
- "update_with_oracle_cut_size": 100,
- "beam_width": 1,
- "beam_update_prob": 1.0,
- "beam_density": 0.0,
- }
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = Parser(en_vocab, model)
@@ -168,6 +344,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
assert label in tagger2.vocab.strings
+@pytest.mark.issue(1105)
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}
@@ -274,3 +451,21 @@ def test_serialize_custom_trainable_pipe():
pipe.to_disk(d)
new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d)
assert new_pipe.to_bytes() == pipe_bytes
+
+
+def test_load_without_strings():
+ nlp = spacy.blank("en")
+ orig_strings_length = len(nlp.vocab.strings)
+ word = "unlikely_word_" * 20
+ nlp.vocab.strings.add(word)
+ assert len(nlp.vocab.strings) == orig_strings_length + 1
+ with make_tempdir() as d:
+ nlp.to_disk(d)
+ # reload with strings
+ reloaded_nlp = load(d)
+ assert len(nlp.vocab.strings) == len(reloaded_nlp.vocab.strings)
+ assert word in reloaded_nlp.vocab.strings
+ # reload without strings
+ reloaded_nlp = load(d, exclude=["strings"])
+ assert orig_strings_length == len(reloaded_nlp.vocab.strings)
+ assert word not in reloaded_nlp.vocab.strings
diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py
index a9450cd04..e271f7707 100644
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@@ -1,9 +1,16 @@
-import pytest
+import pickle
import re
-from spacy.util import get_lang_class
-from spacy.tokenizer import Tokenizer
-from ..util import make_tempdir, assert_packed_msg_equal
+import pytest
+
+from spacy.attrs import ENT_IOB, ENT_TYPE
+from spacy.lang.en import English
+from spacy.tokenizer import Tokenizer
+from spacy.tokens import Doc
+from spacy.util import compile_infix_regex, compile_prefix_regex
+from spacy.util import compile_suffix_regex, get_lang_class, load_model
+
+from ..util import assert_packed_msg_equal, make_tempdir
def load_tokenizer(b):
@@ -12,6 +19,79 @@ def load_tokenizer(b):
return tok
+@pytest.mark.issue(2833)
+def test_issue2833(en_vocab):
+ """Test that a custom error is raised if a token or span is pickled."""
+ doc = Doc(en_vocab, words=["Hello", "world"])
+ with pytest.raises(NotImplementedError):
+ pickle.dumps(doc[0])
+ with pytest.raises(NotImplementedError):
+ pickle.dumps(doc[0:2])
+
+
+@pytest.mark.issue(3012)
+def test_issue3012(en_vocab):
+ """Test that the is_tagged attribute doesn't get overwritten when we from_array
+ without tag information."""
+ words = ["This", "is", "10", "%", "."]
+ tags = ["DT", "VBZ", "CD", "NN", "."]
+ pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
+ ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
+ doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
+ assert doc.has_annotation("TAG")
+ expected = ("10", "NUM", "CD", "PERCENT")
+ assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
+ header = [ENT_IOB, ENT_TYPE]
+ ent_array = doc.to_array(header)
+ doc.from_array(header, ent_array)
+ assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
+ # Serializing then deserializing
+ doc_bytes = doc.to_bytes()
+ doc2 = Doc(en_vocab).from_bytes(doc_bytes)
+ assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
+
+
+@pytest.mark.issue(4190)
+def test_issue4190():
+ def customize_tokenizer(nlp):
+ prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
+ suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
+ infix_re = compile_infix_regex(nlp.Defaults.infixes)
+ # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
+ exceptions = {
+ k: v
+ for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
+ if not (len(k) == 2 and k[1] == ".")
+ }
+ new_tokenizer = Tokenizer(
+ nlp.vocab,
+ exceptions,
+ prefix_search=prefix_re.search,
+ suffix_search=suffix_re.search,
+ infix_finditer=infix_re.finditer,
+ token_match=nlp.tokenizer.token_match,
+ )
+ nlp.tokenizer = new_tokenizer
+
+ test_string = "Test c."
+ # Load default language
+ nlp_1 = English()
+ doc_1a = nlp_1(test_string)
+ result_1a = [token.text for token in doc_1a] # noqa: F841
+ # Modify tokenizer
+ customize_tokenizer(nlp_1)
+ doc_1b = nlp_1(test_string)
+ result_1b = [token.text for token in doc_1b]
+ # Save and Reload
+ with make_tempdir() as model_dir:
+ nlp_1.to_disk(model_dir)
+ nlp_2 = load_model(model_dir)
+ # This should be the modified tokenizer
+ doc_2 = nlp_2(test_string)
+ result_2 = [token.text for token in doc_2]
+ assert result_1b == result_2
+
+
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
"""Test that custom tokenizer with not all functions defined or empty
properties can be serialized and deserialized correctly (see #2494,
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index 3fe9363bf..fd80c3d8e 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -1,15 +1,71 @@
-import pytest
import pickle
-from spacy.vocab import Vocab
+
+import pytest
+from thinc.api import get_current_ops
+
+import spacy
+from spacy.lang.en import English
from spacy.strings import StringStore
+from spacy.tokens import Doc
+from spacy.util import ensure_path, load_model
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
from ..util import make_tempdir
-
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
+@pytest.mark.issue(599)
+def test_issue599(en_vocab):
+ doc = Doc(en_vocab)
+ doc2 = Doc(doc.vocab)
+ doc2.from_bytes(doc.to_bytes())
+ assert doc2.has_annotation("DEP")
+
+
+@pytest.mark.issue(4054)
+def test_issue4054(en_vocab):
+ """Test that a new blank model can be made with a vocab from file,
+ and that serialization does not drop the language at any point."""
+ nlp1 = English()
+ vocab1 = nlp1.vocab
+ with make_tempdir() as d:
+ vocab_dir = ensure_path(d / "vocab")
+ if not vocab_dir.exists():
+ vocab_dir.mkdir()
+ vocab1.to_disk(vocab_dir)
+ vocab2 = Vocab().from_disk(vocab_dir)
+ nlp2 = spacy.blank("en", vocab=vocab2)
+ nlp_dir = ensure_path(d / "nlp")
+ if not nlp_dir.exists():
+ nlp_dir.mkdir()
+ nlp2.to_disk(nlp_dir)
+ nlp3 = load_model(nlp_dir)
+ assert nlp3.lang == "en"
+
+
+@pytest.mark.issue(4133)
+def test_issue4133(en_vocab):
+ nlp = English()
+ vocab_bytes = nlp.vocab.to_bytes()
+ words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
+ pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
+ doc = Doc(en_vocab, words=words)
+ for i, token in enumerate(doc):
+ token.pos_ = pos[i]
+    # with a trained pipeline (rather than blank English) the POS annotation is usually already set
+ doc_bytes = doc.to_bytes()
+ vocab = Vocab()
+ vocab = vocab.from_bytes(vocab_bytes)
+ doc = Doc(vocab).from_bytes(doc_bytes)
+ actual = []
+ for token in doc:
+ actual.append(token.pos_)
+ assert actual == pos
+
+
@pytest.mark.parametrize("text", ["rat"])
def test_serialize_vocab(en_vocab, text):
text_hash = en_vocab.strings.add(text)
@@ -129,7 +185,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr):
vocab = Vocab(strings=strings)
+ ops = get_current_ops()
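+    # Attach floret-mode vectors so the pickle round-trip also covers the vectors mode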
+ vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
+ vocab.vectors = vectors
vocab[strings[0]].norm_ = lex_attr
vocab_pickled = pickle.dumps(vocab)
vocab_unpickled = pickle.loads(vocab_pickled)
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
+ assert vocab_unpickled.vectors.mode == "floret"
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 03bef3528..253469909 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,25 +1,105 @@
-import pytest
-from click import NoSuchOption
-from spacy.training import docs_to_json, offsets_to_biluo_tags
-from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
-from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
-from spacy.lang.nl import Dutch
-from spacy.util import ENV_VARS, load_model_from_config
-from spacy.cli import info
-from spacy.cli.init_config import init_config, RECOMMENDATIONS
-from spacy.cli._util import validate_project_commands, parse_config_overrides
-from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list
-from spacy import about
-from spacy.util import get_minor_version
-from spacy.cli.validate import get_model_pkgs
-from spacy.cli.download import get_compatibility, get_version
-from thinc.api import ConfigValidationError, Config
-import srsly
import os
-from .util import make_tempdir
+import pytest
+import srsly
+from click import NoSuchOption
+from packaging.specifiers import SpecifierSet
+from thinc.api import Config, ConfigValidationError
+
+from spacy import about
+from spacy.cli import info
+from spacy.cli._util import is_subpath_of, load_project_config
+from spacy.cli._util import parse_config_overrides, string_to_list
+from spacy.cli._util import substitute_project_variables
+from spacy.cli._util import validate_project_commands
+from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _get_labels_from_spancat
+from spacy.cli.download import get_compatibility, get_version
+from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
+from spacy.cli.package import get_third_party_dependencies
+from spacy.cli.validate import get_model_pkgs
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import Language
+from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
+from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
+from spacy.training.converters import iob_to_docs
+from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
+
from ..cli.init_pipeline import _init_labels
+from .util import make_tempdir
+
+
+@pytest.mark.issue(4665)
+def test_issue4665():
+ """
+ conllu_to_docs should not raise an exception if the HEAD column contains an
+ underscore
+ """
+ input_data = """
+1 [ _ PUNCT -LRB- _ _ punct _ _
+2 This _ DET DT _ _ det _ _
+3 killing _ NOUN NN _ _ nsubj _ _
+4 of _ ADP IN _ _ case _ _
+5 a _ DET DT _ _ det _ _
+6 respected _ ADJ JJ _ _ amod _ _
+7 cleric _ NOUN NN _ _ nmod _ _
+8 will _ AUX MD _ _ aux _ _
+9 be _ AUX VB _ _ aux _ _
+10 causing _ VERB VBG _ _ root _ _
+11 us _ PRON PRP _ _ iobj _ _
+12 trouble _ NOUN NN _ _ dobj _ _
+13 for _ ADP IN _ _ case _ _
+14 years _ NOUN NNS _ _ nmod _ _
+15 to _ PART TO _ _ mark _ _
+16 come _ VERB VB _ _ acl _ _
+17 . _ PUNCT . _ _ punct _ _
+18 ] _ PUNCT -RRB- _ _ punct _ _
+"""
+ conllu_to_docs(input_data)
+
+
+@pytest.mark.issue(4924)
+def test_issue4924():
+ nlp = Language()
+ example = Example.from_dict(nlp.make_doc(""), {})
+ nlp.evaluate([example])
+
+
+@pytest.mark.issue(7055)
+def test_issue7055():
+ """Test that fill-config doesn't turn sourced components into factories."""
+ source_cfg = {
+ "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]},
+ "components": {
+ "tok2vec": {"factory": "tok2vec"},
+ "tagger": {"factory": "tagger"},
+ },
+ }
+ source_nlp = English.from_config(source_cfg)
+ with make_tempdir() as dir_path:
+ # We need to create a loadable source pipeline
+ source_path = dir_path / "test_model"
+ source_nlp.to_disk(source_path)
+ base_cfg = {
+ "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
+ "components": {
+ "tok2vec": {"source": str(source_path)},
+ "tagger": {"source": str(source_path)},
+ "ner": {"factory": "ner"},
+ },
+ }
+ base_cfg = Config(base_cfg)
+ base_path = dir_path / "base.cfg"
+ base_cfg.to_disk(base_path)
+ output_path = dir_path / "config.cfg"
+ fill_config(output_path, base_path, silent=True)
+ filled_cfg = load_config(output_path)
+ assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path)
+ assert filled_cfg["components"]["tagger"]["source"] == str(source_path)
+ assert filled_cfg["components"]["ner"]["factory"] == "ner"
+ assert "model" in filled_cfg["components"]["ner"]
def test_cli_info():
@@ -440,7 +520,7 @@ def test_init_config(lang, pipeline, optimize, pretraining):
assert isinstance(config, Config)
if pretraining:
config["paths"]["raw_text"] = "my_data.jsonl"
- nlp = load_model_from_config(config, auto_fill=True)
+ load_model_from_config(config, auto_fill=True)
def test_model_recommendations():
@@ -490,18 +570,24 @@ def test_string_to_list_intify(value):
def test_download_compatibility():
- model_name = "en_core_web_sm"
- compatibility = get_compatibility()
- version = get_version(model_name, compatibility)
- assert get_minor_version(about.__version__) == get_minor_version(version)
+ spec = SpecifierSet("==" + about.__version__)
+ spec.prereleases = False
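+ # only run the check for final releases (prereleases are excluded by the specifier)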
+ if about.__version__ in spec:
+ model_name = "en_core_web_sm"
+ compatibility = get_compatibility()
+ version = get_version(model_name, compatibility)
+ assert get_minor_version(about.__version__) == get_minor_version(version)
def test_validate_compatibility_table():
- model_pkgs, compat = get_model_pkgs()
- spacy_version = get_minor_version(about.__version__)
- current_compat = compat.get(spacy_version, {})
- assert len(current_compat) > 0
- assert "en_core_web_sm" in current_compat
+ spec = SpecifierSet("==" + about.__version__)
+ spec.prereleases = False
+ if about.__version__ in spec:
+ model_pkgs, compat = get_model_pkgs()
+ spacy_version = get_minor_version(about.__version__)
+ current_compat = compat.get(spacy_version, {})
+ assert len(current_compat) > 0
+ assert "en_core_web_sm" in current_compat
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])
@@ -532,3 +618,77 @@ def test_init_labels(component_name):
assert len(nlp2.get_pipe(component_name).labels) == 0
nlp2.initialize()
assert len(nlp2.get_pipe(component_name).labels) == 4
+
+
+def test_get_third_party_dependencies():
+ # We can't easily test the detection of third-party packages here, but we
+ # can at least make sure that the function and its importlib magic run.
+ nlp = Dutch()
+ # Test with component factory based on Cython module
+ nlp.add_pipe("tagger")
+ assert get_third_party_dependencies(nlp.config) == []
+
+ # Test with legacy function
+ nlp = Dutch()
+ nlp.add_pipe(
+ "textcat",
+ config={
+ "model": {
+ # Do not update from legacy architecture spacy.TextCatBOW.v1
+ "@architectures": "spacy.TextCatBOW.v1",
+ "exclusive_classes": True,
+ "ngram_size": 1,
+ "no_output_layer": False,
+ }
+ },
+ )
+ assert get_third_party_dependencies(nlp.config) == []
+
+ # Test with lang-specific factory
+ @Dutch.factory("third_party_test")
+ def test_factory(nlp, name):
+ return lambda x: x
+
+ nlp.add_pipe("third_party_test")
+ # Before #9674 this would throw an exception
+ get_third_party_dependencies(nlp.config)
+
+
+@pytest.mark.parametrize(
+ "parent,child,expected",
+ [
+ ("/tmp", "/tmp", True),
+ ("/tmp", "/", False),
+ ("/tmp", "/tmp/subdir", True),
+ ("/tmp", "/tmpdir", False),
+ ("/tmp", "/tmp/subdir/..", True),
+ ("/tmp", "/tmp/..", False),
+ ],
+)
+def test_is_subpath_of(parent, child, expected):
+ assert is_subpath_of(parent, child) == expected
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+ "factory_name,pipe_name",
+ [
+ ("ner", "ner"),
+ ("ner", "my_ner"),
+ ("spancat", "spancat"),
+ ("spancat", "my_spancat"),
+ ],
+)
+def test_get_labels_from_model(factory_name, pipe_name):
+ labels = ("A", "B")
+
+ nlp = English()
+ pipe = nlp.add_pipe(factory_name, name=pipe_name)
+ for label in labels:
+ pipe.add_label(label)
+ nlp.initialize()
+ assert nlp.get_pipe(pipe_name).labels == labels
+ if factory_name == "spancat":
+ assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
+ else:
+ assert _get_labels_from_model(nlp, factory_name) == set(labels)
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index 040dd657f..392c95e42 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -1,8 +1,99 @@
+import numpy
import pytest
+
from spacy import displacy
from spacy.displacy.render import DependencyRenderer, EntityRenderer
-from spacy.tokens import Span, Doc
+from spacy.lang.en import English
from spacy.lang.fa import Persian
+from spacy.tokens import Span, Doc
+
+
+@pytest.mark.issue(2361)
+def test_issue2361(de_vocab):
+ """Test if < is escaped when rendering"""
+ chars = ("<", ">", "&", """)
+ words = ["<", ">", "&", '"']
+ doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
+ html = displacy.render(doc)
+ for char in chars:
+ assert char in html
+
+
+@pytest.mark.issue(2728)
+def test_issue2728(en_vocab):
+ """Test that displaCy ENT visualizer escapes HTML correctly."""
+ doc = Doc(en_vocab, words=["test", "", "test"])
+ doc.ents = [Span(doc, 0, 1, label="TEST")]
+ html = displacy.render(doc, style="ent")
+ assert "<RELEASE>" in html
+ doc.ents = [Span(doc, 1, 2, label="TEST")]
+ html = displacy.render(doc, style="ent")
+ assert "<RELEASE>" in html
+
+
+@pytest.mark.issue(3288)
+def test_issue3288(en_vocab):
+ """Test that retokenization works correctly via displaCy when punctuation
+ is merged onto the preceding token and the tensor is resized."""
+ words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
+ heads = [1, 1, 1, 4, 4, 6, 4, 4]
+ deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
+ doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+ doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
+ displacy.render(doc)
+
+
+@pytest.mark.issue(3531)
+def test_issue3531():
+ """Test that displaCy renderer doesn't require "settings" key."""
+ example_dep = {
+ "words": [
+ {"text": "But", "tag": "CCONJ"},
+ {"text": "Google", "tag": "PROPN"},
+ {"text": "is", "tag": "VERB"},
+ {"text": "starting", "tag": "VERB"},
+ {"text": "from", "tag": "ADP"},
+ {"text": "behind.", "tag": "ADV"},
+ ],
+ "arcs": [
+ {"start": 0, "end": 3, "label": "cc", "dir": "left"},
+ {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
+ {"start": 2, "end": 3, "label": "aux", "dir": "left"},
+ {"start": 3, "end": 4, "label": "prep", "dir": "right"},
+ {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
+ ],
+ }
+ example_ent = {
+ "text": "But Google is starting from behind.",
+ "ents": [{"start": 4, "end": 10, "label": "ORG"}],
+ }
+ dep_html = displacy.render(example_dep, style="dep", manual=True)
+ assert dep_html
+ ent_html = displacy.render(example_ent, style="ent", manual=True)
+ assert ent_html
+
+
+@pytest.mark.issue(3882)
+def test_issue3882(en_vocab):
+ """Test that displaCy doesn't serialize the doc.user_data when making a
+ copy of the Doc.
+ """
+ doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
+ doc.user_data["test"] = set()
+ displacy.parse_deps(doc)
+
+
+@pytest.mark.issue(5838)
+def test_issue5838():
+ # Regression test: displaCy's EntityRenderer did not render
+ # the line break after the last entity
+ sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n"
+ nlp = English()
+ doc = nlp(sample_text)
+ doc.ents = [Span(doc, 7, 8, label="test")]
+ html = displacy.render(doc, style="ent")
+ found = html.count("")
+ assert found == 4
def test_displacy_parse_ents(en_vocab):
@@ -12,7 +103,38 @@ def test_displacy_parse_ents(en_vocab):
ents = displacy.parse_ents(doc)
assert isinstance(ents, dict)
assert ents["text"] == "But Google is starting from behind "
- assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}]
+ assert ents["ents"] == [
+ {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"}
+ ]
+
+ doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
+ ents = displacy.parse_ents(doc)
+ assert isinstance(ents, dict)
+ assert ents["text"] == "But Google is starting from behind "
+ assert ents["ents"] == [
+ {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"}
+ ]
+
+
+def test_displacy_parse_ents_with_kb_id_options(en_vocab):
+ """Test that named entities with kb_id on a Doc are converted into displaCy's format."""
+ doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+ doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
+
+ ents = displacy.parse_ents(
+ doc, {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
+ )
+ assert isinstance(ents, dict)
+ assert ents["text"] == "But Google is starting from behind "
+ assert ents["ents"] == [
+ {
+ "start": 4,
+ "end": 10,
+ "label": "ORG",
+ "kb_id": "Q95",
+ "kb_url": "https://www.wikidata.org/wiki/Q95",
+ }
+ ]
def test_displacy_parse_deps(en_vocab):
diff --git a/spacy/tests/test_errors.py b/spacy/tests/test_errors.py
index e79abc6ab..a845a52c9 100644
--- a/spacy/tests/test_errors.py
+++ b/spacy/tests/test_errors.py
@@ -2,11 +2,10 @@ from inspect import isclass
import pytest
-from spacy.errors import add_codes
+from spacy.errors import ErrorsWithCodes
-@add_codes
-class Errors:
+class Errors(metaclass=ErrorsWithCodes):
E001 = "error description"
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index c911b8d81..c5fdc8eb0 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -8,13 +8,23 @@ from spacy.vocab import Vocab
from spacy.training import Example
from spacy.lang.en import English
from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
import spacy
-from thinc.api import NumpyOps, get_current_ops
+from thinc.api import CupyOps, NumpyOps, get_current_ops
from .util import add_vecs_to_vocab, assert_docs_equal
+try:
+ import torch
+
+ # Ensure that we don't deadlock in multiprocessing tests.
+ torch.set_num_threads(1)
+ torch.set_num_interop_threads(1)
+except ImportError:
+ pass
+
+
def evil_component(doc):
if "2" in doc.text:
raise ValueError("no dice")
@@ -245,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
+ """Test the error handling of nlp.pipe with input as tuples"""
+ Language.component("my_evil_component", func=evil_component)
+ ops = get_current_ops()
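+ # only test multiprocessing (n_process > 1) with CPU ops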
+ if isinstance(ops, NumpyOps) or n_process < 2:
+ nlp = English()
+ nlp.add_pipe("my_evil_component")
+ texts = [
+ ("TEXT 111", 111),
+ ("TEXT 222", 222),
+ ("TEXT 333", 333),
+ ("TEXT 342", 342),
+ ("TEXT 666", 666),
+ ]
+ with pytest.raises(ValueError):
+ list(nlp.pipe(texts, as_tuples=True))
+ nlp.set_error_handler(warn_error)
+ logger = logging.getLogger("spacy")
+ with mock.patch.object(logger, "warning") as mock_warning:
+ tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+ # HACK/TODO? the warnings in child processes don't seem to be
+ # detected by the mock logger
+ if n_process == 1:
+ mock_warning.assert_called()
+ assert mock_warning.call_count == 2
+ assert len(tuples) + mock_warning.call_count == len(texts)
+ assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
+ assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
+ assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
+
+
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
"""Test the error handling of a component's pipe method"""
@@ -502,6 +544,55 @@ def test_spacy_blank():
assert nlp.meta["name"] == "my_custom_model"
+@pytest.mark.parametrize(
+ "lang,target",
+ [
+ ("en", "en"),
+ ("fra", "fr"),
+ ("fre", "fr"),
+ ("iw", "he"),
+ ("mo", "ro"),
+ ("mul", "xx"),
+ ("no", "nb"),
+ ("pt-BR", "pt"),
+ ("xx", "xx"),
+ ("zh-Hans", "zh"),
+ ("zh-Hant", None),
+ ("zxx", None),
+ ],
+)
+def test_language_matching(lang, target):
+ """
+ Test that we can look up languages by equivalent or nearly-equivalent
+ language codes.
+ """
+ assert find_matching_language(lang) == target
+
+
+@pytest.mark.parametrize(
+ "lang,target",
+ [
+ ("en", "en"),
+ ("fra", "fr"),
+ ("fre", "fr"),
+ ("iw", "he"),
+ ("mo", "ro"),
+ ("mul", "xx"),
+ ("no", "nb"),
+ ("pt-BR", "pt"),
+ ("xx", "xx"),
+ ("zh-Hans", "zh"),
+ ],
+)
+def test_blank_languages(lang, target):
+ """
+ Test that we can get spacy.blank in various languages, including codes
+ that are defined to be equivalent or that match by CLDR language matching.
+ """
+ nlp = spacy.blank(lang)
+ assert nlp.lang == target
+
+
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
def test_language_init_invalid_vocab(value):
err_fragment = "invalid value"
@@ -528,3 +619,43 @@ def test_language_source_and_vectors(nlp2):
assert long_string in nlp2.vocab.strings
# vectors should remain unmodified
assert nlp.vocab.vectors.to_bytes() == vectors_bytes
+
+
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_pass_doc_to_pipeline(nlp, n_process):
+ texts = ["cats", "dogs", "guinea pigs"]
+ docs = [nlp.make_doc(text) for text in texts]
+ assert not any(len(doc.cats) for doc in docs)
+ doc = nlp(docs[0])
+ assert doc.text == texts[0]
+ assert len(doc.cats) > 0
+ if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
+ docs = nlp.pipe(docs, n_process=n_process)
+ assert [doc.text for doc in docs] == texts
+ assert all(len(doc.cats) for doc in docs)
+
+
+def test_invalid_arg_to_pipeline(nlp):
+ str_list = ["This is a text.", "This is another."]
+ with pytest.raises(ValueError):
+ nlp(str_list) # type: ignore
+ assert len(list(nlp.pipe(str_list))) == 2
+ int_list = [1, 2, 3]
+ with pytest.raises(ValueError):
+ list(nlp.pipe(int_list)) # type: ignore
+ with pytest.raises(ValueError):
+ nlp(int_list) # type: ignore
+
+
+@pytest.mark.skipif(
+ not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
+)
+def test_multiprocessing_gpu_warning(nlp2, texts):
+ texts = texts * 10
+ docs = nlp2.pipe(texts, n_process=2, batch_size=2)
+
+ with pytest.warns(UserWarning, match="multiprocessing with GPU models"):
+ with pytest.raises(ValueError):
+ # Trigger multi-processing.
+ for _ in docs:
+ pass
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 45cbdf45b..d8743d322 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -15,7 +15,8 @@ from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
+from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema
+from pydantic import ValidationError
from thinc.api import get_current_ops, NumpyOps, CupyOps
@@ -33,6 +34,32 @@ def is_admin():
return admin
+@pytest.mark.issue(6207)
+def test_issue6207(en_tokenizer):
+ doc = en_tokenizer("zero one two three four five six")
+
+ # Make spans
+ s1 = doc[:4]
+ s2 = doc[3:6] # overlaps with s1
+ s3 = doc[5:7] # overlaps with s2, not s1
+
+ result = util.filter_spans((s1, s2, s3))
+ assert s1 in result
+ assert s2 not in result
+ assert s3 in result
+
+
+@pytest.mark.issue(6258)
+def test_issue6258():
+ """Test that the non-empty constraint pattern field is respected"""
+ # These one is valid
+ TokenPatternSchema(pattern=[TokenPattern()])
+ # But an empty pattern list should fail to validate
+ # based on the schema's constraint
+ with pytest.raises(ValidationError):
+ TokenPatternSchema(pattern=[])
+
+
@pytest.mark.parametrize("text", ["hello/world", "hello world"])
def test_util_ensure_path_succeeds(text):
path = util.ensure_path(text)
@@ -139,6 +166,12 @@ def test_load_model_blank_shortcut():
nlp = util.load_model("blank:en")
assert nlp.lang == "en"
assert nlp.pipeline == []
+
+ # ImportError for loading an unsupported language
+ with pytest.raises(ImportError):
+ util.load_model("blank:zxx")
+
+ # ImportError for requesting an invalid language code that isn't registered
with pytest.raises(ImportError):
util.load_model("blank:fjsfijsdof")
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 33d394933..2306cabb7 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
assert_array_almost_equal(
model1.ops.to_numpy(get_all_params(model1)),
model2.ops.to_numpy(get_all_params(model2)),
+ decimal=5,
)
@@ -211,7 +212,7 @@ def test_empty_docs(model_func, kwargs):
def test_init_extract_spans():
- model = extract_spans().initialize()
+ extract_spans().initialize()
def test_extract_spans_span_indices():
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 16cc97f6d..6e15fa2de 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -249,6 +249,7 @@ def test_tag_score(tagged_doc):
assert results["tag_acc"] == 1.0
assert results["pos_acc"] == 1.0
assert results["morph_acc"] == 1.0
+ assert results["morph_micro_f"] == 1.0
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
# Gold annotation is modified
@@ -272,6 +273,7 @@ def test_tag_score(tagged_doc):
assert results["tag_acc"] == 0.9
assert results["pos_acc"] == 0.9
assert results["morph_acc"] == approx(0.8)
+ assert results["morph_micro_f"] == approx(0.8461538)
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
diff --git a/spacy/tests/test_ty.py b/spacy/tests/test_ty.py
new file mode 100644
index 000000000..2037520df
--- /dev/null
+++ b/spacy/tests/test_ty.py
@@ -0,0 +1,18 @@
+import spacy
+from spacy import ty
+
+
+def test_component_types():
+ nlp = spacy.blank("en")
+ tok2vec = nlp.create_pipe("tok2vec")
+ tagger = nlp.create_pipe("tagger")
+ entity_ruler = nlp.create_pipe("entity_ruler")
+ assert isinstance(tok2vec, ty.TrainableComponent)
+ assert isinstance(tagger, ty.TrainableComponent)
+ assert not isinstance(entity_ruler, ty.TrainableComponent)
+ assert isinstance(tok2vec, ty.InitializableComponent)
+ assert isinstance(tagger, ty.InitializableComponent)
+ assert isinstance(entity_ruler, ty.InitializableComponent)
+ assert isinstance(tok2vec, ty.ListenedToComponent)
+ assert not isinstance(tagger, ty.ListenedToComponent)
+ assert not isinstance(entity_ruler, ty.ListenedToComponent)
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index 9a98e049e..85716377a 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
if sys.maxunicode >= 1114111:
tokens = tokenizer(text)
assert len(tokens) == length
+
+
+def test_tokenizer_degree(tokenizer):
+ for u in "cfkCFK":
+ assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."]
+ assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."]
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 7d0c16745..a7270cb1e 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,9 +1,284 @@
-import pytest
import re
-from spacy.vocab import Vocab
-from spacy.tokenizer import Tokenizer
-from spacy.util import ensure_path
+
+import numpy
+import pytest
+
from spacy.lang.en import English
+from spacy.lang.de import German
+from spacy.tokenizer import Tokenizer
+from spacy.tokens import Doc
+from spacy.training import Example
+from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
+from spacy.vocab import Vocab
+from spacy.symbols import ORTH
+
+
+@pytest.mark.issue(743)
+def test_issue743():
+ doc = Doc(Vocab(), ["hello", "world"])
+ token = doc[0]
+ s = set([token])
+ items = list(s)
+ assert items[0] is token
+
+
+@pytest.mark.issue(801)
+@pytest.mark.skip(
+ reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218"
+)
+@pytest.mark.parametrize(
+ "text,tokens",
+ [
+ ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
+ ("exception;--exclusive", ["exception", ";--", "exclusive"]),
+ ("day.--Is", ["day", ".--", "Is"]),
+ ("refinement:--just", ["refinement", ":--", "just"]),
+ ("memories?--To", ["memories", "?--", "To"]),
+ ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
+ ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
+ ],
+)
+def test_issue801(en_tokenizer, text, tokens):
+ """Test that special characters + hyphens are split correctly."""
+ doc = en_tokenizer(text)
+ assert len(doc) == len(tokens)
+ assert [t.text for t in doc] == tokens
+
+
+@pytest.mark.issue(1061)
+def test_issue1061():
+ """Test special-case works after tokenizing. Was caching problem."""
+ text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
+ tokenizer = English().tokenizer
+ doc = tokenizer(text)
+ assert "MATH" in [w.text for w in doc]
+ assert "_MATH_" not in [w.text for w in doc]
+
+ tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
+ doc = tokenizer(text)
+ assert "_MATH_" in [w.text for w in doc]
+ assert "MATH" not in [w.text for w in doc]
+
+ # For sanity, check it works when pipeline is clean.
+ tokenizer = English().tokenizer
+ tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
+ doc = tokenizer(text)
+ assert "_MATH_" in [w.text for w in doc]
+ assert "MATH" not in [w.text for w in doc]
+
+
+@pytest.mark.issue(1963)
+def test_issue1963(en_tokenizer):
+ """Test that doc.merge() resizes doc.tensor"""
+ doc = en_tokenizer("a b c d")
+ doc.tensor = numpy.ones((len(doc), 128), dtype="f")
+ with doc.retokenize() as retokenizer:
+ retokenizer.merge(doc[0:2])
+ assert len(doc) == 3
+ assert doc.tensor.shape == (3, 128)
+
+
+@pytest.mark.skip(
+ reason="Can not be fixed without variable-width look-behind (which we don't want)"
+)
+@pytest.mark.issue(1235)
+def test_issue1235():
+ """Test that g is not split of if preceded by a number and a letter"""
+ nlp = English()
+ testwords = "e2g 2g 52g"
+ doc = nlp(testwords)
+ assert len(doc) == 5
+ assert doc[0].text == "e2g"
+ assert doc[1].text == "2"
+ assert doc[2].text == "g"
+ assert doc[3].text == "52"
+ assert doc[4].text == "g"
+
+
+@pytest.mark.issue(1242)
+def test_issue1242():
+ nlp = English()
+ doc = nlp("")
+ assert len(doc) == 0
+ docs = list(nlp.pipe(["", "hello"]))
+ assert len(docs[0]) == 0
+ assert len(docs[1]) == 1
+
+
+@pytest.mark.issue(1257)
+def test_issue1257():
+ """Test that tokens compare correctly."""
+ doc1 = Doc(Vocab(), words=["a", "b", "c"])
+ doc2 = Doc(Vocab(), words=["a", "c", "e"])
+ assert doc1[0] != doc2[0]
+ assert not doc1[0] == doc2[0]
+
+
+@pytest.mark.issue(1375)
+def test_issue1375():
+ """Test that token.nbor() raises IndexError for out-of-bounds access."""
+ doc = Doc(Vocab(), words=["0", "1", "2"])
+ with pytest.raises(IndexError):
+ assert doc[0].nbor(-1)
+ assert doc[1].nbor(-1).text == "0"
+ with pytest.raises(IndexError):
+ assert doc[2].nbor(1)
+ assert doc[1].nbor(1).text == "2"
+
+
+@pytest.mark.issue(1488)
+def test_issue1488():
+ """Test that tokenizer can parse DOT inside non-whitespace separators"""
+ prefix_re = re.compile(r"""[\[\("']""")
+ suffix_re = re.compile(r"""[\]\)"']""")
+ infix_re = re.compile(r"""[-~\.]""")
+ simple_url_re = re.compile(r"""^https?://""")
+
+ def my_tokenizer(nlp):
+ return Tokenizer(
+ nlp.vocab,
+ {},
+ prefix_search=prefix_re.search,
+ suffix_search=suffix_re.search,
+ infix_finditer=infix_re.finditer,
+ token_match=simple_url_re.match,
+ )
+
+ nlp = English()
+ nlp.tokenizer = my_tokenizer(nlp)
+ doc = nlp("This is a test.")
+ for token in doc:
+ assert token.text
+
+
+@pytest.mark.issue(1494)
+def test_issue1494():
+ """Test if infix_finditer works correctly"""
+ infix_re = re.compile(r"""[^a-z]""")
+ test_cases = [
+ ("token 123test", ["token", "1", "2", "3", "test"]),
+ ("token 1test", ["token", "1test"]),
+ ("hello...test", ["hello", ".", ".", ".", "test"]),
+ ]
+
+ def new_tokenizer(nlp):
+ return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
+
+ nlp = English()
+ nlp.tokenizer = new_tokenizer(nlp)
+ for text, expected in test_cases:
+ assert [token.text for token in nlp(text)] == expected
+
+
+@pytest.mark.skip(
+ reason="Can not be fixed without iterative looping between prefix/suffix and infix"
+)
+@pytest.mark.issue(2070)
+def test_issue2070():
+ """Test that checks that a dot followed by a quote is handled
+ appropriately.
+ """
+ # Problem: The dot is now properly split off, but the prefix/suffix rules
+ # are not applied again afterwards. This means that the quote will still be
+ # attached to the remaining token.
+ nlp = English()
+ doc = nlp('First sentence."A quoted sentence" he said ...')
+ assert len(doc) == 11
+
+
+@pytest.mark.issue(2926)
+def test_issue2926(fr_tokenizer):
+ """Test that the tokenizer correctly splits tokens separated by a slash (/)
+ ending in a digit.
+ """
+ doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
+ assert len(doc) == 8
+ assert doc[0].text == "Learn"
+ assert doc[1].text == "html5"
+ assert doc[2].text == "/"
+ assert doc[3].text == "css3"
+ assert doc[4].text == "/"
+ assert doc[5].text == "javascript"
+ assert doc[6].text == "/"
+ assert doc[7].text == "jquery"
+
+
+@pytest.mark.parametrize(
+ "text",
+ [
+ "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume",
+ "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
+ ],
+)
+@pytest.mark.issue(2626)
+def test_issue2626_2835(en_tokenizer, text):
+ """Check that sentence doesn't cause an infinite loop in the tokenizer."""
+ doc = en_tokenizer(text)
+ assert doc
+
+
+@pytest.mark.issue(2656)
+def test_issue2656(en_tokenizer):
+ """Test that tokenizer correctly splits off punctuation after numbers with
+ decimal points.
+ """
+ doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
+ assert len(doc) == 11
+ assert doc[0].text == "I"
+ assert doc[1].text == "went"
+ assert doc[2].text == "for"
+ assert doc[3].text == "40.3"
+ assert doc[4].text == ","
+ assert doc[5].text == "and"
+ assert doc[6].text == "got"
+ assert doc[7].text == "home"
+ assert doc[8].text == "by"
+ assert doc[9].text == "10.0"
+ assert doc[10].text == "."
+
+
+@pytest.mark.issue(2754)
+def test_issue2754(en_tokenizer):
+ """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
+ a = en_tokenizer("a")
+ assert a[0].norm_ == "a"
+ am = en_tokenizer("am")
+ assert am[0].norm_ == "am"
+
+
+@pytest.mark.issue(3002)
+def test_issue3002():
+ """Test that the tokenizer doesn't hang on a long list of dots"""
+ nlp = German()
+ doc = nlp(
+ "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
+ )
+ assert len(doc) == 5
+
+
+@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
+@pytest.mark.issue(3449)
+def test_issue3449():
+ nlp = English()
+ nlp.add_pipe("sentencizer")
+ text1 = "He gave the ball to I. Do you want to go to the movies with I?"
+ text2 = "He gave the ball to I. Do you want to go to the movies with I?"
+ text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
+ t1 = nlp(text1)
+ t2 = nlp(text2)
+ t3 = nlp(text3)
+ assert t1[5].text == "I"
+ assert t2[5].text == "I"
+ assert t3[5].text == "I"
+
+
+@pytest.mark.parametrize(
+ "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
+)
+def test_gold_misaligned(en_tokenizer, text, words):
+ doc = en_tokenizer(text)
+ Example.from_dict(doc, {"words": words})
def test_tokenizer_handles_no_word(tokenizer):
@@ -212,3 +487,37 @@ def test_tokenizer_flush_specials(en_vocab):
assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
tokenizer1.rules = {}
assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
+
+
+def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
+ # the prefix and suffix matches overlap in the suffix lookbehind
+ prefixes = ["a(?=.)"]
+ suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
+ prefix_re = compile_prefix_regex(prefixes)
+ suffix_re = compile_suffix_regex(suffixes)
+ tokenizer = Tokenizer(
+ en_vocab,
+ prefix_search=prefix_re.search,
+ suffix_search=suffix_re.search,
+ )
+ tokens = [t.text for t in tokenizer("a10.")]
+ assert tokens == ["a", "10", "."]
+ explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
+ assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+ # an infix pattern at the start of the token should not be split off as a prefix
+ infixes = ["±"]
+ suffixes = ["%"]
+ infix_re = compile_infix_regex(infixes)
+ suffix_re = compile_suffix_regex(suffixes)
+ tokenizer = Tokenizer(
+ en_vocab,
+ infix_finditer=infix_re.finditer,
+ suffix_search=suffix_re.search,
+ )
+ tokens = [t.text for t in tokenizer("±10%")]
+ assert tokens == ["±10", "%"]
+ explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+ assert tokens == explain_tokens
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index f53660818..8c5c81625 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -1,6 +1,6 @@
from typing import Dict, Iterable, Callable
import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
from spacy import Language
from spacy.util import load_model_from_config, registry, resolve_dot_names
from spacy.schemas import ConfigSchemaTraining
@@ -28,7 +28,7 @@ def test_readers():
"""
@registry.readers("myreader.v1")
- def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
+ def myreader() -> Dict[str, Callable[[Language], Iterable[Example]]]:
annots = {"cats": {"POS": 1.0, "NEG": 0.0}}
def reader(nlp: Language):
@@ -64,8 +64,8 @@ def test_readers():
@pytest.mark.parametrize(
"reader,additional_config",
[
- ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
- ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+ ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+ ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
],
)
@@ -82,17 +82,18 @@ def test_cat_readers(reader, additional_config):
[nlp]
lang = "en"
- pipeline = ["tok2vec", "textcat"]
+ pipeline = ["tok2vec", "textcat_multilabel"]
[components]
[components.tok2vec]
factory = "tok2vec"
- [components.textcat]
- factory = "textcat"
+ [components.textcat_multilabel]
+ factory = "textcat_multilabel"
"""
config = Config().from_str(nlp_config_string)
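+ # seed the RNGs before building the pipeline so the test results are reproducible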
+ fix_random_seed(config["training"]["seed"])
config["corpora"]["@readers"] = reader
config["corpora"].update(additional_config)
nlp = load_model_from_config(config, auto_fill=True)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index cd428be15..0d73300d8 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,15 +1,18 @@
+import random
+
import numpy
-from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
-from spacy.training import biluo_tags_to_spans, iob_to_biluo
-from spacy.training import Corpus, docs_to_json, Example
-from spacy.training.align import get_alignments
-from spacy.training.converters import json_to_docs
-from spacy.lang.en import English
-from spacy.tokens import Doc, DocBin
-from spacy.util import get_words_and_spaces, minibatch
-from thinc.api import compounding
import pytest
import srsly
+from spacy.lang.en import English
+from spacy.tokens import Doc, DocBin
+from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
+from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags
+from spacy.training.align import get_alignments
+from spacy.training.converters import json_to_docs
+from spacy.util import get_words_and_spaces, load_model_from_path, minibatch
+from spacy.util import load_config_from_str
+from thinc.api import compounding
from ..util import make_tempdir
@@ -68,6 +71,207 @@ def vocab():
return nlp.vocab
+@pytest.mark.issue(999)
+def test_issue999():
+ """Test that adding entities and resuming training works passably OK.
+ There are two issues here:
+ 1) We have to re-add labels. This isn't very nice.
+ 2) There's no way to set the learning rate for the weight update, so we
+ end up out-of-scale, causing it to learn too fast.
+ """
+ TRAIN_DATA = [
+ ["hey", []],
+ ["howdy", []],
+ ["hey there", []],
+ ["hello", []],
+ ["hi", []],
+ ["i'm looking for a place to eat", []],
+ ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
+ ["show me chinese restaurants", [(8, 15, "CUISINE")]],
+ ["show me chines restaurants", [(8, 14, "CUISINE")]],
+ ]
+ nlp = English()
+ ner = nlp.add_pipe("ner")
+ for _, offsets in TRAIN_DATA:
+ for start, end, label in offsets:
+ ner.add_label(label)
+ nlp.initialize()
+ for itn in range(20):
+ random.shuffle(TRAIN_DATA)
+ for raw_text, entity_offsets in TRAIN_DATA:
+ example = Example.from_dict(
+ nlp.make_doc(raw_text), {"entities": entity_offsets}
+ )
+ nlp.update([example])
+
+ with make_tempdir() as model_dir:
+ nlp.to_disk(model_dir)
+ nlp2 = load_model_from_path(model_dir)
+
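+ # for each annotated example, at least one gold entity must be recovered (for/else below)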
+ for raw_text, entity_offsets in TRAIN_DATA:
+ doc = nlp2(raw_text)
+ ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
+ for start, end, label in entity_offsets:
+ if (start, end) in ents:
+ assert ents[(start, end)] == label
+ break
+ else:
+ if entity_offsets:
+ raise Exception(ents)
+
+
+@pytest.mark.issue(4402)
+def test_issue4402():
+ json_data = {
+ "id": 0,
+ "paragraphs": [
+ {
+ "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
+ "sentences": [
+ {
+ "tokens": [
+ {"id": 0, "orth": "How", "ner": "O"},
+ {"id": 1, "orth": "should", "ner": "O"},
+ {"id": 2, "orth": "I", "ner": "O"},
+ {"id": 3, "orth": "cook", "ner": "O"},
+ {"id": 4, "orth": "bacon", "ner": "O"},
+ {"id": 5, "orth": "in", "ner": "O"},
+ {"id": 6, "orth": "an", "ner": "O"},
+ {"id": 7, "orth": "oven", "ner": "O"},
+ {"id": 8, "orth": "?", "ner": "O"},
+ ],
+ "brackets": [],
+ },
+ {
+ "tokens": [
+ {"id": 9, "orth": "\n", "ner": "O"},
+ {"id": 10, "orth": "I", "ner": "O"},
+ {"id": 11, "orth": "'ve", "ner": "O"},
+ {"id": 12, "orth": "heard", "ner": "O"},
+ {"id": 13, "orth": "of", "ner": "O"},
+ {"id": 14, "orth": "people", "ner": "O"},
+ {"id": 15, "orth": "cooking", "ner": "O"},
+ {"id": 16, "orth": "bacon", "ner": "O"},
+ {"id": 17, "orth": "in", "ner": "O"},
+ {"id": 18, "orth": "an", "ner": "O"},
+ {"id": 19, "orth": "oven", "ner": "O"},
+ {"id": 20, "orth": ".", "ner": "O"},
+ ],
+ "brackets": [],
+ },
+ ],
+ "cats": [
+ {"label": "baking", "value": 1.0},
+ {"label": "not_baking", "value": 0.0},
+ ],
+ },
+ {
+ "raw": "What is the difference between white and brown eggs?\n",
+ "sentences": [
+ {
+ "tokens": [
+ {"id": 0, "orth": "What", "ner": "O"},
+ {"id": 1, "orth": "is", "ner": "O"},
+ {"id": 2, "orth": "the", "ner": "O"},
+ {"id": 3, "orth": "difference", "ner": "O"},
+ {"id": 4, "orth": "between", "ner": "O"},
+ {"id": 5, "orth": "white", "ner": "O"},
+ {"id": 6, "orth": "and", "ner": "O"},
+ {"id": 7, "orth": "brown", "ner": "O"},
+ {"id": 8, "orth": "eggs", "ner": "O"},
+ {"id": 9, "orth": "?", "ner": "O"},
+ ],
+ "brackets": [],
+ },
+ {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
+ ],
+ "cats": [
+ {"label": "baking", "value": 0.0},
+ {"label": "not_baking", "value": 1.0},
+ ],
+ },
+ ],
+ }
+ nlp = English()
+ attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
+ with make_tempdir() as tmpdir:
+ output_file = tmpdir / "test4402.spacy"
+ docs = json_to_docs([json_data])
+ data = DocBin(docs=docs, attrs=attrs).to_bytes()
+ with output_file.open("wb") as file_:
+ file_.write(data)
+ reader = Corpus(output_file)
+ train_data = list(reader(nlp))
+ assert len(train_data) == 2
+
+ split_train_data = []
+ for eg in train_data:
+ split_train_data.extend(eg.split_sents())
+ assert len(split_train_data) == 4
+
+
+CONFIG_7029 = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+upstream = "*"
+"""
+
+
+@pytest.mark.issue(7029)
+def test_issue7029():
+ """Test that an empty document doesn't mess up an entire batch."""
+ TRAIN_DATA = [
+ ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+ ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+ ]
+ nlp = English.from_config(load_config_from_str(CONFIG_7029))
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ for i in range(50):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses)
+ texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
+ docs1 = list(nlp.pipe(texts, batch_size=1))
+ docs2 = list(nlp.pipe(texts, batch_size=4))
+ assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
+
+
def test_gold_biluo_U(en_vocab):
words = ["I", "flew", "to", "London", "."]
spaces = [True, True, True, False, True]
@@ -525,6 +729,33 @@ def test_roundtrip_docs_to_docbin(doc):
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
+def test_docbin_user_data_serialized(doc):
+ doc.user_data["check"] = True
+ nlp = English()
+
+ with make_tempdir() as tmpdir:
+ output_file = tmpdir / "userdata.spacy"
+ DocBin(docs=[doc], store_user_data=True).to_disk(output_file)
+ reloaded_docs = DocBin().from_disk(output_file).get_docs(nlp.vocab)
+ reloaded_doc = list(reloaded_docs)[0]
+
+ assert reloaded_doc.user_data["check"] == True
+
+
+def test_docbin_user_data_not_serialized(doc):
+ # this isn't serializable, but that shouldn't cause an error
+ doc.user_data["check"] = set()
+ nlp = English()
+
+ with make_tempdir() as tmpdir:
+ output_file = tmpdir / "userdata.spacy"
+ DocBin(docs=[doc], store_user_data=False).to_disk(output_file)
+ reloaded_docs = DocBin().from_disk(output_file).get_docs(nlp.vocab)
+ reloaded_doc = list(reloaded_docs)[0]
+
+ assert "check" not in reloaded_doc.user_data
+
+
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
[
diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py
new file mode 100644
index 000000000..295889186
--- /dev/null
+++ b/spacy/tests/universe/test_universe_json.py
@@ -0,0 +1,17 @@
+import json
+import re
+from pathlib import Path
+
+
+def test_universe_json():
+
+ root_dir = Path(__file__).parent
+ universe_file = root_dir / "universe.json"
+
+ with universe_file.open() as f:
+ universe_data = json.load(f)
+ for entry in universe_data["resources"]:
+ if "github" in entry:
+ assert not re.match(
+ r"^(http:)|^(https:)", entry["github"]
+ ), "Github field should be user/repo, not a url"
diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py
index b6fee6628..d91f41db3 100644
--- a/spacy/tests/vocab_vectors/test_lexeme.py
+++ b/spacy/tests/vocab_vectors/test_lexeme.py
@@ -1,7 +1,25 @@
-import pytest
import numpy
+import pytest
from spacy.attrs import IS_ALPHA, IS_DIGIT
+from spacy.lookups import Lookups
+from spacy.tokens import Doc
from spacy.util import OOV_RANK
+from spacy.vocab import Vocab
+
+
+@pytest.mark.issue(361)
+@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
+def test_issue361(en_vocab, text1, text2):
+ """Test Issue #361: Equality of lexemes"""
+ assert en_vocab[text1] == en_vocab[text1]
+ assert en_vocab[text1] != en_vocab[text2]
+
+
+@pytest.mark.issue(600)
+def test_issue600():
+ vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
+ doc = Doc(vocab, words=["hello"])
+ doc[0].tag_ = "NN"
@pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)])
diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py
index b5f7303b5..47cd1f060 100644
--- a/spacy/tests/vocab_vectors/test_similarity.py
+++ b/spacy/tests/vocab_vectors/test_similarity.py
@@ -16,6 +16,16 @@ def vocab(en_vocab, vectors):
return en_vocab
+@pytest.mark.issue(2219)
+def test_issue2219(en_vocab):
+ """Test if indexing issue still occurs during Token-Token similarity"""
+ vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
+ add_vecs_to_vocab(en_vocab, vectors)
+ [(word1, vec1), (word2, vec2)] = vectors
+ doc = Doc(en_vocab, words=[word1, word2])
+ assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
+
+
def test_vectors_similarity_LL(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
lex1 = vocab[word1]
@@ -25,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert lex1.vector_norm != 0
assert lex2.vector_norm != 0
assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
+ assert isinstance(lex1.similarity(lex2), float)
assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@@ -37,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
assert doc[0].vector_norm != 0
assert doc[1].vector_norm != 0
assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
+ assert isinstance(doc[0].similarity(doc[1]), float)
assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
+def test_vectors_similarity_SS(vocab, vectors):
+ [(word1, vec1), (word2, vec2)] = vectors
+ doc = Doc(vocab, words=[word1, word2])
+ assert isinstance(doc[0:1].similarity(doc[0:2]), float)
+ assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
+
+
+def test_vectors_similarity_DD(vocab, vectors):
+ [(word1, vec1), (word2, vec2)] = vectors
+ doc1 = Doc(vocab, words=[word1, word2])
+ doc2 = Doc(vocab, words=[word2, word1])
+ assert isinstance(doc1.similarity(doc2), float)
+ assert doc1.similarity(doc2) == doc2.similarity(doc1)
+
+
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
+ assert isinstance(doc.similarity(doc[0]), float)
+ assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
-def test_vectors_similarity_DS(vocab, vectors):
- [(word1, vec1), (word2, vec2)] = vectors
- doc = Doc(vocab, words=[word1, word2])
- assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
-
-
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
+ assert isinstance(doc[:2].similarity(doc[0]), float)
+ assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+
+
+def test_vectors_similarity_DS(vocab, vectors):
+ [(word1, vec1), (word2, vec2)] = vectors
+ doc = Doc(vocab, words=[word1, word2])
+ assert isinstance(doc.similarity(doc[:2]), float)
+ assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 8a7dd22c3..0650a7487 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -1,12 +1,15 @@
-import pytest
import numpy
-from numpy.testing import assert_allclose, assert_equal
-from thinc.api import get_current_ops
-from spacy.vocab import Vocab
-from spacy.vectors import Vectors
+import pytest
+from numpy.testing import assert_allclose, assert_almost_equal, assert_equal
+from thinc.api import NumpyOps, get_current_ops
+
+from spacy.lang.en import English
+from spacy.strings import hash_string # type: ignore
from spacy.tokenizer import Tokenizer
-from spacy.strings import hash_string
from spacy.tokens import Doc
+from spacy.training.initialize import convert_vectors
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
from ..util import add_vecs_to_vocab, get_cosine, make_tempdir
@@ -29,22 +32,6 @@ def vectors():
]
-@pytest.fixture
-def ngrams_vectors():
- return [
- ("apple", OPS.asarray([1, 2, 3])),
- ("app", OPS.asarray([-0.1, -0.2, -0.3])),
- ("ppl", OPS.asarray([-0.2, -0.3, -0.4])),
- ("pl", OPS.asarray([0.7, 0.8, 0.9])),
- ]
-
-
-@pytest.fixture()
-def ngrams_vocab(en_vocab, ngrams_vectors):
- add_vecs_to_vocab(en_vocab, ngrams_vectors)
- return en_vocab
-
-
@pytest.fixture
def data():
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")
@@ -79,6 +66,79 @@ def tokenizer_v(vocab):
return Tokenizer(vocab, {}, None, None, None)
+@pytest.mark.issue(1518)
+def test_issue1518():
+ """Test vectors.resize() works."""
+ vectors = Vectors(shape=(10, 10))
+ vectors.add("hello", row=2)
+ vectors.resize((5, 9))
+
+
+@pytest.mark.issue(1539)
+def test_issue1539():
+ """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
+ v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
+ v.resize((100, 100))
+
+
+@pytest.mark.issue(1807)
+def test_issue1807():
+ """Test vocab.set_vector also adds the word to the vocab."""
+ vocab = Vocab(vectors_name="test_issue1807")
+ assert "hello" not in vocab
+ vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
+ assert "hello" in vocab
+
+
+@pytest.mark.issue(2871)
+def test_issue2871():
+ """Test that vectors recover the correct key for spaCy reserved words."""
+ words = ["dog", "cat", "SUFFIX"]
+ vocab = Vocab(vectors_name="test_issue2871")
+ vocab.vectors.resize(shape=(3, 10))
+ vector_data = numpy.zeros((3, 10), dtype="f")
+ for word in words:
+ _ = vocab[word] # noqa: F841
+ vocab.set_vector(word, vector_data[0])
+ vocab.vectors.name = "dummy_vectors"
+ assert vocab["dog"].rank == 0
+ assert vocab["cat"].rank == 1
+ assert vocab["SUFFIX"].rank == 2
+ assert vocab.vectors.find(key="dog") == 0
+ assert vocab.vectors.find(key="cat") == 1
+ assert vocab.vectors.find(key="SUFFIX") == 2
+
+
+@pytest.mark.issue(3412)
+def test_issue3412():
+ data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
+ vectors = Vectors(data=data, keys=["A", "B", "C"])
+ keys, best_rows, scores = vectors.most_similar(
+ numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
+ )
+ assert best_rows[0] == 2
+
+
+@pytest.mark.issue(4725)
+def test_issue4725_2():
+ if isinstance(get_current_ops(), NumpyOps):
+ # ensures that this runs correctly and doesn't hang or crash because of the global vectors
+ # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
+ # or because of issues with pickling the NER (cf test_issue4725_1)
+ vocab = Vocab(vectors_name="test_vocab_add_vector")
+ data = numpy.ndarray((5, 3), dtype="f")
+ data[0] = 1.0
+ data[1] = 2.0
+ vocab.set_vector("cat", data[0])
+ vocab.set_vector("dog", data[1])
+ nlp = English(vocab=vocab)
+ nlp.add_pipe("ner")
+ nlp.initialize()
+ docs = ["Kurt is in London."] * 10
+ for _ in nlp.pipe(docs, batch_size=2, n_process=2):
+ pass
+
+
def test_init_vectors_with_resize_shape(strings, resize_data):
v = Vectors(shape=(len(strings), 3))
v.resize(shape=resize_data.shape)
@@ -125,6 +185,7 @@ def test_init_vectors_with_data(strings, data):
def test_init_vectors_with_shape(strings):
v = Vectors(shape=(len(strings), 3))
assert v.shape == (len(strings), 3)
+ assert v.is_full is False
def test_get_vector(strings, data):
@@ -180,30 +241,6 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
assert all([a == b for a, b in zip(vectors[1][1], doc[2].vector)])
-@pytest.mark.parametrize("text", ["apple"])
-def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text):
- assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors[0][1])
-
-
-@pytest.mark.parametrize("text", ["applpie"])
-def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text):
- truth = list(ngrams_vocab.get_vector(text, 1, 6))
- test = list(
- [
- (
- ngrams_vectors[1][1][i]
- + ngrams_vectors[2][1][i]
- + ngrams_vectors[3][1][i]
- )
- / 3
- for i in range(len(ngrams_vectors[1][1]))
- ]
- )
- eps = [abs(truth[i] - test[i]) for i in range(len(truth))]
- for i in eps:
- assert i < 1e-6
-
-
@pytest.mark.parametrize("text", ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text]
@@ -379,3 +416,178 @@ def test_vector_is_oov():
assert vocab["cat"].is_oov is False
assert vocab["dog"].is_oov is False
assert vocab["hamster"].is_oov is True
+
+
+def test_init_vectors_unset():
+ v = Vectors(shape=(10, 10))
+ assert v.is_full is False
+ assert v.shape == (10, 10)
+
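+ # floret mode requires an explicit data table; shape-only initialization is rejected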
+ with pytest.raises(ValueError):
+ v = Vectors(shape=(10, 10), mode="floret")
+
+ v = Vectors(data=OPS.xp.zeros((10, 10)), mode="floret", hash_count=1)
+ assert v.is_full is True
+
+
+def test_vectors_clear():
+ data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+ v = Vectors(data=data, keys=["A", "B", "C"])
+ assert v.is_full is True
+ assert hash_string("A") in v
+ v.clear()
+ # no keys
+ assert v.key2row == {}
+ assert list(v) == []
+ assert v.is_full is False
+ assert "A" not in v
+ with pytest.raises(KeyError):
+ v["A"]
+
+
+def test_vectors_get_batch():
+ data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+ v = Vectors(data=data, keys=["A", "B", "C"])
+ # check with mixed int/str keys
+ words = ["C", "B", "A", v.strings["B"]]
+ rows = v.find(keys=words)
+ vecs = OPS.as_contig(v.data[rows])
+ assert_equal(OPS.to_numpy(vecs), OPS.to_numpy(v.get_batch(words)))
+
+
+@pytest.fixture()
+def floret_vectors_hashvec_str():
+ """The full hashvec table from floret with the settings:
+ bucket 10, dim 10, minn 2, maxn 3, hash count 2, hash seed 2166136261,
+ bow <, eow >"""
+ return """10 10 2 3 2 2166136261 < >
+0 -2.2611 3.9302 2.6676 -11.233 0.093715 -10.52 -9.6463 -0.11853 2.101 -0.10145
+1 -3.12 -1.7981 10.7 -6.171 4.4527 10.967 9.073 6.2056 -6.1199 -2.0402
+2 9.5689 5.6721 -8.4832 -1.2249 2.1871 -3.0264 -2.391 -5.3308 -3.2847 -4.0382
+3 3.6268 4.2759 -1.7007 1.5002 5.5266 1.8716 -12.063 0.26314 2.7645 2.4929
+4 -11.683 -7.7068 2.1102 2.214 7.2202 0.69799 3.2173 -5.382 -2.0838 5.0314
+5 -4.3024 8.0241 2.0714 -1.0174 -0.28369 1.7622 7.8797 -1.7795 6.7541 5.6703
+6 8.3574 -5.225 8.6529 8.5605 -8.9465 3.767 -5.4636 -1.4635 -0.98947 -0.58025
+7 -10.01 3.3894 -4.4487 1.1669 -11.904 6.5158 4.3681 0.79913 -6.9131 -8.687
+8 -5.4576 7.1019 -8.8259 1.7189 4.955 -8.9157 -3.8905 -0.60086 -2.1233 5.892
+9 8.0678 -4.4142 3.6236 4.5889 -2.7611 2.4455 0.67096 -4.2822 2.0875 4.6274
+"""
+
+
+@pytest.fixture()
+def floret_vectors_vec_str():
+ """The top 10 rows from floret with the settings above, to verify
+ that the spacy floret vectors are equivalent to the fasttext static
+ vectors."""
+ return """10 10
+, -5.7814 2.6918 0.57029 -3.6985 -2.7079 1.4406 1.0084 1.7463 -3.8625 -3.0565
+. 3.8016 -1.759 0.59118 3.3044 -0.72975 0.45221 -2.1412 -3.8933 -2.1238 -0.47409
+der 0.08224 2.6601 -1.173 1.1549 -0.42821 -0.097268 -2.5589 -1.609 -0.16968 0.84687
+die -2.8781 0.082576 1.9286 -0.33279 0.79488 3.36 3.5609 -0.64328 -2.4152 0.17266
+und 2.1558 1.8606 -1.382 0.45424 -0.65889 1.2706 0.5929 -2.0592 -2.6949 -1.6015
+" -1.1242 1.4588 -1.6263 1.0382 -2.7609 -0.99794 -0.83478 -1.5711 -1.2137 1.0239
+in -0.87635 2.0958 4.0018 -2.2473 -1.2429 2.3474 1.8846 0.46521 -0.506 -0.26653
+von -0.10589 1.196 1.1143 -0.40907 -1.0848 -0.054756 -2.5016 -1.0381 -0.41598 0.36982
+( 0.59263 2.1856 0.67346 1.0769 1.0701 1.2151 1.718 -3.0441 2.7291 3.719
+) 0.13812 3.3267 1.657 0.34729 -3.5459 0.72372 0.63034 -1.6145 1.2733 0.37798
+"""
+
+
+def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
+ nlp = English()
+ nlp_plain = English()
+ # load both vec and hashvec tables
+ with make_tempdir() as tmpdir:
+ p = tmpdir / "test.hashvec"
+ with open(p, "w") as fileh:
+ fileh.write(floret_vectors_hashvec_str)
+ convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret")
+ p = tmpdir / "test.vec"
+ with open(p, "w") as fileh:
+ fileh.write(floret_vectors_vec_str)
+ convert_vectors(nlp_plain, p, truncate=0, prune=-1)
+
+ word = "der"
+ # ngrams: full padded word + padded 2-grams + padded 3-grams
+ ngrams = nlp.vocab.vectors._get_ngrams(word)
+ assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"]
+ # rows: 2 rows per ngram
+ rows = OPS.xp.asarray(
+ [
+ h % nlp.vocab.vectors.shape[0]
+ for ngram in ngrams
+ for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
+ ],
+ dtype="uint32",
+ )
+ assert_equal(
+ OPS.to_numpy(rows),
+ numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]),
+ )
+ assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count
+ # all vectors are equivalent for plain static table vs. hash ngrams
+ for word in nlp_plain.vocab.vectors:
+ word = nlp_plain.vocab.strings.as_string(word)
+ assert_almost_equal(
+ nlp.vocab[word].vector, nlp_plain.vocab[word].vector, decimal=3
+ )
+
+ # every word has a vector
+ assert nlp.vocab[word * 5].has_vector
+
+ # check that single and batched vector lookups are identical
+ words = [s for s in nlp_plain.vocab.vectors]
+ single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words]))
+ batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words))
+ assert_equal(single_vecs, batch_vecs)
+
+ # an empty key returns 0s
+ assert_equal(
+ OPS.to_numpy(nlp.vocab[""].vector),
+ numpy.zeros((nlp.vocab.vectors.shape[0],)),
+ )
+ # an empty batch returns 0s
+ assert_equal(
+ OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
+ numpy.zeros((1, nlp.vocab.vectors.shape[0])),
+ )
+ # an empty key within a batch returns 0s
+ assert_equal(
+ OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
+ numpy.zeros((nlp.vocab.vectors.shape[0],)),
+ )
+
+ # the loaded ngram vector table cannot be modified: attempts to set, add or
+ # resize vectors warn and are no-ops, and clear() raises an error
+ vector = list(range(nlp.vocab.vectors.shape[1]))
+ orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"])
+ with pytest.warns(UserWarning):
+ nlp.vocab.set_vector("the", vector)
+ assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
+ with pytest.warns(UserWarning):
+ nlp.vocab[word].vector = vector
+ assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
+ with pytest.warns(UserWarning):
+ nlp.vocab.vectors.add("the", row=6)
+ assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
+ with pytest.warns(UserWarning):
+ nlp.vocab.vectors.resize(shape=(100, 10))
+ assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
+ with pytest.raises(ValueError):
+ nlp.vocab.vectors.clear()
+
+ # data and settings are serialized correctly
+ with make_tempdir() as d:
+ nlp.vocab.to_disk(d)
+ vocab_r = Vocab()
+ vocab_r.from_disk(d)
+ assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes()
+ assert_equal(
+ OPS.to_numpy(nlp.vocab.vectors.data), OPS.to_numpy(vocab_r.vectors.data)
+ )
+ assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg())
+ assert_almost_equal(
+ OPS.to_numpy(nlp.vocab[word].vector),
+ OPS.to_numpy(vocab_r[word].vector),
+ decimal=6,
+ )
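
A minimal standalone sketch of the batch lookup exercised in test_vectors_get_batch above, assuming numpy-backed, default-mode Vectors:

    import numpy
    from spacy.vectors import Vectors

    # find() maps keys to row indices; get_batch() returns the matching rows
    # of the data table in a single call.
    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    vectors = Vectors(data=data, keys=["A", "B", "C"])
    rows = vectors.find(keys=["C", "A"])
    batch = vectors.get_batch(["C", "A"])
    assert numpy.array_equal(numpy.asarray(batch), data[rows])
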
diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py
index 56ef1d108..16cf80a08 100644
--- a/spacy/tests/vocab_vectors/test_vocab_api.py
+++ b/spacy/tests/vocab_vectors/test_vocab_api.py
@@ -1,6 +1,19 @@
import pytest
-from spacy.attrs import LEMMA, ORTH, IS_ALPHA
+from spacy.attrs import IS_ALPHA, LEMMA, ORTH
from spacy.parts_of_speech import NOUN, VERB
+from spacy.vocab import Vocab
+
+
+@pytest.mark.issue(1868)
+def test_issue1868():
+ """Test Vocab.__contains__ works with int keys."""
+ vocab = Vocab()
+ lex = vocab["hello"]
+ assert lex.orth in vocab
+ assert lex.orth_ in vocab
+ assert "some string" not in vocab
+ int_id = vocab.strings.add("some string")
+ assert int_id not in vocab
@pytest.mark.parametrize(
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 719e8e6f5..fa38a1015 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,10 +23,12 @@ cdef class Tokenizer:
cdef object _infix_finditer
cdef object _rules
cdef PhraseMatcher _special_matcher
- cdef int _property_init_count # TODO: unused, remove in v3.1
- cdef int _property_init_max # TODO: unused, remove in v3.1
+ # TODO next two are unused and should be removed in v4
+ # https://github.com/explosion/spaCy/pull/9150
+ cdef int _unused_int1
+ cdef int _unused_int2
- cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
+ cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original,
vector[SpanC] &filtered, int doc_len) nogil
@@ -37,13 +39,13 @@ cdef class Tokenizer:
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
int* has_special,
bint with_special_cases) except -1
- cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
+ cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
- cdef unicode _split_affixes(self, Pool mem, unicode string,
+ cdef str _split_affixes(self, Pool mem, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
- cdef int _attach_tokens(self, Doc tokens, unicode string,
+ cdef int _attach_tokens(self, Doc tokens, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 61a7582b1..91f228032 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,6 +1,4 @@
# cython: embedsignature=True, profile=True, binding=True
-from __future__ import unicode_literals
-
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
@@ -47,10 +45,12 @@ cdef class Tokenizer:
`re.compile(string).search` to match suffixes.
`infix_finditer` (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
- token_match (callable): A boolean function matching strings to be
+ token_match (callable): A function matching the signature of
+ `re.compile(string).match`, for matching strings to be
recognized as tokens.
- url_match (callable): A boolean function matching strings to be
- recognized as tokens after considering prefixes and suffixes.
+ url_match (callable): A function matching the signature of
+ `re.compile(string).match`, for matching strings to be
+ recognized as urls.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
@@ -132,7 +132,7 @@ cdef class Tokenizer:
self.url_match)
return (self.__class__, args, None, None)
- def __call__(self, unicode string):
+ def __call__(self, str string):
"""Tokenize a string.
string (str): The string to tokenize.
@@ -145,7 +145,7 @@ cdef class Tokenizer:
return doc
@cython.boundscheck(False)
- cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
+ cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
string (str): The string to tokenize.
@@ -161,7 +161,7 @@ cdef class Tokenizer:
cdef int start = 0
cdef int has_special = 0
cdef bint in_ws = string[0].isspace()
- cdef unicode span
+ cdef str span
# The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated
@@ -373,7 +373,7 @@ cdef class Tokenizer:
return False
return True
- cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
+ cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
@@ -385,16 +385,16 @@ cdef class Tokenizer:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
- cdef unicode _split_affixes(self, Pool mem, unicode string,
+ cdef str _split_affixes(self, Pool mem, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases):
cdef size_t i
- cdef unicode prefix
- cdef unicode suffix
- cdef unicode minus_pre
- cdef unicode minus_suf
+ cdef str prefix
+ cdef str suffix
+ cdef str minus_pre
+ cdef str minus_suf
cdef size_t last_size = 0
while string and len(string) != last_size:
if self.token_match and self.token_match(string):
@@ -410,7 +410,7 @@ cdef class Tokenizer:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
break
- suf_len = self.find_suffix(string)
+ suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
@@ -430,7 +430,7 @@ cdef class Tokenizer:
suffixes.push_back(self.vocab.get(mem, suffix))
return string
- cdef int _attach_tokens(self, Doc tokens, unicode string,
+ cdef int _attach_tokens(self, Doc tokens, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
@@ -440,7 +440,7 @@ cdef class Tokenizer:
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
- cdef unicode span
+ cdef str span
cdef int i
if prefixes.size():
for i in range(prefixes.size()):
@@ -513,7 +513,7 @@ cdef class Tokenizer:
cached.data.lexemes = lexemes
self._cache.set(key, cached)
- def find_infix(self, unicode string):
+ def find_infix(self, str string):
"""Find internal split points of the string, such as hyphens.
string (str): The string to segment.
@@ -527,7 +527,7 @@ cdef class Tokenizer:
return 0
return list(self.infix_finditer(string))
- def find_prefix(self, unicode string):
+ def find_prefix(self, str string):
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
@@ -541,7 +541,7 @@ cdef class Tokenizer:
match = self.prefix_search(string)
return (match.end() - match.start()) if match is not None else 0
- def find_suffix(self, unicode string):
+ def find_suffix(self, str string):
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
@@ -579,7 +579,7 @@ cdef class Tokenizer:
if attr not in (ORTH, NORM):
raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
- def add_special_case(self, unicode string, substrings):
+ def add_special_case(self, str string, substrings):
"""Add a special-case tokenization rule.
string (str): The string to specially tokenize.
@@ -683,6 +683,8 @@ cdef class Tokenizer:
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
+ if offset == 0 and match.start() == 0:
+ continue
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:
@@ -765,7 +767,7 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#to_bytes
"""
serializers = {
- "vocab": lambda: self.vocab.to_bytes(),
+ "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
"prefix_search": lambda: _get_regex_pattern(self.prefix_search),
"suffix_search": lambda: _get_regex_pattern(self.suffix_search),
"infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
@@ -786,7 +788,7 @@ cdef class Tokenizer:
"""
data = {}
deserializers = {
- "vocab": lambda b: self.vocab.from_bytes(b),
+ "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
"prefix_search": lambda b: data.setdefault("prefix_search", b),
"suffix_search": lambda b: data.setdefault("suffix_search", b),
"infix_finditer": lambda b: data.setdefault("infix_finditer", b),
diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py
index 6194cdeff..470d3430f 100644
--- a/spacy/tokens/_dict_proxies.py
+++ b/spacy/tokens/_dict_proxies.py
@@ -1,9 +1,10 @@
-from typing import Iterable, Tuple, Union, TYPE_CHECKING
+from typing import Iterable, Tuple, Union, Optional, TYPE_CHECKING
import weakref
from collections import UserDict
import srsly
from .span_group import SpanGroup
+from ..errors import Errors
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
@@ -22,7 +23,7 @@ class SpanGroups(UserDict):
self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple()
) -> None:
self.doc_ref = weakref.ref(doc)
- UserDict.__init__(self, items)
+ UserDict.__init__(self, items) # type: ignore[arg-type]
def __setitem__(self, key: str, value: Union[SpanGroup, Iterable["Span"]]) -> None:
if not isinstance(value, SpanGroup):
@@ -31,11 +32,12 @@ class SpanGroups(UserDict):
UserDict.__setitem__(self, key, value)
def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup:
- return SpanGroup(self.doc_ref(), name=name, spans=spans)
+ doc = self._ensure_doc()
+ return SpanGroup(doc, name=name, spans=spans)
- def copy(self, doc: "Doc" = None) -> "SpanGroups":
+ def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
if doc is None:
- doc = self.doc_ref()
+ doc = self._ensure_doc()
return SpanGroups(doc).from_bytes(self.to_bytes())
def to_bytes(self) -> bytes:
@@ -47,8 +49,14 @@ class SpanGroups(UserDict):
def from_bytes(self, bytes_data: bytes) -> "SpanGroups":
msg = srsly.msgpack_loads(bytes_data)
self.clear()
- doc = self.doc_ref()
+ doc = self._ensure_doc()
for value_bytes in msg:
group = SpanGroup(doc).from_bytes(value_bytes)
self[group.name] = group
return self
+
+ def _ensure_doc(self) -> "Doc":
+ doc = self.doc_ref()
+ if doc is None:
+ raise ValueError(Errors.E866)
+ return doc
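
The new _ensure_doc guard only fires once the parent Doc has been garbage collected; a hedged, illustrative sketch of that failure mode (whether and when the Doc is collected depends on the interpreter, so this is not a guaranteed reproduction):

    import gc
    import spacy

    nlp = spacy.blank("en")
    doc = nlp("The quick brown fox")
    doc.spans["nouns"] = [doc[1:4]]   # stored as a SpanGroup
    spans = doc.spans

    del doc
    gc.collect()
    try:
        spans.copy()                  # needs the parent Doc
    except ValueError as err:         # Errors.E866, per the change above
        print("parent Doc is gone:", err)
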
diff --git a/spacy/tokens/_retokenize.pyi b/spacy/tokens/_retokenize.pyi
new file mode 100644
index 000000000..8834d38c0
--- /dev/null
+++ b/spacy/tokens/_retokenize.pyi
@@ -0,0 +1,21 @@
+from typing import Dict, Any, Union, List, Tuple
+from .doc import Doc
+from .span import Span
+from .token import Token
+from .. import Vocab
+
+class Retokenizer:
+ def __init__(self, doc: Doc) -> None: ...
+ def merge(self, span: Span, attrs: Dict[Union[str, int], Any] = ...) -> None: ...
+ def split(
+ self,
+ token: Token,
+ orths: List[str],
+ heads: List[Union[Token, Tuple[Token, int]]],
+ attrs: Dict[Union[str, int], List[Any]] = ...,
+ ) -> None: ...
+ def __enter__(self) -> Retokenizer: ...
+ def __exit__(self, *args: Any) -> None: ...
+
+def normalize_token_attrs(vocab: Vocab, attrs: Dict): ...
+def set_token_attrs(py_token: Token, attrs: Dict): ...
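
For reference, the stub above corresponds to the usual context-manager usage; a brief sketch:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("New York is busy")
    # merges and splits are queued inside the block and applied on exit
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"LEMMA": "New York"})
    assert len(doc) == 3
    assert doc[0].text == "New York"
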
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 868eb3eab..bd2bdb811 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -1,6 +1,7 @@
-from typing import Iterable, Iterator, Union
+from typing import List, Dict, Set, Iterable, Iterator, Union, Optional
from pathlib import Path
import numpy
+from numpy import ndarray
import zlib
import srsly
from thinc.api import NumpyOps
@@ -8,7 +9,7 @@ from thinc.api import NumpyOps
from .doc import Doc
from ..vocab import Vocab
from ..compat import copy_reg
-from ..attrs import SPACY, ORTH, intify_attr
+from ..attrs import SPACY, ORTH, intify_attr, IDS
from ..errors import Errors
from ..util import ensure_path, SimpleFrozenList
@@ -36,7 +37,7 @@ class DocBin:
"spans": List[Dict[str, bytes]], # SpanGroups data for each doc
"spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
- "strings": List[unicode] # List of unique strings in the token data
+ "strings": List[str] # List of unique strings in the token data
"version": str, # DocBin version number
}
@@ -64,17 +65,23 @@ class DocBin:
DOCS: https://spacy.io/api/docbin#init
"""
- attrs = sorted([intify_attr(attr) for attr in attrs])
+ int_attrs = [intify_attr(attr) for attr in attrs]
+ if None in int_attrs:
+ non_valid = [attr for attr in attrs if intify_attr(attr) is None]
+ raise KeyError(
+ Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())
+ ) from None
+ attrs = sorted(int_attrs)
self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
- self.tokens = []
- self.spaces = []
- self.cats = []
- self.span_groups = []
- self.user_data = []
- self.flags = []
- self.strings = set()
+ self.tokens: List[ndarray] = []
+ self.spaces: List[ndarray] = []
+ self.cats: List[Dict] = []
+ self.span_groups: List[bytes] = []
+ self.user_data: List[Optional[bytes]] = []
+ self.flags: List[Dict] = []
+ self.strings: Set[str] = set()
self.store_user_data = store_user_data
for doc in docs:
self.add(doc)
@@ -110,7 +117,8 @@ class DocBin:
self.strings.add(token.ent_kb_id_)
self.strings.add(token.ent_id_)
self.cats.append(doc.cats)
- self.user_data.append(srsly.msgpack_dumps(doc.user_data))
+ if self.store_user_data:
+ self.user_data.append(srsly.msgpack_dumps(doc.user_data))
self.span_groups.append(doc.spans.to_bytes())
for key, group in doc.spans.items():
for span in group:
@@ -132,11 +140,11 @@ class DocBin:
for i in range(len(self.tokens)):
flags = self.flags[i]
tokens = self.tokens[i]
- spaces = self.spaces[i]
+ spaces: Optional[ndarray] = self.spaces[i]
if flags.get("has_unknown_spaces"):
spaces = None
- doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
- doc = doc.from_array(self.attrs, tokens)
+ doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) # type: ignore
+ doc = doc.from_array(self.attrs, tokens) # type: ignore
doc.cats = self.cats[i]
if self.span_groups[i]:
doc.spans.from_bytes(self.span_groups[i])
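
A hedged sketch of the stricter DocBin behaviour introduced above: unknown attribute names now raise a KeyError up front, and user data is only serialized when store_user_data=True:

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ORTH", "TAG"], store_user_data=False)
    doc_bin.add(nlp("hello world"))
    restored = list(doc_bin.get_docs(nlp.vocab))
    assert restored[0].text == "hello world"

    try:
        DocBin(attrs=["ORTH", "NOT_AN_ATTR"])
    except KeyError as err:
        print("rejected unknown attr:", err)
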
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index c74ee0b63..57d087958 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -56,7 +56,7 @@ cdef class Doc:
cdef public bint has_unknown_spaces
- cdef public list _py_tokens
+ cdef public object _context
cdef int length
cdef int max_length
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
new file mode 100644
index 000000000..7e9340d58
--- /dev/null
+++ b/spacy/tokens/doc.pyi
@@ -0,0 +1,175 @@
+from typing import Callable, Protocol, Iterable, Iterator, Optional
+from typing import Union, Tuple, List, Dict, Any, overload
+from cymem.cymem import Pool
+from thinc.types import Floats1d, Floats2d, Ints2d
+from .span import Span
+from .token import Token
+from ._dict_proxies import SpanGroups
+from ._retokenize import Retokenizer
+from ..lexeme import Lexeme
+from ..vocab import Vocab
+from .underscore import Underscore
+from pathlib import Path
+import numpy as np
+
+class DocMethod(Protocol):
+ def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
+
+class Doc:
+ vocab: Vocab
+ mem: Pool
+ spans: SpanGroups
+ max_length: int
+ length: int
+ sentiment: float
+ cats: Dict[str, float]
+ user_hooks: Dict[str, Callable[..., Any]]
+ user_token_hooks: Dict[str, Callable[..., Any]]
+ user_span_hooks: Dict[str, Callable[..., Any]]
+ tensor: np.ndarray[Any, np.dtype[np.float_]]
+ user_data: Dict[str, Any]
+ has_unknown_spaces: bool
+ _context: Any
+ @classmethod
+ def set_extension(
+ cls,
+ name: str,
+ default: Optional[Any] = ...,
+ getter: Optional[Callable[[Doc], Any]] = ...,
+ setter: Optional[Callable[[Doc, Any], None]] = ...,
+ method: Optional[DocMethod] = ...,
+ force: bool = ...,
+ ) -> None: ...
+ @classmethod
+ def get_extension(
+ cls, name: str
+ ) -> Tuple[
+ Optional[Any],
+ Optional[DocMethod],
+ Optional[Callable[[Doc], Any]],
+ Optional[Callable[[Doc, Any], None]],
+ ]: ...
+ @classmethod
+ def has_extension(cls, name: str) -> bool: ...
+ @classmethod
+ def remove_extension(
+ cls, name: str
+ ) -> Tuple[
+ Optional[Any],
+ Optional[DocMethod],
+ Optional[Callable[[Doc], Any]],
+ Optional[Callable[[Doc, Any], None]],
+ ]: ...
+ def __init__(
+ self,
+ vocab: Vocab,
+ words: Optional[List[str]] = ...,
+ spaces: Optional[List[bool]] = ...,
+ user_data: Optional[Dict[Any, Any]] = ...,
+ tags: Optional[List[str]] = ...,
+ pos: Optional[List[str]] = ...,
+ morphs: Optional[List[str]] = ...,
+ lemmas: Optional[List[str]] = ...,
+ heads: Optional[List[int]] = ...,
+ deps: Optional[List[str]] = ...,
+ sent_starts: Optional[List[Union[bool, None]]] = ...,
+ ents: Optional[List[str]] = ...,
+ ) -> None: ...
+ @property
+ def _(self) -> Underscore: ...
+ @property
+ def is_tagged(self) -> bool: ...
+ @property
+ def is_parsed(self) -> bool: ...
+ @property
+ def is_nered(self) -> bool: ...
+ @property
+ def is_sentenced(self) -> bool: ...
+ def has_annotation(
+ self, attr: Union[int, str], *, require_complete: bool = ...
+ ) -> bool: ...
+ @overload
+ def __getitem__(self, i: int) -> Token: ...
+ @overload
+ def __getitem__(self, i: slice) -> Span: ...
+ def __iter__(self) -> Iterator[Token]: ...
+ def __len__(self) -> int: ...
+ def __unicode__(self) -> str: ...
+ def __bytes__(self) -> bytes: ...
+ def __str__(self) -> str: ...
+ def __repr__(self) -> str: ...
+ @property
+ def doc(self) -> Doc: ...
+ def char_span(
+ self,
+ start_idx: int,
+ end_idx: int,
+ label: Union[int, str] = ...,
+ kb_id: Union[int, str] = ...,
+ vector: Optional[Floats1d] = ...,
+ alignment_mode: str = ...,
+ ) -> Span: ...
+ def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
+ @property
+ def has_vector(self) -> bool: ...
+ vector: Floats1d
+ vector_norm: float
+ @property
+ def text(self) -> str: ...
+ @property
+ def text_with_ws(self) -> str: ...
+ ents: Tuple[Span]
+ def set_ents(
+ self,
+ entities: List[Span],
+ *,
+ blocked: Optional[List[Span]] = ...,
+ missing: Optional[List[Span]] = ...,
+ outside: Optional[List[Span]] = ...,
+ default: str = ...
+ ) -> None: ...
+ @property
+ def noun_chunks(self) -> Iterator[Span]: ...
+ @property
+ def sents(self) -> Iterator[Span]: ...
+ @property
+ def lang(self) -> int: ...
+ @property
+ def lang_(self) -> str: ...
+ def count_by(
+ self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ...
+ ) -> Dict[Any, int]: ...
+ def from_array(
+ self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d
+ ) -> Doc: ...
+ def to_array(
+ self, py_attr_ids: Union[int, str, List[Union[int, str]]]
+ ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
+ @staticmethod
+ def from_docs(
+ docs: List[Doc],
+ ensure_whitespace: bool = ...,
+ attrs: Optional[Union[Tuple[Union[str, int]], List[Union[int, str]]]] = ...,
+ ) -> Doc: ...
+ def get_lca_matrix(self) -> Ints2d: ...
+ def copy(self) -> Doc: ...
+ def to_disk(
+ self, path: Union[str, Path], *, exclude: Iterable[str] = ...
+ ) -> None: ...
+ def from_disk(
+ self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ...
+ ) -> Doc: ...
+ def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
+ def from_bytes(
+ self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
+ ) -> Doc: ...
+ def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
+ def from_dict(
+ self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
+ ) -> Doc: ...
+ def extend_tensor(self, tensor: Floats2d) -> None: ...
+ def retokenize(self) -> Retokenizer: ...
+ def to_json(self, underscore: Optional[List[str]] = ...) -> Dict[str, Any]: ...
+ def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
+ @staticmethod
+ def _get_array_attrs() -> Tuple[Any]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index cd2bd6f6c..5a0db115d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -30,6 +30,7 @@ from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
from ..morphology import Morphology
from .. import util
+from .. import parts_of_speech
from .underscore import Underscore, get_ext_args
from ._retokenize import Retokenizer
from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
@@ -193,11 +194,12 @@ cdef class Doc:
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
- words (Optional[List[str]]): A list of unicode strings to add to the document
- as words. If `None`, defaults to empty list.
- spaces (Optional[List[bool]]): A list of boolean values, of the same length as
- words. True means that the word is followed by a space, False means
- it is not. If `None`, defaults to `[True]*len(words)`
+ words (Optional[List[Union[str, int]]]): A list of unicode strings or
+ hash values to add to the document as words. If `None`, defaults to
+ empty list.
+ spaces (Optional[List[bool]]): A list of boolean values, of the same
+ length as `words`. `True` means that the word is followed by a space,
+ `False` means it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
tags (Optional[List[str]]): A list of unicode strings, of the same
length as words, to assign as token.tag. Defaults to None.
@@ -260,12 +262,15 @@ cdef class Doc:
raise ValueError(Errors.E027)
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
- if isinstance(word, unicode):
+ if isinstance(word, str):
lexeme = self.vocab.get(self.mem, word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
- lexeme = self.vocab.get_by_orth(self.mem, word)
+ try:
+ lexeme = self.vocab.get_by_orth(self.mem, word)
+ except TypeError:
+ raise TypeError(Errors.E1022.format(wtype=type(word)))
self.push_back(lexeme, has_space)
if heads is not None:
@@ -285,6 +290,10 @@ cdef class Doc:
sent_starts[i] = -1
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
sent_starts[i] = 0
+ if pos is not None:
+ for pp in set(pos):
+ if pp not in parts_of_speech.IDS:
+ raise ValueError(Errors.E1021.format(pp=pp))
ent_iobs = None
ent_types = None
if ents is not None:
@@ -529,7 +538,13 @@ cdef class Doc:
kb_id = self.vocab.strings.add(kb_id)
alignment_modes = ("strict", "contract", "expand")
if alignment_mode not in alignment_modes:
- raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
+ raise ValueError(
+ Errors.E202.format(
+ name="alignment",
+ mode=alignment_mode,
+ modes=", ".join(alignment_modes),
+ )
+ )
cdef int start = token_by_char(self.c, self.length, start_idx)
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
return None
@@ -601,7 +616,7 @@ cdef class Doc:
"""
if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self)
- elif self.vocab.vectors.data.size:
+ elif self.vocab.vectors.size:
return True
elif self.tensor.size:
return True
@@ -626,7 +641,7 @@ cdef class Doc:
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
- elif self.vocab.vectors.data.size > 0:
+ elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
@@ -909,7 +924,7 @@ cdef class Doc:
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
string name (e.g. 'LEMMA' or 'lemma').
- attr_ids (list[]): A list of attributes (int IDs or string names).
+ py_attr_ids (list[]): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
@@ -1168,7 +1183,7 @@ cdef class Doc:
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
- if not (len(doc) > 0 and doc[-1].is_space):
+ if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)
@@ -1362,7 +1377,7 @@ cdef class Doc:
self.has_unknown_spaces = msg["has_unknown_spaces"]
start = 0
cdef const LexemeC* lex
- cdef unicode orth_
+ cdef str orth_
text = msg["text"]
attrs = msg["array_body"]
for i in range(attrs.shape[0]):
@@ -1423,7 +1438,7 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
- cdef unicode tag, lemma, ent_type
+ cdef str tag, lemma, ent_type
attr_len = len(attributes)
span_len = len(spans)
if not attr_len == span_len:
@@ -1695,17 +1710,18 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
def pickle_doc(doc):
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
- doc.user_token_hooks)
+ doc.user_token_hooks, doc._context)
return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))
def unpickle_doc(vocab, hooks_and_data, bytes_data):
- user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
+ user_data, doc_hooks, span_hooks, token_hooks, _context = srsly.pickle_loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
doc.user_hooks.update(doc_hooks)
doc.user_span_hooks.update(span_hooks)
doc.user_token_hooks.update(token_hooks)
+ doc._context = _context
return doc
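
A hedged sketch of the validation added to Doc.__init__ above: pos values must be Universal POS tags known to spacy.parts_of_speech, otherwise E1021 is raised:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["green", "ideas"], spaces=[True, False], pos=["ADJ", "NOUN"])
    assert [t.pos_ for t in doc] == ["ADJ", "NOUN"]

    try:
        Doc(vocab, words=["oops"], pos=["ADJECTIVE"])  # not a valid UPOS tag
    except ValueError as err:
        print("rejected:", err)
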
diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi
new file mode 100644
index 000000000..b86203cc4
--- /dev/null
+++ b/spacy/tokens/morphanalysis.pyi
@@ -0,0 +1,20 @@
+from typing import Any, Dict, Iterator, List, Union
+from ..vocab import Vocab
+
+class MorphAnalysis:
+ def __init__(
+ self, vocab: Vocab, features: Union[Dict[str, str], str] = ...
+ ) -> None: ...
+ @classmethod
+ def from_id(cls, vocab: Vocab, key: Any) -> MorphAnalysis: ...
+ def __contains__(self, feature: str) -> bool: ...
+ def __iter__(self) -> Iterator[str]: ...
+ def __len__(self) -> int: ...
+ def __hash__(self) -> int: ...
+ def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
+ def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
+ def get(self, field: Any) -> List[str]: ...
+ def to_json(self) -> str: ...
+ def to_dict(self) -> Dict[str, str]: ...
+ def __str__(self) -> str: ...
+ def __repr__(self) -> str: ...
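
A short usage sketch for the MorphAnalysis interface typed above; the features are set by hand here, whereas a trained morphologizer would normally assign them:

    import spacy

    nlp = spacy.blank("en")
    token = nlp("cats")[0]
    token.set_morph("Number=Plur")
    assert token.morph.get("Number") == ["Plur"]
    assert token.morph.to_dict() == {"Number": "Plur"}
    assert "Number=Plur" in str(token.morph)
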
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
new file mode 100644
index 000000000..697051e81
--- /dev/null
+++ b/spacy/tokens/span.pyi
@@ -0,0 +1,126 @@
+from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload
+from thinc.types import Floats1d, Ints2d, FloatsXd
+from .doc import Doc
+from .token import Token
+from .underscore import Underscore
+from ..lexeme import Lexeme
+from ..vocab import Vocab
+
+class SpanMethod(Protocol):
+ def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
+
+class Span:
+ @classmethod
+ def set_extension(
+ cls,
+ name: str,
+ default: Optional[Any] = ...,
+ getter: Optional[Callable[[Span], Any]] = ...,
+ setter: Optional[Callable[[Span, Any], None]] = ...,
+ method: Optional[SpanMethod] = ...,
+ force: bool = ...,
+ ) -> None: ...
+ @classmethod
+ def get_extension(
+ cls, name: str
+ ) -> Tuple[
+ Optional[Any],
+ Optional[SpanMethod],
+ Optional[Callable[[Span], Any]],
+ Optional[Callable[[Span, Any], None]],
+ ]: ...
+ @classmethod
+ def has_extension(cls, name: str) -> bool: ...
+ @classmethod
+ def remove_extension(
+ cls, name: str
+ ) -> Tuple[
+ Optional[Any],
+ Optional[SpanMethod],
+ Optional[Callable[[Span], Any]],
+ Optional[Callable[[Span, Any], None]],
+ ]: ...
+ def __init__(
+ self,
+ doc: Doc,
+ start: int,
+ end: int,
+ label: Union[str, int] = ...,
+ vector: Optional[Floats1d] = ...,
+ vector_norm: Optional[float] = ...,
+ kb_id: Optional[int] = ...,
+ ) -> None: ...
+ def __richcmp__(self, other: Span, op: int) -> bool: ...
+ def __hash__(self) -> int: ...
+ def __len__(self) -> int: ...
+ def __repr__(self) -> str: ...
+ @overload
+ def __getitem__(self, i: int) -> Token: ...
+ @overload
+ def __getitem__(self, i: slice) -> Span: ...
+ def __iter__(self) -> Iterator[Token]: ...
+ @property
+ def _(self) -> Underscore: ...
+ def as_doc(self, *, copy_user_data: bool = ...) -> Doc: ...
+ def get_lca_matrix(self) -> Ints2d: ...
+ def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
+ @property
+ def doc(self) -> Doc: ...
+ @property
+ def vocab(self) -> Vocab: ...
+ @property
+ def sent(self) -> Span: ...
+ @property
+ def ents(self) -> Tuple[Span]: ...
+ @property
+ def has_vector(self) -> bool: ...
+ @property
+ def vector(self) -> Floats1d: ...
+ @property
+ def vector_norm(self) -> float: ...
+ @property
+ def tensor(self) -> FloatsXd: ...
+ @property
+ def sentiment(self) -> float: ...
+ @property
+ def text(self) -> str: ...
+ @property
+ def text_with_ws(self) -> str: ...
+ @property
+ def noun_chunks(self) -> Iterator[Span]: ...
+ @property
+ def root(self) -> Token: ...
+ def char_span(
+ self,
+ start_idx: int,
+ end_idx: int,
+ label: int = ...,
+ kb_id: int = ...,
+ vector: Optional[Floats1d] = ...,
+ ) -> Span: ...
+ @property
+ def conjuncts(self) -> Tuple[Token]: ...
+ @property
+ def lefts(self) -> Iterator[Token]: ...
+ @property
+ def rights(self) -> Iterator[Token]: ...
+ @property
+ def n_lefts(self) -> int: ...
+ @property
+ def n_rights(self) -> int: ...
+ @property
+ def subtree(self) -> Iterator[Token]: ...
+ start: int
+ end: int
+ start_char: int
+ end_char: int
+ label: int
+ kb_id: int
+ ent_id: int
+ ent_id_: str
+ @property
+ def orth_(self) -> str: ...
+ @property
+ def lemma_(self) -> str: ...
+ label_: str
+ kb_id_: str
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 093b2a4da..970c09d60 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
cimport numpy as np
from libc.math cimport sqrt
@@ -88,10 +86,11 @@ cdef class Span:
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
- label (uint64): A label to attach to the Span, e.g. for named entities.
- kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
+ label (int or str): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
+ vector_norm (float): The L2 norm of the span's vector representation.
+ kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
DOCS: https://spacy.io/api/span#init
"""
@@ -105,13 +104,18 @@ cdef class Span:
if label not in doc.vocab.strings:
raise ValueError(Errors.E084.format(label=label))
+ start_char = doc[start].idx if start < doc.length else len(doc.text)
+ if start == end:
+ end_char = start_char
+ else:
+ end_char = doc[end - 1].idx + len(doc[end - 1])
self.c = SpanC(
label=label,
kb_id=kb_id,
start=start,
end=end,
- start_char=doc[start].idx if start < doc.length else 0,
- end_char=doc[end - 1].idx + len(doc[end - 1]) if end >= 1 else 0,
+ start_char=start_char,
+ end_char=end_char,
)
self._vector = vector
self._vector_norm = vector_norm
@@ -213,10 +217,12 @@ cdef class Span:
return Underscore(Underscore.span_extensions, self,
start=self.c.start_char, end=self.c.end_char)
- def as_doc(self, *, bint copy_user_data=False):
+ def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
"""Create a `Doc` object with a copy of the `Span`'s data.
copy_user_data (bool): Whether or not to copy the original doc's user data.
+ array_head (tuple): `Doc` array attrs, can be passed in to speed up computation.
+ array (ndarray): `Doc` as array, can be passed in to speed up computation.
RETURNS (Doc): The `Doc` copy of the span.
DOCS: https://spacy.io/api/span#as_doc
@@ -224,8 +230,10 @@ cdef class Span:
words = [t.text for t in self]
spaces = [bool(t.whitespace_) for t in self]
cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
- array_head = self.doc._get_array_attrs()
- array = self.doc.to_array(array_head)
+ if array_head is None:
+ array_head = self.doc._get_array_attrs()
+ if array is None:
+ array = self.doc.to_array(array_head)
array = array[self.start : self.end]
self._fix_dep_copy(array_head, array)
# Fix initial IOB so the entities are valid for doc.ents below.
@@ -356,8 +364,10 @@ cdef class Span:
return 0.0
vector = self.vector
xp = get_array_module(vector)
- return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
-
+ result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+ # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+ return result.item()
+
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
@@ -396,6 +406,10 @@ cdef class Span:
"""
if "sent" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["sent"](self)
+ elif "sents" in self.doc.user_hooks:
+ for sentence in self.doc.user_hooks["sents"](self.doc):
+ if sentence.start <= self.start < sentence.end:
+ return sentence
# Use `sent_start` token attribute to find sentence boundaries
cdef int n = 0
if self.doc.has_annotation("SENT_START"):
@@ -414,10 +428,51 @@ cdef class Span:
else:
raise ValueError(Errors.E030)
+ @property
+ def sents(self):
+ """Obtain the sentences that contain this span. If the given span
+ crosses sentence boundaries, return all sentences it is a part of.
+
+ RETURNS (Iterable[Span]): All sentences that the span is a part of.
+
+ DOCS: https://spacy.io/api/span#sents
+ """
+ cdef int start
+ cdef int i
+
+ if "sents" in self.doc.user_span_hooks:
+ yield from self.doc.user_span_hooks["sents"](self)
+ elif "sents" in self.doc.user_hooks:
+ for sentence in self.doc.user_hooks["sents"](self.doc):
+ if sentence.end > self.start:
+ if sentence.start < self.end or sentence.start == self.start == self.end:
+ yield sentence
+ else:
+ break
+ else:
+ if not self.doc.has_annotation("SENT_START"):
+ raise ValueError(Errors.E030)
+ # Use `sent_start` token attribute to find sentence boundaries
+ # Find start of the 1st sentence of the Span
+ start = self.start
+ while self.doc.c[start].sent_start != 1 and start > 0:
+ start -= 1
+
+ # Now, find all the sentences in the span
+ for i in range(start + 1, self.doc.length):
+ if self.doc.c[i].sent_start == 1:
+ yield Span(self.doc, start, i)
+ start = i
+ if start >= self.end:
+ break
+ if start < self.end:
+ yield Span(self.doc, start, self.end)
+
+
@property
def ents(self):
- """The named entities in the span. Returns a tuple of named entity
- `Span` objects, if the entity recognizer has been applied.
+ """The named entities that fall completely within the span. Returns
+ a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity.
@@ -444,7 +499,7 @@ cdef class Span:
"""
if "has_vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["has_vector"](self)
- elif self.vocab.vectors.data.size > 0:
+ elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
@@ -464,7 +519,11 @@ cdef class Span:
if "vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["vector"](self)
if self._vector is None:
- self._vector = sum(t.vector for t in self) / len(self)
+ if not len(self):
+ xp = get_array_module(self.vocab.vectors.data)
+ self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
+ else:
+ self._vector = sum(t.vector for t in self) / len(self)
return self._vector
@property
@@ -477,10 +536,10 @@ cdef class Span:
"""
if "vector_norm" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["vector"](self)
- vector = self.vector
- xp = get_array_module(vector)
if self._vector_norm is None:
+ vector = self.vector
total = (vector*vector).sum()
+ xp = get_array_module(vector)
self._vector_norm = xp.sqrt(total) if total != 0. else 0.
return self._vector_norm
@@ -740,7 +799,7 @@ cdef class Span:
def __get__(self):
return self.root.ent_id_
- def __set__(self, unicode key):
+ def __set__(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
@property
@@ -761,7 +820,7 @@ cdef class Span:
def __get__(self):
return self.doc.vocab.strings[self.label]
- def __set__(self, unicode label_):
+ def __set__(self, str label_):
self.label = self.doc.vocab.strings.add(label_)
property kb_id_:
@@ -769,7 +828,7 @@ cdef class Span:
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
- def __set__(self, unicode kb_id_):
+ def __set__(self, str kb_id_):
self.kb_id = self.doc.vocab.strings.add(kb_id_)
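
A hedged sketch of the new Span.sents property added above: a span that crosses a sentence boundary yields every sentence it overlaps (the sentencizer here stands in for any component that sets sentence boundaries):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")
    doc = nlp("This is one sentence. This is another.")
    span = doc[3:7]  # "sentence. This is" -- crosses the boundary
    print([sent.text for sent in span.sents])  # both sentences are yielded
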
diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi
new file mode 100644
index 000000000..26efc3ba0
--- /dev/null
+++ b/spacy/tokens/span_group.pyi
@@ -0,0 +1,26 @@
+from typing import Any, Dict, Iterable
+from .doc import Doc
+from .span import Span
+
+class SpanGroup:
+ name: str
+ attrs: Dict[str, Any]
+ def __init__(
+ self,
+ doc: Doc,
+ *,
+ name: str = ...,
+ attrs: Dict[str, Any] = ...,
+ spans: Iterable[Span] = ...
+ ) -> None: ...
+ def __repr__(self) -> str: ...
+ @property
+ def doc(self) -> Doc: ...
+ @property
+ def has_overlap(self) -> bool: ...
+ def __len__(self) -> int: ...
+ def append(self, span: Span) -> None: ...
+ def extend(self, spans: Iterable[Span]) -> None: ...
+ def __getitem__(self, i: int) -> Span: ...
+ def to_bytes(self) -> bytes: ...
+ def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 081685c25..6cfa75237 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -1,6 +1,8 @@
import weakref
import struct
import srsly
+
+from spacy.errors import Errors
from .span cimport Span
from libc.stdint cimport uint64_t, uint32_t, int32_t
@@ -58,7 +60,11 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup#doc
"""
- return self._doc_ref()
+ doc = self._doc_ref()
+ if doc is None:
+ # referent has been garbage collected
+ raise RuntimeError(Errors.E865)
+ return doc
@property
def has_overlap(self):
diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi
new file mode 100644
index 000000000..bd585d034
--- /dev/null
+++ b/spacy/tokens/token.pyi
@@ -0,0 +1,208 @@
+from typing import (
+ Callable,
+ Protocol,
+ Iterator,
+ Optional,
+ Union,
+ Tuple,
+ Any,
+)
+from thinc.types import Floats1d, FloatsXd
+from .doc import Doc
+from .span import Span
+from .morphanalysis import MorphAnalysis
+from ..lexeme import Lexeme
+from ..vocab import Vocab
+from .underscore import Underscore
+
+class TokenMethod(Protocol):
+ def __call__(self: Token, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
+
+class Token:
+ i: int
+ doc: Doc
+ vocab: Vocab
+ @classmethod
+ def set_extension(
+ cls,
+ name: str,
+ default: Optional[Any] = ...,
+ getter: Optional[Callable[[Token], Any]] = ...,
+ setter: Optional[Callable[[Token, Any], None]] = ...,
+ method: Optional[TokenMethod] = ...,
+ force: bool = ...,
+ ) -> None: ...
+ @classmethod
+ def get_extension(
+ cls, name: str
+ ) -> Tuple[
+ Optional[Any],
+ Optional[TokenMethod],
+ Optional[Callable[[Token], Any]],
+ Optional[Callable[[Token, Any], None]],
+ ]: ...
+ @classmethod
+ def has_extension(cls, name: str) -> bool: ...
+ @classmethod
+ def remove_extension(
+ cls, name: str
+ ) -> Tuple[
+ Optional[Any],
+ Optional[TokenMethod],
+ Optional[Callable[[Token], Any]],
+ Optional[Callable[[Token, Any], None]],
+ ]: ...
+ def __init__(self, vocab: Vocab, doc: Doc, offset: int) -> None: ...
+ def __hash__(self) -> int: ...
+ def __len__(self) -> int: ...
+ def __unicode__(self) -> str: ...
+ def __bytes__(self) -> bytes: ...
+ def __str__(self) -> str: ...
+ def __repr__(self) -> str: ...
+ def __richcmp__(self, other: Token, op: int) -> bool: ...
+ @property
+ def _(self) -> Underscore: ...
+ def nbor(self, i: int = ...) -> Token: ...
+ def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
+ def has_morph(self) -> bool: ...
+ morph: MorphAnalysis
+ @property
+ def lex(self) -> Lexeme: ...
+ @property
+ def lex_id(self) -> int: ...
+ @property
+ def rank(self) -> int: ...
+ @property
+ def text(self) -> str: ...
+ @property
+ def text_with_ws(self) -> str: ...
+ @property
+ def prob(self) -> float: ...
+ @property
+ def sentiment(self) -> float: ...
+ @property
+ def lang(self) -> int: ...
+ @property
+ def idx(self) -> int: ...
+ @property
+ def cluster(self) -> int: ...
+ @property
+ def orth(self) -> int: ...
+ @property
+ def lower(self) -> int: ...
+ @property
+ def norm(self) -> int: ...
+ @property
+ def shape(self) -> int: ...
+ @property
+ def prefix(self) -> int: ...
+ @property
+ def suffix(self) -> int: ...
+ lemma: int
+ pos: int
+ tag: int
+ dep: int
+ @property
+ def has_vector(self) -> bool: ...
+ @property
+ def vector(self) -> Floats1d: ...
+ @property
+ def vector_norm(self) -> float: ...
+ @property
+ def tensor(self) -> Optional[FloatsXd]: ...
+ @property
+ def n_lefts(self) -> int: ...
+ @property
+ def n_rights(self) -> int: ...
+ @property
+ def sent(self) -> Span: ...
+ sent_start: bool
+ is_sent_start: Optional[bool]
+ is_sent_end: Optional[bool]
+ @property
+ def lefts(self) -> Iterator[Token]: ...
+ @property
+ def rights(self) -> Iterator[Token]: ...
+ @property
+ def children(self) -> Iterator[Token]: ...
+ @property
+ def subtree(self) -> Iterator[Token]: ...
+ @property
+ def left_edge(self) -> Token: ...
+ @property
+ def right_edge(self) -> Token: ...
+ @property
+ def ancestors(self) -> Iterator[Token]: ...
+ def is_ancestor(self, descendant: Token) -> bool: ...
+ def has_head(self) -> bool: ...
+ head: Token
+ @property
+ def conjuncts(self) -> Tuple[Token]: ...
+ ent_type: int
+ ent_type_: str
+ @property
+ def ent_iob(self) -> int: ...
+ @classmethod
+ def iob_strings(cls) -> Tuple[str]: ...
+ @property
+ def ent_iob_(self) -> str: ...
+ ent_id: int
+ ent_id_: str
+ ent_kb_id: int
+ ent_kb_id_: str
+ @property
+ def whitespace_(self) -> str: ...
+ @property
+ def orth_(self) -> str: ...
+ @property
+ def lower_(self) -> str: ...
+ norm_: str
+ @property
+ def shape_(self) -> str: ...
+ @property
+ def prefix_(self) -> str: ...
+ @property
+ def suffix_(self) -> str: ...
+ @property
+ def lang_(self) -> str: ...
+ lemma_: str
+ pos_: str
+ tag_: str
+ def has_dep(self) -> bool: ...
+ dep_: str
+ @property
+ def is_oov(self) -> bool: ...
+ @property
+ def is_stop(self) -> bool: ...
+ @property
+ def is_alpha(self) -> bool: ...
+ @property
+ def is_ascii(self) -> bool: ...
+ @property
+ def is_digit(self) -> bool: ...
+ @property
+ def is_lower(self) -> bool: ...
+ @property
+ def is_upper(self) -> bool: ...
+ @property
+ def is_title(self) -> bool: ...
+ @property
+ def is_punct(self) -> bool: ...
+ @property
+ def is_space(self) -> bool: ...
+ @property
+ def is_bracket(self) -> bool: ...
+ @property
+ def is_quote(self) -> bool: ...
+ @property
+ def is_left_punct(self) -> bool: ...
+ @property
+ def is_right_punct(self) -> bool: ...
+ @property
+ def is_currency(self) -> bool: ...
+ @property
+ def like_url(self) -> bool: ...
+ @property
+ def like_num(self) -> bool: ...
+ @property
+ def like_email(self) -> bool: ...
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 3fcfda691..b515ab67b 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
from .. import parts_of_speech
from ..errors import Errors, Warnings
+from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
@@ -209,8 +210,10 @@ cdef class Token:
return 0.0
vector = self.vector
xp = get_array_module(vector)
- return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
-
+ result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+ # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+ return result.item()
+
def has_morph(self):
"""Check whether the token has annotated morph information.
Return False when the morph annotation is unset/missing.
@@ -267,7 +270,7 @@ cdef class Token:
"""RETURNS (str): The text content of the span (with trailing
whitespace).
"""
- cdef unicode orth = self.vocab.strings[self.c.lex.orth]
+ cdef str orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy:
return orth + " "
else:
@@ -600,7 +603,7 @@ cdef class Token:
yield from word.subtree
@property
- def left_edge(self):
+ def left_edge(self) -> int:
"""The leftmost token of this token's syntactic descendents.
RETURNS (Token): The first token such that `self.is_ancestor(token)`.
@@ -608,7 +611,7 @@ cdef class Token:
return self.doc[self.c.l_edge]
@property
- def right_edge(self):
+ def right_edge(self) -> int:
"""The rightmost token of this token's syntactic descendents.
RETURNS (Token): The last token such that `self.is_ancestor(token)`.
@@ -743,7 +746,7 @@ cdef class Token:
@classmethod
def iob_strings(cls):
- return ("", "I", "O", "B")
+ return IOB_STRINGS
@property
def ent_iob_(self):
@@ -820,7 +823,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.norm]
- def __set__(self, unicode norm_):
+ def __set__(self, str norm_):
self.c.norm = self.vocab.strings.add(norm_)
@property
@@ -858,7 +861,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.lemma]
- def __set__(self, unicode lemma_):
+ def __set__(self, str lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
@@ -867,6 +870,8 @@ cdef class Token:
return parts_of_speech.NAMES[self.c.pos]
def __set__(self, pos_name):
+ if pos_name not in parts_of_speech.IDS:
+ raise ValueError(Errors.E1021.format(pp=pos_name))
self.c.pos = parts_of_speech.IDS[pos_name]
property tag_:
@@ -890,7 +895,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.dep]
- def __set__(self, unicode label):
+ def __set__(self, str label):
self.c.dep = self.vocab.strings.add(label)
@property
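
A brief sketch of the pos_ setter check and the shared IOB_STRINGS constant introduced above:

    import spacy
    from spacy.tokens import Token

    nlp = spacy.blank("en")
    token = nlp("run")[0]
    token.pos_ = "VERB"                          # valid Universal POS tag
    assert Token.iob_strings() == ("", "I", "O", "B")

    try:
        token.pos_ = "VRB"                       # typo -> ValueError (E1021)
    except ValueError as err:
        print("rejected:", err)
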
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index b7966fd6e..e9a4e1862 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -1,16 +1,31 @@
+from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy
-
from ..errors import Errors
+if TYPE_CHECKING:
+ from .doc import Doc
+ from .span import Span
+ from .token import Token
+
class Underscore:
mutable_types = (dict, list, set)
- doc_extensions = {}
- span_extensions = {}
- token_extensions = {}
+ doc_extensions: Dict[Any, Any] = {}
+ span_extensions: Dict[Any, Any] = {}
+ token_extensions: Dict[Any, Any] = {}
+ _extensions: Dict[str, Any]
+ _obj: Union["Doc", "Span", "Token"]
+ _start: Optional[int]
+ _end: Optional[int]
- def __init__(self, extensions, obj, start=None, end=None):
+ def __init__(
+ self,
+ extensions: Dict[str, Any],
+ obj: Union["Doc", "Span", "Token"],
+ start: Optional[int] = None,
+ end: Optional[int] = None,
+ ):
object.__setattr__(self, "_extensions", extensions)
object.__setattr__(self, "_obj", obj)
# Assumption is that for doc values, _start and _end will both be None
@@ -22,12 +37,12 @@ class Underscore:
object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end)
- def __dir__(self):
+ def __dir__(self) -> List[str]:
# Hack to enable autocomplete on custom extensions
extensions = list(self._extensions.keys())
return ["set", "get", "has"] + extensions
- def __getattr__(self, name):
+ def __getattr__(self, name: str) -> Any:
if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
@@ -55,7 +70,7 @@ class Underscore:
return new_default
return default
- def __setattr__(self, name, value):
+ def __setattr__(self, name: str, value: Any):
if name not in self._extensions:
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
@@ -64,28 +79,30 @@ class Underscore:
else:
self._doc.user_data[self._get_key(name)] = value
- def set(self, name, value):
+ def set(self, name: str, value: Any):
return self.__setattr__(name, value)
- def get(self, name):
+ def get(self, name: str) -> Any:
return self.__getattr__(name)
- def has(self, name):
+ def has(self, name: str) -> bool:
return name in self._extensions
- def _get_key(self, name):
+ def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
return ("._.", name, self._start, self._end)
@classmethod
- def get_state(cls):
+ def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
return cls.token_extensions, cls.span_extensions, cls.doc_extensions
@classmethod
- def load_state(cls, state):
+ def load_state(
+ cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
+ ) -> None:
cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
-def get_ext_args(**kwargs):
+def get_ext_args(**kwargs: Any):
"""Validate and convert arguments. Reused in Doc, Token and Span."""
default = kwargs.get("default")
getter = kwargs.get("getter")
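
A usage sketch for the Underscore machinery typed above: custom extensions are registered on the class and accessed through the ._ proxy (the extension name is invented for the example):

    import spacy
    from spacy.tokens import Doc

    Doc.set_extension("source", default=None, force=True)

    nlp = spacy.blank("en")
    doc = nlp("hello")
    doc._.source = "unit-test"
    assert doc._.get("source") == "unit-test"
    assert doc._.has("source")
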
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 055f30f42..a4feb01f4 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,4 +1,4 @@
-from .corpus import Corpus # noqa: F401
+from .corpus import Corpus, JsonlCorpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
@@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
-from .loggers import console_logger, wandb_logger # noqa: F401
+from .loggers import console_logger # noqa: F401
from .callbacks import create_copy_from_base_model # noqa: F401
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 0dae92143..63b54034c 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -22,8 +22,8 @@ class OrthVariantsPaired(BaseModel):
class OrthVariants(BaseModel):
- paired: List[OrthVariantsPaired] = {}
- single: List[OrthVariantsSingle] = {}
+ paired: List[OrthVariantsPaired] = []
+ single: List[OrthVariantsSingle] = []
@registry.augmenters("spacy.orth_variants.v1")
@@ -76,7 +76,7 @@ def lower_casing_augmenter(
def orth_variants_augmenter(
nlp: "Language",
example: Example,
- orth_variants: dict,
+ orth_variants: Dict,
*,
level: float = 0.0,
lower: float = 0.0,
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index e79ba79b0..f0b6c3123 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,4 +1,4 @@
-from typing import Union, Iterable, Sequence, TypeVar, List, Callable
+from typing import Union, Iterable, Sequence, TypeVar, List, Callable, Iterator
from typing import Optional, Any
from functools import partial
import itertools
@@ -6,7 +6,7 @@ import itertools
from ..util import registry, minibatch
-Sizing = Union[Iterable[int], int]
+Sizing = Union[Sequence[int], int]
ItemT = TypeVar("ItemT")
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
@@ -24,7 +24,7 @@ def configure_minibatch_by_padded_size(
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
- size (int or Iterable[int]): The largest padded size to batch sequences into.
+ size (int or Sequence[int]): The largest padded size to batch sequences into.
Can be a single integer, or a sequence, allowing for variable batch sizes.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
@@ -56,7 +56,7 @@ def configure_minibatch_by_words(
) -> BatcherT:
"""Create a batcher that uses the "minibatch by words" strategy.
- size (int or Iterable[int]): The target number of words per batch.
+ size (int or Sequence[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
@@ -80,7 +80,7 @@ def configure_minibatch(
) -> BatcherT:
"""Create a batcher that creates batches of the specified size.
- size (int or Iterable[int]): The target number of items per batch.
+ size (int or Sequence[int]): The target number of items per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
"""
optionals = {"get_length": get_length} if get_length is not None else {}
@@ -100,7 +100,7 @@ def minibatch_by_padded_size(
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
- size (int): The largest padded size to batch sequences into.
+ size (int or Sequence[int]): The largest padded size to batch sequences into.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
@@ -111,9 +111,9 @@ def minibatch_by_padded_size(
The `len` function is used by default.
"""
if isinstance(size, int):
- size_ = itertools.repeat(size)
+ size_ = itertools.repeat(size) # type: Iterator[int]
else:
- size_ = size
+ size_ = iter(size)
for outer_batch in minibatch(seqs, size=buffer):
outer_batch = list(outer_batch)
target_size = next(size_)
@@ -138,7 +138,7 @@ def minibatch_by_words(
themselves, or be discarded if discard_oversize=True.
seqs (Iterable[Sequence]): The sequences to minibatch.
- size (int or Iterable[int]): The target number of words per batch.
+ size (int or Sequence[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
@@ -147,11 +147,9 @@ def minibatch_by_words(
item. The `len` function is used by default.
"""
if isinstance(size, int):
- size_ = itertools.repeat(size)
- elif isinstance(size, List):
- size_ = iter(size)
+ size_ = itertools.repeat(size) # type: Iterator[int]
else:
- size_ = size
+ size_ = iter(size)
target_size = next(size_)
tol_size = target_size * tolerance
batch = []
@@ -216,7 +214,7 @@ def _batch_by_length(
lengths_indices = [(get_length(seq), i) for i, seq in enumerate(seqs)]
lengths_indices.sort()
batches = []
- batch = []
+ batch: List[int] = []
for length, i in lengths_indices:
if not batch:
batch.append(i)
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 2a21be98c..426fddf90 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Callable, Optional
from ..errors import Errors
from ..language import Language
from ..util import load_model, registry, logger
@@ -8,7 +8,7 @@ from ..util import load_model, registry, logger
def create_copy_from_base_model(
tokenizer: Optional[str] = None,
vocab: Optional[str] = None,
-) -> Language:
+) -> Callable[[Language], Language]:
def copy_from_base_model(nlp):
if tokenizer:
logger.info(f"Copying tokenizer from: {tokenizer}")
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 606dbfb4a..b9f929fcd 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -41,8 +41,11 @@ def create_docbin_reader(
@util.registry.readers("spacy.JsonlCorpus.v1")
def create_jsonl_reader(
- path: Optional[Path], min_length: int = 0, max_length: int = 0, limit: int = 0
-) -> Callable[["Language"], Iterable[Doc]]:
+ path: Optional[Union[str, Path]],
+ min_length: int = 0,
+ max_length: int = 0,
+ limit: int = 0,
+) -> Callable[["Language"], Iterable[Example]]:
return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
@@ -129,15 +132,15 @@ class Corpus:
"""
ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
if self.shuffle:
- ref_docs = list(ref_docs)
- random.shuffle(ref_docs)
+ ref_docs = list(ref_docs) # type: ignore
+ random.shuffle(ref_docs) # type: ignore
if self.gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs)
for real_eg in examples:
- for augmented_eg in self.augmenter(nlp, real_eg):
+ for augmented_eg in self.augmenter(nlp, real_eg): # type: ignore[operator]
yield augmented_eg
def _make_example(
@@ -190,7 +193,7 @@ class Corpus:
i = 0
for loc in locs:
loc = util.ensure_path(loc)
- if loc.parts[-1].endswith(FILE_TYPE):
+ if loc.parts[-1].endswith(FILE_TYPE): # type: ignore[union-attr]
doc_bin = DocBin().from_disk(loc)
docs = doc_bin.get_docs(vocab)
for doc in docs:
@@ -202,7 +205,7 @@ class Corpus:
class JsonlCorpus:
- """Iterate Doc objects from a file or directory of jsonl
+ """Iterate Example objects from a file or directory of jsonl
formatted raw text files.
path (Path): The directory or filename to read from.
@@ -221,7 +224,7 @@ class JsonlCorpus:
def __init__(
self,
- path: Union[str, Path],
+ path: Optional[Union[str, Path]],
*,
limit: int = 0,
min_length: int = 0,
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 3cfd33f95..b59288e38 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -13,7 +13,7 @@ import warnings
from .pretrain import get_tok2vec_ref
from ..lookups import Lookups
-from ..vectors import Vectors
+from ..vectors import Vectors, Mode as VectorsMode
from ..errors import Errors, Warnings
from ..schemas import ConfigSchemaTraining
from ..util import registry, load_model_from_config, resolve_dot_names, logger
@@ -71,10 +71,15 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
nlp._link_components()
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
if T["max_epochs"] == -1:
+ sample_size = 100
logger.debug(
- "Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+ f"Due to streamed train corpus, using only first {sample_size} "
+ f"examples for initialization. If necessary, provide all labels "
+ f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+ )
+ nlp.initialize(
+ lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
)
- nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer)
else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
@@ -86,12 +91,12 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
# Don't warn about components not in the pipeline
if listener not in nlp.pipe_names:
continue
-
if listener in frozen_components and name not in frozen_components:
logger.warning(Warnings.W087.format(name=name, listener=listener))
# We always check this regardless, in case user freezes tok2vec
if listener not in frozen_components and name in frozen_components:
- logger.warning(Warnings.W086.format(name=name, listener=listener))
+ if name not in T["annotating_components"]:
+ logger.warning(Warnings.W086.format(name=name, listener=listener))
return nlp
@@ -101,7 +106,7 @@ def init_vocab(
data: Optional[Path] = None,
lookups: Optional[Lookups] = None,
vectors: Optional[str] = None,
-) -> "Language":
+) -> None:
if lookups:
nlp.vocab.lookups = lookups
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
@@ -127,7 +132,7 @@ def init_vocab(
logger.info(f"Added vectors: {vectors}")
# warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
- vectors_hash = hash(nlp.vocab.vectors.to_bytes())
+ vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
if vectors_hash != sourced_vectors_hash:
warnings.warn(Warnings.W113.format(name=sourced_component))
@@ -139,7 +144,12 @@ def load_vectors_into_model(
) -> None:
"""Load word vectors from an installed model or path into a model instance."""
try:
- vectors_nlp = load_model(name)
+ # Load with the same vocab, which automatically adds the vectors to
+ # the current nlp object. Exclude lookups so they are not modified.
+ exclude = ["lookups"]
+ if not add_strings:
+ exclude.append("strings")
+ vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
except ConfigValidationError as e:
title = f"Config validation error for vectors {name}"
desc = (
@@ -150,16 +160,17 @@ def load_vectors_into_model(
err = ConfigValidationError.from_error(e, title=title, desc=desc)
raise err from None
- if len(vectors_nlp.vocab.vectors.keys()) == 0:
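+ # Warn if the loaded pipeline has no vectors: no keys in default mode, no rows in floret mode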
+ if (
+ len(vectors_nlp.vocab.vectors.keys()) == 0
+ and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
+ ) or (
+ vectors_nlp.vocab.vectors.shape[0] == 0
+ and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
+ ):
logger.warning(Warnings.W112.format(name=name))
- nlp.vocab.vectors = vectors_nlp.vocab.vectors
- if add_strings:
- # I guess we should add the strings from the vectors_nlp model?
- # E.g. if someone does a similarity query, they might expect the strings.
- for key in nlp.vocab.vectors.key2row:
- if key in vectors_nlp.vocab.strings:
- nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
+ for lex in nlp.vocab:
+ lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) # type: ignore[attr-defined]
def init_tok2vec(
@@ -192,41 +203,80 @@ def convert_vectors(
truncate: int,
prune: int,
name: Optional[str] = None,
+ mode: str = VectorsMode.default,
) -> None:
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
- nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
+ nlp.vocab.vectors = Vectors(
+ strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
+ )
for lex in nlp.vocab:
if lex.rank and lex.rank != OOV_RANK:
- nlp.vocab.vectors.add(lex.orth, row=lex.rank)
+ nlp.vocab.vectors.add(lex.orth, row=lex.rank) # type: ignore[attr-defined]
else:
if vectors_loc:
logger.info(f"Reading vectors from {vectors_loc}")
- vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+ vectors_data, vector_keys, floret_settings = read_vectors(
+ vectors_loc,
+ truncate,
+ mode=mode,
+ )
logger.info(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
- if vector_keys is not None:
+ if vector_keys is not None and mode != VectorsMode.floret:
for word in vector_keys:
if word not in nlp.vocab:
nlp.vocab[word]
if vectors_data is not None:
- nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
+ if mode == VectorsMode.floret:
+ nlp.vocab.vectors = Vectors(
+ strings=nlp.vocab.strings,
+ data=vectors_data,
+ **floret_settings,
+ )
+ else:
+ nlp.vocab.vectors = Vectors(
+ strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
+ )
if name is None:
# TODO: Is this correct? Does this matter?
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
- if prune >= 1:
+ if prune >= 1 and mode != VectorsMode.floret:
nlp.vocab.prune_vectors(prune)
-def read_vectors(vectors_loc: Path, truncate_vectors: int):
+def read_vectors(
+ vectors_loc: Path, truncate_vectors: int, *, mode: str = VectorsMode.default
+):
f = ensure_shape(vectors_loc)
- shape = tuple(int(size) for size in next(f).split())
- if truncate_vectors >= 1:
- shape = (truncate_vectors, shape[1])
+ header_parts = next(f).split()
+ shape = tuple(int(size) for size in header_parts[:2])
+ floret_settings = {}
+ if mode == VectorsMode.floret:
+ if len(header_parts) != 8:
+ raise ValueError(
+ "Invalid header for floret vectors. "
+ "Expected: bucket dim minn maxn hash_count hash_seed BOW EOW"
+ )
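+ # The first two header fields give the shape; the remaining six are the floret hashing settings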
+ floret_settings = {
+ "mode": "floret",
+ "minn": int(header_parts[2]),
+ "maxn": int(header_parts[3]),
+ "hash_count": int(header_parts[4]),
+ "hash_seed": int(header_parts[5]),
+ "bow": header_parts[6],
+ "eow": header_parts[7],
+ }
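+ # Truncation is not supported for floret vectors, whose rows are hash buckets rather than per-word entries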
+ if truncate_vectors >= 1:
+ raise ValueError(Errors.E860)
+ else:
+ assert len(header_parts) == 2
+ if truncate_vectors >= 1:
+ shape = (truncate_vectors, shape[1])
vectors_data = numpy.zeros(shape=shape, dtype="f")
vectors_keys = []
for i, line in enumerate(tqdm.tqdm(f)):
@@ -239,21 +289,21 @@ def read_vectors(vectors_loc: Path, truncate_vectors: int):
vectors_keys.append(word)
if i == truncate_vectors - 1:
break
- return vectors_data, vectors_keys
+ return vectors_data, vectors_keys, floret_settings
def open_file(loc: Union[str, Path]) -> IO:
"""Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc)
if tarfile.is_tarfile(str(loc)):
- return tarfile.open(str(loc), "r:gz")
+ return tarfile.open(str(loc), "r:gz") # type: ignore[return-value]
elif loc.parts[-1].endswith("gz"):
- return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+ return (line.decode("utf8") for line in gzip.open(str(loc), "r")) # type: ignore[return-value]
elif loc.parts[-1].endswith("zip"):
zip_file = zipfile.ZipFile(str(loc))
names = zip_file.namelist()
file_ = zip_file.open(names[0])
- return (line.decode("utf8") for line in file_)
+ return (line.decode("utf8") for line in file_) # type: ignore[return-value]
else:
return loc.open("r", encoding="utf8")
@@ -266,7 +316,7 @@ def ensure_shape(vectors_loc):
lines = open_file(vectors_loc)
first_line = next(lines)
try:
- shape = tuple(int(size) for size in first_line.split())
+ shape = tuple(int(size) for size in first_line.split()[:2])
except ValueError:
shape = None
if shape is not None:
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 42dae8fc4..64492c2bc 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Iterable, Union, Iterator
+from typing import List, Dict, Tuple, Iterable, Union, Iterator
import warnings
from ..errors import Errors, Warnings
@@ -6,7 +6,7 @@ from ..tokens import Span, Doc
def iob_to_biluo(tags: Iterable[str]) -> List[str]:
- out = []
+ out: List[str] = []
tags = list(tags)
while tags:
out.extend(_consume_os(tags))
@@ -90,7 +90,7 @@ def offsets_to_biluo_tags(
>>> assert tags == ["O", "O", 'U-LOC', "O"]
"""
# Ensure no overlapping entity labels exist
- tokens_in_ents = {}
+ tokens_in_ents: Dict[int, Tuple[int, int, Union[str, int]]] = {}
starts = {token.idx: token.i for token in doc}
ends = {token.idx + len(token): token.i for token in doc}
biluo = ["-" for _ in doc]
@@ -199,14 +199,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
pass
elif tag.startswith("I"):
if start is None:
- raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
+ raise ValueError(
+ Errors.E067.format(start="I", tags=list(tags)[: i + 1])
+ )
elif tag.startswith("U"):
entities.append((tag[2:], i, i))
elif tag.startswith("B"):
start = i
elif tag.startswith("L"):
if start is None:
- raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1]))
+ raise ValueError(
+ Errors.E067.format(start="L", tags=list(tags)[: i + 1])
+ )
entities.append((tag[2:], start, i))
start = None
else:
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index f7f70226d..edd0f1959 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -4,7 +4,6 @@ import tqdm
import sys
from ..util import registry
-from .. import util
from ..errors import Errors
if TYPE_CHECKING:
@@ -29,7 +28,7 @@ def console_logger(progress_bar: bool = False):
def setup_printer(
nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
- write = lambda text: stdout.write(f"{text}\n")
+ write = lambda text: print(text, file=stdout, flush=True)
msg = Printer(no_print=True)
# ensure that only trainable components are logged
logged_pipes = [
@@ -99,81 +98,3 @@ def console_logger(progress_bar: bool = False):
return log_step, finalize
return setup_printer
-
-
-@registry.loggers("spacy.WandbLogger.v2")
-def wandb_logger(
- project_name: str,
- remove_config_values: List[str] = [],
- model_log_interval: Optional[int] = None,
- log_dataset_dir: Optional[str] = None,
-):
- try:
- import wandb
-
- # test that these are available
- from wandb import init, log, join # noqa: F401
- except ImportError:
- raise ImportError(Errors.E880)
-
- console = console_logger(progress_bar=False)
-
- def setup_logger(
- nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
- ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
- config = nlp.config.interpolate()
- config_dot = util.dict_to_dot(config)
- for field in remove_config_values:
- del config_dot[field]
- config = util.dot_to_dict(config_dot)
- run = wandb.init(project=project_name, config=config, reinit=True)
- console_log_step, console_finalize = console(nlp, stdout, stderr)
-
- def log_dir_artifact(
- path: str,
- name: str,
- type: str,
- metadata: Optional[Dict[str, Any]] = {},
- aliases: Optional[List[str]] = [],
- ):
- dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata)
- dataset_artifact.add_dir(path, name=name)
- wandb.log_artifact(dataset_artifact, aliases=aliases)
-
- if log_dataset_dir:
- log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset")
-
- def log_step(info: Optional[Dict[str, Any]]):
- console_log_step(info)
- if info is not None:
- score = info["score"]
- other_scores = info["other_scores"]
- losses = info["losses"]
- wandb.log({"score": score})
- if losses:
- wandb.log({f"loss_{k}": v for k, v in losses.items()})
- if isinstance(other_scores, dict):
- wandb.log(other_scores)
- if model_log_interval and info.get("output_path"):
- if info["step"] % model_log_interval == 0 and info["step"] != 0:
- log_dir_artifact(
- path=info["output_path"],
- name="pipeline_" + run.id,
- type="checkpoint",
- metadata=info,
- aliases=[
- f"epoch {info['epoch']} step {info['step']}",
- "latest",
- "best"
- if info["score"] == max(info["checkpoints"])[0]
- else "",
- ],
- )
-
- def finalize() -> None:
- console_finalize()
- wandb.join()
-
- return log_step, finalize
-
- return setup_logger
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 09c54fc9f..06372cbb0 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -32,7 +32,7 @@ def train(
"""Train a pipeline.
nlp (Language): The initialized nlp object with the full config.
- output_path (Path): Optional output path to save trained model to.
+ output_path (Optional[Path]): Optional output path to save trained model to.
use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
before calling this function.
stdout (file): A file-like object to write output messages. To disable
@@ -194,17 +194,17 @@ def train_while_improving(
else:
dropouts = dropout
results = []
- losses = {}
+ losses: Dict[str, float] = {}
words_seen = 0
start_time = timer()
for step, (epoch, batch) in enumerate(train_data):
- dropout = next(dropouts)
+ dropout = next(dropouts) # type: ignore
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(
subbatch,
drop=dropout,
losses=losses,
- sgd=False,
+ sgd=False, # type: ignore[arg-type]
exclude=exclude,
annotates=annotating_components,
)
@@ -214,9 +214,9 @@ def train_while_improving(
name not in exclude
and hasattr(proc, "is_trainable")
and proc.is_trainable
- and proc.model not in (True, False, None)
+ and proc.model not in (True, False, None) # type: ignore[attr-defined]
):
- proc.finish_update(optimizer)
+ proc.finish_update(optimizer) # type: ignore[attr-defined]
optimizer.step_schedules()
if not (step % eval_frequency):
if optimizer.averages:
@@ -310,13 +310,13 @@ def create_train_batches(
):
epoch = 0
if max_epochs >= 0:
- examples = list(corpus(nlp))
+ examples = list(corpus(nlp)) # type: Iterable[Example]
if not examples:
# Raise error if no data
raise ValueError(Errors.E986)
while max_epochs < 1 or epoch != max_epochs:
if max_epochs >= 0:
- random.shuffle(examples)
+ random.shuffle(examples) # type: ignore
else:
examples = corpus(nlp)
for batch in batcher(examples):
@@ -353,7 +353,7 @@ def create_before_to_disk_callback(
return before_to_disk
-def clean_output_dir(path: Union[str, Path]) -> None:
+def clean_output_dir(path: Optional[Path]) -> None:
"""Remove an existing output directory. Typically used to ensure that that
a directory like model-best and its contents aren't just being overwritten
by nlp.to_disk, which could preserve existing subdirectories (e.g.
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 6d7850212..52af84aaf 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -31,6 +31,8 @@ def pretrain(
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
+ # Ignore the init_tok2vec setting during pretraining, since the tok2vec weights are created here
+ config["initialize"]["init_tok2vec"] = None
nlp = load_model_from_config(config)
_config = nlp.config.interpolate()
P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
@@ -41,14 +43,20 @@ def pretrain(
optimizer = P["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
- _resume_model(model, resume_path, epoch_resume, silent=silent)
+ epoch_resume = _resume_model(model, resume_path, epoch_resume, silent=silent)
else:
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
+
objective = model.attrs["loss"]
# TODO: move this to logger function?
tracker = ProgressTracker(frequency=10000)
- msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
+ if P["n_save_epoch"]:
+ msg.divider(
+ f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch"
+ )
+ else:
+ msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
@@ -77,7 +85,12 @@ def pretrain(
msg.row(progress, **row_settings)
if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
_save_model(epoch, is_temp=True)
- _save_model(epoch)
+
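+ # If n_save_epoch is set, save every n_save_epoch epochs (and on the final epoch); otherwise save after every epoch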
+ if P["n_save_epoch"]:
+ if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+ _save_model(epoch)
+ else:
+ _save_model(epoch)
tracker.epoch_loss = 0.0
@@ -92,21 +105,26 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
def _resume_model(
- model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
-) -> None:
+ model: Model, resume_path: Path, epoch_resume: Optional[int], silent: bool = True
+) -> int:
msg = Printer(no_print=silent)
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
- # Parse the epoch number from the given weight file
- model_name = re.search(r"model\d+\.bin", str(resume_path))
- if model_name:
- # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
- epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
- msg.info(f"Resuming from epoch: {epoch_resume}")
- else:
- msg.info(f"Resuming from epoch: {epoch_resume}")
+
+ if epoch_resume is None:
+ # Parse the epoch number from the given weight file
+ model_name = re.search(r"model\d+\.bin", str(resume_path))
+ if model_name:
+ # Default weight file name, so parse the resume epoch from it by cutting off 'model' and '.bin'
+ epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
+ else:
+ # No epoch given and couldn't infer it
+ raise ValueError(Errors.E1020)
+
+ msg.info(f"Resuming from epoch: {epoch_resume}")
+ return epoch_resume
def make_update(
diff --git a/spacy/ty.py b/spacy/ty.py
new file mode 100644
index 000000000..8f2903d78
--- /dev/null
+++ b/spacy/ty.py
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+from .compat import Protocol, runtime_checkable
+
+from thinc.api import Optimizer, Model
+
+if TYPE_CHECKING:
+ from .language import Language
+ from .training import Example
+
+
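+# Structural interfaces for pipeline components; @runtime_checkable allows isinstance() checks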
+@runtime_checkable
+class TrainableComponent(Protocol):
+ model: Any
+ is_trainable: bool
+
+ def update(
+ self,
+ examples: Iterable["Example"],
+ *,
+ drop: float = 0.0,
+ sgd: Optional[Optimizer] = None,
+ losses: Optional[Dict[str, float]] = None
+ ) -> Dict[str, float]:
+ ...
+
+ def finish_update(self, sgd: Optimizer) -> None:
+ ...
+
+
+@runtime_checkable
+class InitializableComponent(Protocol):
+ def initialize(
+ self,
+ get_examples: Callable[[], Iterable["Example"]],
+ nlp: "Language",
+ **kwargs: Any
+ ):
+ ...
+
+
+@runtime_checkable
+class ListenedToComponent(Protocol):
+ model: Any
+ listeners: Sequence[Model]
+ listener_map: Dict[str, Sequence[Model]]
+ listening_components: List[str]
+
+ def add_listener(self, listener: Model, component_name: str) -> None:
+ ...
+
+ def remove_listener(self, listener: Model, component_name: str) -> bool:
+ ...
+
+ def find_listeners(self, component) -> None:
+ ...
diff --git a/spacy/util.py b/spacy/util.py
index 421287ce2..14714143c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,4 +1,5 @@
-from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
+from typing import List, Mapping, NoReturn, Union, Dict, Any, Set
+from typing import Optional, Iterable, Callable, Tuple, Type
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
from types import ModuleType
import os
@@ -16,16 +17,20 @@ import numpy
import srsly
import catalogue
from catalogue import RegistryError, Registry
+import langcodes
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
+from packaging.requirements import Requirement
import subprocess
from contextlib import contextmanager
+from collections import defaultdict
import tempfile
import shutil
import shlex
import inspect
+import pkgutil
import logging
try:
@@ -33,11 +38,6 @@ try:
except ImportError:
cupy = None
-try: # Python 3.8
- import importlib.metadata as importlib_metadata
-except ImportError:
- from catalogue import _importlib_metadata as importlib_metadata
-
# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
@@ -46,13 +46,14 @@ from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
+ from .pipeline import Pipe # noqa: F401
from .tokens import Doc, Span # noqa: F401
from .vocab import Vocab # noqa: F401
@@ -62,7 +63,7 @@ OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
-# Default order of sections in the config.cfg. Not all sections needs to exist,
+# Default order of sections in the config file. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on
@@ -95,6 +96,7 @@ class registry(thinc.registry):
readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
+ scorers = catalogue.create("spacy", "scorers", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the
@@ -143,6 +145,32 @@ class registry(thinc.registry):
) from None
return func
+ @classmethod
+ def find(cls, registry_name: str, func_name: str) -> Callable:
+ """Get info about a registered function from the registry."""
+ # We're overwriting this classmethod so we're able to provide more
+ # specific error messages and implement a fallback to spacy-legacy.
+ if not hasattr(cls, registry_name):
+ names = ", ".join(cls.get_registry_names()) or "none"
+ raise RegistryError(Errors.E892.format(name=registry_name, available=names))
+ reg = getattr(cls, registry_name)
+ try:
+ func_info = reg.find(func_name)
+ except RegistryError:
+ if func_name.startswith("spacy."):
+ legacy_name = func_name.replace("spacy.", "spacy-legacy.")
+ try:
+ return reg.find(legacy_name)
+ except catalogue.RegistryError:
+ pass
+ available = ", ".join(sorted(reg.get_all().keys())) or "none"
+ raise RegistryError(
+ Errors.E893.format(
+ name=func_name, reg_name=registry_name, available=available
+ )
+ ) from None
+ return func_info
+
@classmethod
def has(cls, registry_name: str, func_name: str) -> bool:
"""Check whether a function is available in a registry."""
@@ -232,21 +260,89 @@ def lang_class_is_loaded(lang: str) -> bool:
return lang in registry.languages
-def get_lang_class(lang: str) -> "Language":
+def find_matching_language(lang: str) -> Optional[str]:
+ """
+ Given an IETF language code, find a supported spaCy language that is a
+ close match for it (according to Unicode CLDR language-matching rules).
+ This allows for language aliases, ISO 639-2 codes, more detailed language
+ tags, and close matches.
+
+ Returns the language code if a matching language is available, or None
+ if there is no matching language.
+
+ >>> find_matching_language('en')
+ 'en'
+ >>> find_matching_language('pt-BR') # Brazilian Portuguese
+ 'pt'
+ >>> find_matching_language('fra') # an ISO 639-2 code for French
+ 'fr'
+ >>> find_matching_language('iw') # obsolete alias for Hebrew
+ 'he'
+ >>> find_matching_language('no') # Norwegian
+ 'nb'
+ >>> find_matching_language('mo') # old code for ro-MD
+ 'ro'
+ >>> find_matching_language('zh-Hans') # Simplified Chinese
+ 'zh'
+ >>> find_matching_language('zxx')
+ None
+ """
+ import spacy.lang # noqa: F401
+
+ if lang == "xx":
+ return "xx"
+
+ # Find out which language modules we have
+ possible_languages = []
+ for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore
+ code = modinfo.name
+ if code == "xx":
+ # Temporarily make 'xx' into a valid language code
+ possible_languages.append("mul")
+ elif langcodes.tag_is_valid(code):
+ possible_languages.append(code)
+
+ # Distances from 1-9 allow near misses like Bosnian -> Croatian and
+ # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
+ # more possibilities, such as variants of Chinese like 'wuu', but text that
+ # is labeled that way is probably trying to be distinct from 'zh' and
+ # shouldn't automatically match.
+ match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
+ if match == "mul":
+ # Convert 'mul' back to spaCy's 'xx'
+ return "xx"
+ else:
+ return match
+
+
+def get_lang_class(lang: str) -> Type["Language"]:
"""Import and load a Language class.
- lang (str): Two-letter language code, e.g. 'en'.
+ lang (str): IETF language code, such as 'en'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
if lang in registry.languages:
return registry.languages.get(lang)
else:
+ # Find the language in the spacy.lang subpackage
try:
module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
- raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
- set_lang_class(lang, getattr(module, module.__all__[0]))
+ # Find a matching language. For example, if the language 'no' is
+ # requested, we can use language-matching to load `spacy.lang.nb`.
+ try:
+ match = find_matching_language(lang)
+ except langcodes.tag_parser.LanguageTagError:
+ # proceed to raising an import error
+ match = None
+
+ if match:
+ lang = match
+ module = importlib.import_module(f".lang.{lang}", "spacy")
+ else:
+ raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
+ set_lang_class(lang, getattr(module, module.__all__[0])) # type: ignore[attr-defined]
return registry.languages.get(lang)
@@ -321,13 +417,13 @@ def load_model(
if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))()
if is_package(name): # installed as package
- return load_model_from_package(name, **kwargs)
+ return load_model_from_package(name, **kwargs) # type: ignore[arg-type]
if Path(name).exists(): # path to model data directory
- return load_model_from_path(Path(name), **kwargs)
+ return load_model_from_path(Path(name), **kwargs) # type: ignore[arg-type]
elif hasattr(name, "exists"): # Path or Path-like to model data
- return load_model_from_path(name, **kwargs)
+ return load_model_from_path(name, **kwargs) # type: ignore[arg-type]
if name in OLD_MODEL_SHORTCUTS:
- raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))
+ raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) # type: ignore[index]
raise IOError(Errors.E050.format(name=name))
@@ -354,11 +450,11 @@ def load_model_from_package(
RETURNS (Language): The loaded nlp object.
"""
cls = importlib.import_module(name)
- return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)
+ return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) # type: ignore[attr-defined]
def load_model_from_path(
- model_path: Union[str, Path],
+ model_path: Path,
*,
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
@@ -369,7 +465,7 @@ def load_model_from_path(
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path.
- name (str): Package name or model path.
+ model_path (Path): Model path.
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
@@ -451,7 +547,9 @@ def get_sourced_components(
}
-def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[Any]:
+def resolve_dot_names(
+ config: Config, dot_names: List[Optional[str]]
+) -> Tuple[Any, ...]:
"""Resolve one or more "dot notation" names, e.g. corpora.train.
The paths could point anywhere into the config, so we don't know which
top-level section we'll be looking within.
@@ -461,7 +559,7 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A
"""
# TODO: include schema?
resolved = {}
- output = []
+ output: List[Any] = []
errors = []
for name in dot_names:
if name is None:
@@ -477,7 +575,7 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A
result = registry.resolve(config[section])
resolved[section] = result
try:
- output.append(dot_to_object(resolved, name))
+ output.append(dot_to_object(resolved, name)) # type: ignore[arg-type]
except KeyError:
msg = f"not a valid section reference: {name}"
errors.append({"loc": name.split("."), "msg": msg})
@@ -544,8 +642,8 @@ def load_config(
sys.stdin.read(), overrides=overrides, interpolate=interpolate
)
else:
- if not config_path or not config_path.exists() or not config_path.is_file():
- raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+ if not config_path or not config_path.is_file():
+ raise IOError(Errors.E053.format(path=config_path, name="config file"))
return config.from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
@@ -581,8 +679,8 @@ def get_package_version(name: str) -> Optional[str]:
RETURNS (str / None): The version or None if package not installed.
"""
try:
- return importlib_metadata.version(name)
- except importlib_metadata.PackageNotFoundError:
+ return importlib_metadata.version(name) # type: ignore[attr-defined]
+ except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined]
return None
@@ -605,7 +703,7 @@ def is_compatible_version(
constraint = f"=={constraint}"
try:
spec = SpecifierSet(constraint)
- version = Version(version)
+ version = Version(version) # type: ignore[assignment]
except (InvalidSpecifier, InvalidVersion):
return None
spec.prereleases = prereleases
@@ -639,13 +737,18 @@ def is_unconstrained_version(
return True
-def get_model_version_range(spacy_version: str) -> str:
- """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
- version. Models are always compatible across patch versions but not
- across minor or major versions.
+def split_requirement(requirement: str) -> Tuple[str, str]:
+ """Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3")."""
+ req = Requirement(requirement)
+ return (req.name, str(req.specifier))
+
+
+def get_minor_version_range(version: str) -> str:
+ """Generate a version range like >=1.2.3,<1.3.0 based on a given version
+ (e.g. of spaCy).
"""
- release = Version(spacy_version).release
- return f">={spacy_version},<{release[0]}.{release[1] + 1}.0"
+ release = Version(version).release
+ return f">={version},<{release[0]}.{release[1] + 1}.0"
def get_model_lower_version(constraint: str) -> Optional[str]:
@@ -714,7 +817,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
if "spacy_version" in meta:
if not is_compatible_version(about.__version__, meta["spacy_version"]):
lower_version = get_model_lower_version(meta["spacy_version"])
- lower_version = get_minor_version(lower_version)
+ lower_version = get_minor_version(lower_version) # type: ignore[arg-type]
if lower_version is not None:
lower_version = "v" + lower_version
elif "spacy_git_version" in meta:
@@ -733,7 +836,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=meta["spacy_version"],
- example=get_model_version_range(about.__version__),
+ example=get_minor_version_range(about.__version__),
)
warnings.warn(warn_msg)
return meta
@@ -756,7 +859,7 @@ def is_package(name: str) -> bool:
RETURNS (bool): True if installed package, False if not.
"""
try:
- importlib_metadata.distribution(name)
+ importlib_metadata.distribution(name) # type: ignore[attr-defined]
return True
except: # noqa: E722
return False
@@ -817,7 +920,7 @@ def run_command(
*,
stdin: Optional[Any] = None,
capture: bool = False,
-) -> Optional[subprocess.CompletedProcess]:
+) -> subprocess.CompletedProcess:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
@@ -860,8 +963,8 @@ def run_command(
message += f"\n\nProcess log (stdout and stderr):\n\n"
message += ret.stdout
error = subprocess.SubprocessError(message)
- error.ret = ret
- error.command = cmd_str
+ error.ret = ret # type: ignore[attr-defined]
+ error.command = cmd_str # type: ignore[attr-defined]
raise error
elif ret.returncode != 0:
sys.exit(ret.returncode)
@@ -869,7 +972,7 @@ def run_command(
@contextmanager
-def working_dir(path: Union[str, Path]) -> None:
+def working_dir(path: Union[str, Path]) -> Iterator[Path]:
"""Change current working directory and returns to previous on exit.
path (str / Path): The directory to navigate to.
@@ -917,7 +1020,7 @@ def is_in_jupyter() -> bool:
"""
# https://stackoverflow.com/a/39662359/6400719
try:
- shell = get_ipython().__class__.__name__
+ shell = get_ipython().__class__.__name__ # type: ignore[name-defined]
if shell == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole
except NameError:
@@ -999,7 +1102,7 @@ def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
spacy.lang.punctuation.TOKENIZER_PREFIXES.
RETURNS (Pattern): The regex object. to be used for Tokenizer.prefix_search.
"""
- expression = "|".join(["^" + piece for piece in entries if piece.strip()])
+ expression = "|".join(["^" + piece for piece in entries if piece.strip()]) # type: ignore[operator, union-attr]
return re.compile(expression)
@@ -1010,7 +1113,7 @@ def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
spacy.lang.punctuation.TOKENIZER_SUFFIXES.
RETURNS (Pattern): The regex object. to be used for Tokenizer.suffix_search.
"""
- expression = "|".join([piece + "$" for piece in entries if piece.strip()])
+ expression = "|".join([piece + "$" for piece in entries if piece.strip()]) # type: ignore[operator, union-attr]
return re.compile(expression)
@@ -1021,7 +1124,7 @@ def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
spacy.lang.punctuation.TOKENIZER_INFIXES.
RETURNS (regex object): The regex object. to be used for Tokenizer.infix_finditer.
"""
- expression = "|".join([piece for piece in entries if piece.strip()])
+ expression = "|".join([piece for piece in entries if piece.strip()]) # type: ignore[misc, union-attr]
return re.compile(expression)
@@ -1043,7 +1146,7 @@ def _get_attr_unless_lookup(
) -> Any:
for lookup in lookups:
if string in lookup:
- return lookup[string]
+ return lookup[string] # type: ignore[index]
return default_func(string)
@@ -1125,7 +1228,7 @@ def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
get_sort_key = lambda span: (span.end - span.start, -span.start)
sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
result = []
- seen_tokens = set()
+ seen_tokens: Set[int] = set()
for span in sorted_spans:
# Check for end - 1 here because boundaries are inclusive
if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
@@ -1144,7 +1247,7 @@ def from_bytes(
setters: Dict[str, Callable[[bytes], Any]],
exclude: Iterable[str],
) -> None:
- return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)
+ return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) # type: ignore[return-value]
def to_dict(
@@ -1206,8 +1309,8 @@ def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
RETURNS: The loaded module.
"""
spec = importlib.util.spec_from_file_location(name, str(loc))
- module = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(module)
+ module = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
+ spec.loader.exec_module(module) # type: ignore[union-attr]
return module
@@ -1297,7 +1400,7 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
values (Dict[str, Any]): The key/value pairs to convert.
RETURNS (Dict[str, dict]): The converted values.
"""
- result = {}
+ result: Dict[str, dict] = {}
for key, value in values.items():
path = result
parts = key.lower().split(".")
@@ -1375,13 +1478,13 @@ def get_arg_names(func: Callable) -> List[str]:
RETURNS (List[str]): The argument names.
"""
argspec = inspect.getfullargspec(func)
- return list(set([*argspec.args, *argspec.kwonlyargs]))
+ return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights(
- weights: List[Dict[str, float]],
- overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
-) -> Dict[str, float]:
+ weights: List[Dict[str, Optional[float]]],
+ overrides: Dict[str, Optional[float]] = SimpleFrozenDict(),
+) -> Dict[str, Optional[float]]:
"""Combine and normalize score weights defined by components, e.g.
{"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
@@ -1393,7 +1496,9 @@ def combine_score_weights(
# We divide each weight by the total weight sum.
# We first need to extract all None/null values for score weights that
# shouldn't be shown in the table *or* be weighted
- result = {key: value for w_dict in weights for (key, value) in w_dict.items()}
+ result: Dict[str, Optional[float]] = {
+ key: value for w_dict in weights for (key, value) in w_dict.items()
+ }
result.update(overrides)
weight_sum = sum([v if v else 0.0 for v in result.values()])
for key, value in result.items():
@@ -1415,13 +1520,13 @@ class DummyTokenizer:
def to_bytes(self, **kwargs):
return b""
- def from_bytes(self, _bytes_data, **kwargs):
+ def from_bytes(self, data: bytes, **kwargs) -> "DummyTokenizer":
return self
- def to_disk(self, _path, **kwargs):
+ def to_disk(self, path: Union[str, Path], **kwargs) -> None:
return None
- def from_disk(self, _path, **kwargs):
+ def from_disk(self, path: Union[str, Path], **kwargs) -> "DummyTokenizer":
return self
@@ -1483,7 +1588,13 @@ def check_bool_env_var(env_var: str) -> bool:
return bool(value)
-def _pipe(docs, proc, name, default_error_handler, kwargs):
+def _pipe(
+ docs: Iterable["Doc"],
+ proc: "Pipe",
+ name: str,
+ default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn],
+ kwargs: Mapping[str, Any],
+) -> Iterator["Doc"]:
if hasattr(proc, "pipe"):
yield from proc.pipe(docs, **kwargs)
else:
@@ -1497,7 +1608,7 @@ def _pipe(docs, proc, name, default_error_handler, kwargs):
kwargs.pop(arg)
for doc in docs:
try:
- doc = proc(doc, **kwargs)
+ doc = proc(doc, **kwargs) # type: ignore[call-arg]
yield doc
except Exception as e:
error_handler(name, proc, [doc], e)
@@ -1549,3 +1660,19 @@ def to_ternary_int(val) -> int:
return 0
else:
return -1
+
+
+# The following implementation of packages_distributions() is adapted from
+# importlib_metadata, which is distributed under the Apache 2.0 License.
+# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
+# See licenses/3rd_party_licenses.txt
+def packages_distributions() -> Dict[str, List[str]]:
+ """Return a mapping of top-level packages to their distributions. We're
+ inlining this helper from the importlib_metadata "backport" here, since
+ it's not available in the builtin importlib.metadata.
+ """
+ pkg_to_dist = defaultdict(list)
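+ # Map each top-level package listed in a distribution's top_level.txt (if present) to its distribution name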
+ for dist in importlib_metadata.distributions(): # type: ignore[attr-defined]
+ for pkg in (dist.read_text("top_level.txt") or "").split():
+ pkg_to_dist[pkg].append(dist.metadata["Name"])
+ return dict(pkg_to_dist)
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 7cb3322c2..bc4863703 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,16 +1,23 @@
cimport numpy as np
+from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
+from murmurhash.mrmr cimport hash128_x64
import functools
import numpy
+from typing import cast
+import warnings
+from enum import Enum
import srsly
-from thinc.api import get_array_module, get_current_ops
+from thinc.api import Ops, get_array_module, get_current_ops
+from thinc.backends import get_array_ops
+from thinc.types import Floats2d
from .strings cimport StringStore
from .strings import get_string_id
-from .errors import Errors
+from .errors import Errors, Warnings
from . import util
@@ -18,18 +25,13 @@ def unpickle_vectors(bytes_data):
return Vectors().from_bytes(bytes_data)
-class GlobalRegistry:
- """Global store of vectors, to avoid repeatedly loading the data."""
- data = {}
+class Mode(str, Enum):
+ default = "default"
+ floret = "floret"
@classmethod
- def register(cls, name, data):
- cls.data[name] = data
- return functools.partial(cls.get, name)
-
- @classmethod
- def get(cls, name):
- return cls.data[name]
+ def values(cls):
+ return list(cls.__members__.keys())
cdef class Vectors:
@@ -37,45 +39,93 @@ cdef class Vectors:
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
- (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
- rows in the vectors.data table.
+ (for GPU vectors).
- Multiple keys can be mapped to the same vector, and not all of the rows in
- the table need to be assigned - so len(list(vectors.keys())) may be
- greater or smaller than vectors.shape[0].
+ In the default mode, `vectors.key2row` is a dictionary mapping word hashes
+ to rows in the vectors.data table. Multiple keys can be mapped to the same
+ vector, and not all of the rows in the table need to be assigned - so
+ len(list(vectors.keys())) may be greater or smaller than vectors.shape[0].
+
+ In floret mode, the floret settings (minn, maxn, etc.) are used to
+ calculate the vector from the rows corresponding to the key's ngrams.
DOCS: https://spacy.io/api/vectors
"""
+ cdef public object strings
cdef public object name
+ cdef readonly object mode
cdef public object data
cdef public object key2row
cdef cppset[int] _unset
+ cdef readonly uint32_t minn
+ cdef readonly uint32_t maxn
+ cdef readonly uint32_t hash_count
+ cdef readonly uint32_t hash_seed
+ cdef readonly unicode bow
+ cdef readonly unicode eow
- def __init__(self, *, shape=None, data=None, keys=None, name=None):
+ def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store.
+ strings (StringStore): The string store.
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray or cupy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
+ mode (str): Vectors mode: "default" or "floret" (default: "default").
+ minn (int): The floret char ngram minn (default: 0).
+ maxn (int): The floret char ngram maxn (default: 0).
+ hash_count (int): The floret hash count (1-4, default: 1).
+ hash_seed (int): The floret hash seed (default: 0).
+ bow (str): The floret BOW string (default: "<").
+ eow (str): The floret EOW string (default: ">").
DOCS: https://spacy.io/api/vectors#init
"""
+ self.strings = strings
+ if self.strings is None:
+ self.strings = StringStore()
self.name = name
- if data is None:
- if shape is None:
- shape = (0,0)
- ops = get_current_ops()
- data = ops.xp.zeros(shape, dtype="f")
- self.data = data
+ if mode not in Mode.values():
+ raise ValueError(
+ Errors.E202.format(
+ name="vectors",
+ mode=mode,
+ modes=str(Mode.values())
+ )
+ )
+ self.mode = Mode(mode).value
self.key2row = {}
- if self.data is not None:
- self._unset = cppset[int]({i for i in range(self.data.shape[0])})
- else:
+ self.minn = minn
+ self.maxn = maxn
+ self.hash_count = hash_count
+ self.hash_seed = hash_seed
+ self.bow = bow
+ self.eow = eow
+ if self.mode == Mode.default:
+ if data is None:
+ if shape is None:
+ shape = (0,0)
+ ops = get_current_ops()
+ data = ops.xp.zeros(shape, dtype="f")
+ self._unset = cppset[int]({i for i in range(data.shape[0])})
+ else:
+ self._unset = cppset[int]()
+ self.data = data
+ if keys is not None:
+ for i, key in enumerate(keys):
+ self.add(key, row=i)
+ elif self.mode == Mode.floret:
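+ # floret vectors are keyed by ngram hashes, so a data table is required and explicit keys are not supported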
+ if maxn < minn:
+ raise ValueError(Errors.E863)
+ if hash_count < 1 or hash_count >= 5:
+ raise ValueError(Errors.E862)
+ if data is None:
+ raise ValueError(Errors.E864)
+ if keys is not None:
+ raise ValueError(Errors.E861)
+ self.data = data
self._unset = cppset[int]()
- if keys is not None:
- for i, key in enumerate(keys):
- self.add(key, row=i)
@property
def shape(self):
@@ -96,7 +146,7 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#size
"""
- return self.data.shape[0] * self.data.shape[1]
+ return self.data.size
@property
def is_full(self):
@@ -106,6 +156,8 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#is_full
"""
+ if self.mode == Mode.floret:
+ return True
return self._unset.size() == 0
@property
@@ -113,7 +165,8 @@ cdef class Vectors:
"""Get the number of keys in the table. Note that this is the number
of all keys, not just unique vectors.
- RETURNS (int): The number of keys in the table.
+ RETURNS (int): The number of keys in the table for default vectors.
+ For floret vectors, returns -1.
DOCS: https://spacy.io/api/vectors#n_keys
"""
@@ -125,25 +178,33 @@ cdef class Vectors:
def __getitem__(self, key):
"""Get a vector by key. If the key is not found, a KeyError is raised.
- key (int): The key to get the vector for.
+ key (str/int): The key to get the vector for.
RETURNS (ndarray): The vector for the key.
DOCS: https://spacy.io/api/vectors#getitem
"""
- i = self.key2row[key]
- if i is None:
- raise KeyError(Errors.E058.format(key=key))
- else:
- return self.data[i]
+ if self.mode == Mode.default:
+ i = self.key2row.get(get_string_id(key), None)
+ if i is None:
+ raise KeyError(Errors.E058.format(key=key))
+ else:
+ return self.data[i]
+ elif self.mode == Mode.floret:
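+ # floret vectors are computed on the fly from the key's ngram hash rows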
+ return self.get_batch([key])[0]
+ raise KeyError(Errors.E058.format(key=key))
def __setitem__(self, key, vector):
"""Set a vector for the given key.
- key (int): The key to set the vector for.
+ key (str/int): The key to set the vector for.
vector (ndarray): The vector to set.
DOCS: https://spacy.io/api/vectors#setitem
"""
+ if self.mode == Mode.floret:
+ warnings.warn(Warnings.W115.format(method="Vectors.__setitem__"))
+ return
+ key = get_string_id(key)
i = self.key2row[key]
self.data[i] = vector
if self._unset.count(i):
@@ -175,7 +236,10 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#contains
"""
- return key in self.key2row
+ if self.mode == Mode.floret:
+ return True
+ else:
+ return key in self.key2row
def resize(self, shape, inplace=False):
"""Resize the underlying vectors array. If inplace=True, the memory
@@ -192,6 +256,9 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#resize
"""
+ if self.mode == Mode.floret:
+ warnings.warn(Warnings.W115.format(method="Vectors.resize"))
+ return -1
xp = get_array_module(self.data)
if inplace:
if shape[1] != self.data.shape[1]:
@@ -207,7 +274,7 @@ cdef class Vectors:
self.data = resized_array
self._sync_unset()
removed_items = []
- for key, row in list(self.key2row.items()):
+ for key, row in self.key2row.copy().items():
if row >= shape[0]:
self.key2row.pop(key)
removed_items.append((key, row))
@@ -244,16 +311,23 @@ cdef class Vectors:
def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa.
- key (str / int): Find the row that the given key points to.
+ key (Union[int, str]): Find the row that the given key points to.
Returns int, -1 if missing.
- keys (iterable): Find rows that the keys point to.
+ keys (Iterable[Union[int, str]]): Find rows that the keys point to.
Returns ndarray.
row (int): Find the first key that points to the row.
Returns int.
- rows (iterable): Find the keys that point to the rows.
+ rows (Iterable[int]): Find the keys that point to the rows.
Returns ndarray.
RETURNS: The requested key, keys, row or rows.
"""
+ if self.mode == Mode.floret:
+ raise ValueError(
+ Errors.E858.format(
+ mode=self.mode,
+ alternative="Use Vectors[key] instead.",
+ )
+ )
if sum(arg is None for arg in (key, keys, row, rows)) != 3:
bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows}
raise ValueError(Errors.E059.format(kwargs=bad_kwargs))
@@ -273,6 +347,73 @@ cdef class Vectors:
results = [row2key[row] for row in rows]
return xp.asarray(results, dtype="uint64")
+ def _get_ngram_hashes(self, unicode s):
+ """Calculate up to 4 32-bit hash values with MurmurHash3_x64_128 using
+ the floret hash settings.
+ key (str): The string key.
+ RETURNS: A list of the integer hashes.
+ """
+ # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
+ cdef uint64_t[2] out
+ chars = s.encode("utf8")
+ cdef char* utf8_string = chars
+ hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
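+ # Split the two 64-bit hash values into four 32-bit values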
+ rows = [
+ out[0] & 0xffffffffu,
+ out[0] >> 32,
+ out[1] & 0xffffffffu,
+ out[1] >> 32,
+ ]
+ return rows[:min(self.hash_count, 4)]
+
+ def _get_ngrams(self, unicode key):
+ """Get all padded ngram strings using the ngram settings.
+ key (str): The string key.
+ RETURNS: A list of the ngram strings for the padded key.
+ """
+ key = self.bow + key + self.eow
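+ # Ngrams are the full padded key plus all substrings of length minn through maxn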
+ ngrams = [key] + [
+ key[start:start+ngram_size]
+ for ngram_size in range(self.minn, self.maxn + 1)
+ for start in range(0, len(key) - ngram_size + 1)
+ ]
+ return ngrams
+
+ def get_batch(self, keys):
+ """Get the vectors for the provided keys efficiently as a batch.
+ keys (Iterable[Union[int, str]]): The keys.
+ RETURNS: The requested vectors from the vector table.
+ """
+ ops = get_array_ops(self.data)
+ if self.mode == Mode.default:
+ rows = self.find(keys=keys)
+ vecs = self.data[rows]
+ elif self.mode == Mode.floret:
+ keys = [self.strings.as_string(key) for key in keys]
+ if sum(len(key) for key in keys) == 0:
+ return ops.xp.zeros((len(keys), self.data.shape[1]))
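+ # Deduplicate keys so each key's ngram rows are computed only once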
+ unique_keys = tuple(set(keys))
+ row_index = {key: i for i, key in enumerate(unique_keys)}
+ rows = [row_index[key] for key in keys]
+ indices = []
+ lengths = []
+ for key in unique_keys:
+ if key == "":
+ ngram_rows = []
+ else:
+ ngram_rows = [
+ h % self.data.shape[0]
+ for ngram in self._get_ngrams(key)
+ for h in self._get_ngram_hashes(ngram)
+ ]
+ indices.extend(ngram_rows)
+ lengths.append(len(ngram_rows))
+ indices = ops.asarray(indices, dtype="int32")
+ lengths = ops.asarray(lengths, dtype="int32")
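+ # Average the ngram rows per unique key, then map back to the original key order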
+ vecs = ops.reduce_mean(cast(Floats2d, self.data[indices]), lengths)
+ vecs = vecs[rows]
+ return ops.as_contig(vecs)
+
def add(self, key, *, vector=None, row=None):
"""Add a key to the table. Keys can be mapped to an existing vector
by setting `row`, or a new vector can be added.
@@ -284,6 +425,9 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#add
"""
+ if self.mode == Mode.floret:
+ warnings.warn(Warnings.W115.format(method="Vectors.add"))
+ return -1
# use int for all keys and rows in key2row for more efficient access
# and serialization
key = int(get_string_id(key))
@@ -324,6 +468,11 @@ cdef class Vectors:
RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)`
tuple.
"""
+ if self.mode == Mode.floret:
+ raise ValueError(Errors.E858.format(
+ mode=self.mode,
+ alternative="",
+ ))
xp = get_array_module(self.data)
filled = sorted(list({row for row in self.key2row.values()}))
if len(filled) < n:
@@ -368,7 +517,35 @@ cdef class Vectors:
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
- def to_disk(self, path, **kwargs):
+ def to_ops(self, ops: Ops):
+ self.data = ops.asarray(self.data)
+
+ def _get_cfg(self):
+ if self.mode == Mode.default:
+ return {
+ "mode": Mode(self.mode).value,
+ }
+ elif self.mode == Mode.floret:
+ return {
+ "mode": Mode(self.mode).value,
+ "minn": self.minn,
+ "maxn": self.maxn,
+ "hash_count": self.hash_count,
+ "hash_seed": self.hash_seed,
+ "bow": self.bow,
+ "eow": self.eow,
+ }
+
+ def _set_cfg(self, cfg):
+ self.mode = Mode(cfg.get("mode", Mode.default)).value
+ self.minn = cfg.get("minn", 0)
+ self.maxn = cfg.get("maxn", 0)
+ self.hash_count = cfg.get("hash_count", 0)
+ self.hash_seed = cfg.get("hash_seed", 0)
+ self.bow = cfg.get("bow", "<")
+ self.eow = cfg.get("eow", ">")
+
+ def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
@@ -390,12 +567,14 @@ cdef class Vectors:
save_array(self.data, _file)
serializers = {
+ "strings": lambda p: self.strings.to_disk(p.with_suffix(".json")),
"vectors": lambda p: save_vectors(p),
- "key2row": lambda p: srsly.write_msgpack(p, self.key2row)
+ "key2row": lambda p: srsly.write_msgpack(p, self.key2row),
+ "vectors.cfg": lambda p: srsly.write_json(p, self._get_cfg()),
}
- return util.to_disk(path, serializers, [])
+ return util.to_disk(path, serializers, exclude)
- def from_disk(self, path, **kwargs):
+ def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
@@ -422,17 +601,23 @@ cdef class Vectors:
if path.exists():
self.data = ops.xp.load(str(path))
+ def load_settings(path):
+ if path.exists():
+ self._set_cfg(srsly.read_json(path))
+
serializers = {
+ "strings": lambda p: self.strings.from_disk(p.with_suffix(".json")),
"vectors": load_vectors,
"keys": load_keys,
"key2row": load_key2row,
+ "vectors.cfg": load_settings,
}
- util.from_disk(path, serializers, [])
+ util.from_disk(path, serializers, exclude)
self._sync_unset()
return self
- def to_bytes(self, **kwargs):
+ def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
@@ -447,12 +632,14 @@ cdef class Vectors:
return srsly.msgpack_dumps(self.data)
serializers = {
+ "strings": lambda: self.strings.to_bytes(),
"key2row": lambda: srsly.msgpack_dumps(self.key2row),
- "vectors": serialize_weights
+ "vectors": serialize_weights,
+ "vectors.cfg": lambda: srsly.json_dumps(self._get_cfg()),
}
- return util.to_bytes(serializers, [])
+ return util.to_bytes(serializers, exclude)
- def from_bytes(self, data, **kwargs):
+ def from_bytes(self, data, *, exclude=tuple()):
"""Load state from a binary string.
data (bytes): The data to load from.
@@ -469,13 +656,25 @@ cdef class Vectors:
self.data = xp.asarray(srsly.msgpack_loads(b))
deserializers = {
+ "strings": lambda b: self.strings.from_bytes(b),
"key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)),
- "vectors": deserialize_weights
+ "vectors": deserialize_weights,
+ "vectors.cfg": lambda b: self._set_cfg(srsly.json_loads(b))
}
- util.from_bytes(data, deserializers, [])
+ util.from_bytes(data, deserializers, exclude)
self._sync_unset()
return self
+ def clear(self):
+ """Clear all entries in the vector table.
+
+ DOCS: https://spacy.io/api/vectors#clear
+ """
+ if self.mode == Mode.floret:
+ raise ValueError(Errors.E859)
+ self.key2row = {}
+ self._sync_unset()
+
def _sync_unset(self):
filled = {row for row in self.key2row.values()}
self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled})
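With `exclude` now supported on `to_bytes`/`from_bytes` and `to_disk`/`from_disk`, and the floret settings serialized via `vectors.cfg`, a shared `StringStore` can be skipped during round-trips, which is how `Vocab` uses it further below. A hedged round-trip sketch, assuming a spaCy build that contains this diff:

```python
# Round-trip sketch for the updated Vectors serialization; assumes a spaCy
# build with this diff (Vectors accepting `strings`/`shape` kwargs and
# `exclude` support on to_bytes/from_bytes).
from spacy.strings import StringStore
from spacy.vectors import Vectors

strings = StringStore()
vectors = Vectors(strings=strings, shape=(10, 8))
payload = vectors.to_bytes(exclude=["strings"])   # StringStore is shared, so skip it
restored = Vectors(strings=strings, shape=(10, 8))
restored.from_bytes(payload, exclude=["strings"])
assert restored.shape == vectors.shape
```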
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 9067476f7..9c951b2b7 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -27,21 +27,21 @@ cdef class Vocab:
cdef Pool mem
cdef readonly StringStore strings
cdef public Morphology morphology
- cdef public object vectors
+ cdef public object _vectors
cdef public object _lookups
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
- cdef public object data_dir
+ cdef public object _unused_object # TODO remove in v4, see #9150
cdef public object lex_attr_getters
cdef public object cfg
- cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
+ cdef const LexemeC* get(self, Pool mem, str string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
- cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+ cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
- cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+ cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
new file mode 100644
index 000000000..713e85c01
--- /dev/null
+++ b/spacy/vocab.pyi
@@ -0,0 +1,78 @@
+from typing import Callable, Iterator, Optional, Union, List, Dict
+from typing import Any, Iterable
+from thinc.types import Floats1d, FloatsXd
+from . import Language
+from .strings import StringStore
+from .lexeme import Lexeme
+from .lookups import Lookups
+from .morphology import Morphology
+from .tokens import Doc, Span
+from .vectors import Vectors
+from pathlib import Path
+
+def create_vocab(
+ lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
+) -> Vocab: ...
+
+class Vocab:
+ cfg: Dict[str, Any]
+ get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]
+ lookups: Lookups
+ morphology: Morphology
+ strings: StringStore
+ vectors: Vectors
+ writing_system: Dict[str, Any]
+ def __init__(
+ self,
+ lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
+ strings: Optional[Union[List[str], StringStore]] = ...,
+ lookups: Optional[Lookups] = ...,
+ oov_prob: float = ...,
+ vectors_name: Optional[str] = ...,
+ writing_system: Dict[str, Any] = ...,
+ get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
+ ) -> None: ...
+ @property
+ def lang(self) -> str: ...
+ def __len__(self) -> int: ...
+ def add_flag(
+ self, flag_getter: Callable[[str], bool], flag_id: int = ...
+ ) -> int: ...
+ def __contains__(self, key: str) -> bool: ...
+ def __iter__(self) -> Iterator[Lexeme]: ...
+ def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ...
+ @property
+ def vectors_length(self) -> int: ...
+ def reset_vectors(
+ self, *, width: Optional[int] = ..., shape: Optional[int] = ...
+ ) -> None: ...
+ def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...
+ def get_vector(
+ self,
+ orth: Union[int, str],
+ minn: Optional[int] = ...,
+ maxn: Optional[int] = ...,
+ ) -> FloatsXd: ...
+ def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ...
+ def has_vector(self, orth: Union[int, str]) -> bool: ...
+ def to_disk(
+ self, path: Union[str, Path], *, exclude: Iterable[str] = ...
+ ) -> None: ...
+ def from_disk(
+ self, path: Union[str, Path], *, exclude: Iterable[str] = ...
+ ) -> Vocab: ...
+ def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
+ def from_bytes(
+ self, bytes_data: bytes, *, exclude: Iterable[str] = ...
+ ) -> Vocab: ...
+
+def pickle_vocab(vocab: Vocab) -> Any: ...
+def unpickle_vocab(
+ sstore: StringStore,
+ vectors: Any,
+ morphology: Any,
+ _unused_object: Any,
+ lex_attr_getters: Any,
+ lookups: Any,
+ get_noun_chunks: Any,
+) -> Vocab: ...
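The new stub makes `Vocab`'s attributes and return types visible to static type checkers (which ties in with the mypy step added to CI). A minimal sketch of what the stub enables, assuming mypy is configured for the project:

```python
# Type-checking sketch enabled by the new vocab.pyi stub: mypy can now verify
# attribute access and return types on Vocab without importing the Cython code.
from spacy.vocab import Vocab

def vector_width(vocab: Vocab) -> int:
    return vocab.vectors_length  # typed as int by the stub
```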
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 13dd675af..badd291ed 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -14,7 +14,7 @@ from .attrs cimport LANG, ORTH
from .compat import copy_reg
from .errors import Errors
from .attrs import intify_attrs, NORM, IS_STOP
-from .vectors import Vectors
+from .vectors import Vectors, Mode as VectorsMode
from .util import registry
from .lookups import Lookups
from . import util
@@ -60,8 +60,8 @@ cdef class Vocab:
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability.
- vectors_name (unicode): Optional name to identify the vectors table.
- get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]):
+ vectors_name (str): Optional name to identify the vectors table.
+ get_noun_chunks (Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]):
A function that yields base noun phrases used for Doc.noun_chunks.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -77,11 +77,21 @@ cdef class Vocab:
_ = self[string]
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings)
- self.vectors = Vectors(name=vectors_name)
+ self.vectors = Vectors(strings=self.strings, name=vectors_name)
self.lookups = lookups
self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
+ property vectors:
+ def __get__(self):
+ return self._vectors
+
+ def __set__(self, vectors):
+ for s in vectors.strings:
+ self.strings.add(s)
+ self._vectors = vectors
+ self._vectors.strings = self.strings
+
@property
def lang(self):
langfunc = None
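With the new `vectors` property above, assigning a vectors table to a vocab copies the table's strings into the shared `StringStore` and rebinds the table to it. A small behaviour sketch, assuming a spaCy build containing this diff:

```python
# Sketch only: shows the string syncing done by the Vocab.vectors setter above.
import spacy
from spacy.strings import StringStore
from spacy.vectors import Vectors

nlp = spacy.blank("en")
other_strings = StringStore(["pizza"])
vectors = Vectors(strings=other_strings, shape=(1, 4))

nlp.vocab.vectors = vectors  # setter copies strings and rebinds the table
assert "pizza" in nlp.vocab.strings
assert nlp.vocab.vectors.strings is nlp.vocab.strings
```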
@@ -105,7 +115,7 @@ cdef class Vocab:
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
- flag_getter (callable): A function `f(unicode) -> bool`, to get the
+ flag_getter (callable): A function `f(str) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
@@ -128,7 +138,7 @@ cdef class Vocab:
self.lex_attr_getters[flag_id] = flag_getter
return flag_id
- cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
+ cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,7 +172,7 @@ cdef class Vocab:
else:
return self._new_lexeme(mem, self.strings[orth])
- cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
+ cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
@@ -184,7 +194,7 @@ cdef class Vocab:
if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items():
value = func(string)
- if isinstance(value, unicode):
+ if isinstance(value, str):
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
@@ -201,7 +211,7 @@ cdef class Vocab:
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
- string (unicode): The ID string.
+ string (str): The ID string.
RETURNS (bool) Whether the string has an entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#contains
@@ -209,7 +219,7 @@ cdef class Vocab:
cdef hash_t int_key
if isinstance(key, bytes):
int_key = self.strings[key.decode("utf8")]
- elif isinstance(key, unicode):
+ elif isinstance(key, str):
int_key = self.strings[key]
else:
int_key = key
@@ -234,7 +244,7 @@ cdef class Vocab:
previously unseen unicode string is given, a new lexeme is created and
stored.
- id_or_string (int or unicode): The integer ID of a word, or its unicode
+ id_or_string (int or str): The integer ID of a word, or its unicode
string. If `int >= Lexicon.size`, `IndexError` is raised. If
`id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
@@ -247,7 +257,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#getitem
"""
cdef attr_t orth
- if isinstance(id_or_string, unicode):
+ if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string)
else:
orth = id_or_string
@@ -273,7 +283,7 @@ cdef class Vocab:
@property
def vectors_length(self):
- return self.vectors.data.shape[1]
+ return self.vectors.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
@@ -282,10 +292,10 @@ cdef class Vocab:
if width is not None and shape is not None:
raise ValueError(Errors.E065.format(width=width, shape=shape))
elif shape is not None:
- self.vectors = Vectors(shape=shape)
+ self.vectors = Vectors(strings=self.strings, shape=shape)
else:
- width = width if width is not None else self.vectors.data.shape[1]
- self.vectors = Vectors(shape=(self.vectors.shape[0], width))
+ width = width if width is not None else self.vectors.shape[1]
+ self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
def prune_vectors(self, nr_row, batch_size=1024):
"""Reduce the current vector table to `nr_row` unique entries. Words
@@ -314,6 +324,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#prune_vectors
"""
+ if self.vectors.mode != VectorsMode.default:
+ raise ValueError(Errors.E866)
ops = get_current_ops()
xp = get_array_module(self.vectors.data)
# Make sure all vectors are in the vocab
@@ -328,7 +340,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
- self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name)
+ self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
syn_keys = ops.to_numpy(syn_keys)
remap = {}
@@ -340,19 +352,12 @@ cdef class Vocab:
remap[word] = (synonym, score)
return remap
- def get_vector(self, orth, minn=None, maxn=None):
+ def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
- If `minn` is defined, then the resulting vector uses Fasttext's
- subword features by average over ngrams of `orth`.
-
orth (int / unicode): The hash value of a word, or its unicode string.
- minn (int): Minimum n-gram length used for Fasttext's ngram computation.
- Defaults to the length of `orth`.
- maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
- Defaults to the length of `orth`.
RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
@@ -361,47 +366,17 @@ cdef class Vocab:
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
- word = self[orth].orth_
- if orth in self.vectors.key2row:
+ if self.has_vector(orth):
return self.vectors[orth]
xp = get_array_module(self.vectors.data)
vectors = xp.zeros((self.vectors_length,), dtype="f")
- if minn is None:
- return vectors
- # Fasttext's ngram computation taken from
- # https://github.com/facebookresearch/fastText
- # Assign default ngram limit to maxn which is the length of the word.
- if maxn is None:
- maxn = len(word)
- ngrams_size = 0;
- for i in range(len(word)):
- ngram = ""
- if (word[i] and 0xC0) == 0x80:
- continue
- n = 1
- j = i
- while (j < len(word) and n <= maxn):
- if n > maxn:
- break
- ngram += word[j]
- j = j + 1
- while (j < len(word) and (word[j] and 0xC0) == 0x80):
- ngram += word[j]
- j = j + 1
- if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))):
- if self.strings[ngram] in self.vectors.key2row:
- vectors = xp.add(self.vectors[self.strings[ngram]], vectors)
- ngrams_size += 1
- n = n + 1
- if ngrams_size > 0:
- vectors = vectors * (1.0/ngrams_size)
return vectors
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
- orth (int / unicode): The word.
+ orth (int / str): The word.
vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set.
DOCS: https://spacy.io/api/vocab#set_vector
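The simplified `Vocab.get_vector` above drops the FastText-style `minn`/`maxn` subword fallback: a key that is present in the table returns its row, and anything else returns an all-zero vector of width `vectors_length`. A behaviour sketch, assuming a build with this diff:

```python
# Behaviour sketch for the simplified Vocab.get_vector; assumes a spaCy build
# containing this diff. With no vectors loaded, unknown keys yield zero vectors.
import spacy

nlp = spacy.blank("en")
vec = nlp.vocab.get_vector("gherkin")
print(vec.shape, bool(vec.any()))  # with no vectors loaded the width is 0, .any() is False
```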
@@ -417,13 +392,14 @@ cdef class Vocab:
self.vectors.resize((new_rows, width))
lex = self[orth] # Add word to vocab if necessary
row = self.vectors.add(orth, vector=vector)
- lex.rank = row
+ if row >= 0:
+ lex.rank = row
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID.
- orth (int / unicode): The word.
+ orth (int / str): The word.
RETURNS (bool): Whether the word has a vector.
DOCS: https://spacy.io/api/vocab#has_vector
@@ -448,9 +424,9 @@ cdef class Vocab:
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str or Path): A path to a directory, which will be created if
it doesn't exist.
- exclude (list): String names of serialization fields to exclude.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/vocab#to_disk
"""
@@ -461,7 +437,7 @@ cdef class Vocab:
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
if "vectors" not in "exclude":
- self.vectors.to_disk(path)
+ self.vectors.to_disk(path, exclude=["strings"])
if "lookups" not in "exclude":
self.lookups.to_disk(path)
@@ -469,8 +445,8 @@ cdef class Vocab:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory.
- exclude (list): String names of serialization fields to exclude.
+ path (str or Path): A path to a directory.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_disk
@@ -495,7 +471,7 @@ cdef class Vocab:
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
- exclude (list): String names of serialization fields to exclude.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_bytes
@@ -504,7 +480,7 @@ cdef class Vocab:
if self.vectors is None:
return None
else:
- return self.vectors.to_bytes()
+ return self.vectors.to_bytes(exclude=["strings"])
getters = {
"strings": lambda: self.strings.to_bytes(),
@@ -517,7 +493,7 @@ cdef class Vocab:
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
- exclude (list): String names of serialization fields to exclude.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_bytes
@@ -526,11 +502,10 @@ cdef class Vocab:
if self.vectors is None:
return None
else:
- return self.vectors.from_bytes(b)
+ return self.vectors.from_bytes(b, exclude=["strings"])
setters = {
"strings": lambda b: self.strings.from_bytes(b),
- "lexemes": lambda b: self.lexemes_from_bytes(b),
"vectors": lambda b: serialize_vectors(b),
"lookups": lambda b: self.lookups.from_bytes(b),
}
@@ -552,21 +527,21 @@ def pickle_vocab(vocab):
sstore = vocab.strings
vectors = vocab.vectors
morph = vocab.morphology
- data_dir = vocab.data_dir
+ _unused_object = vocab._unused_object
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lookups = vocab.lookups
get_noun_chunks = vocab.get_noun_chunks
return (unpickle_vocab,
- (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, get_noun_chunks))
+ (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
-def unpickle_vocab(sstore, vectors, morphology, data_dir,
+def unpickle_vocab(sstore, vectors, morphology, _unused_object,
lex_attr_getters, lookups, get_noun_chunks):
cdef Vocab vocab = Vocab()
vocab.vectors = vectors
vocab.strings = sstore
vocab.morphology = morphology
- vocab.data_dir = data_dir
+ vocab._unused_object = _unused_object
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.lookups = lookups
vocab.get_noun_chunks = get_noun_chunks
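The renamed `_unused_object` slot keeps the pickle protocol above backwards compatible while removing `data_dir`. A quick round-trip sketch exercising the `pickle_vocab`/`unpickle_vocab` pair, assuming a build containing this diff:

```python
# Pickle round-trip sketch for the Vocab; exercises pickle_vocab/unpickle_vocab.
import pickle
import spacy

nlp = spacy.blank("en")
nlp.vocab.strings.add("pizza")
vocab2 = pickle.loads(pickle.dumps(nlp.vocab))
assert "pizza" in vocab2.strings
```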
diff --git a/website/Dockerfile b/website/Dockerfile
new file mode 100644
index 000000000..f71733e55
--- /dev/null
+++ b/website/Dockerfile
@@ -0,0 +1,16 @@
+FROM node:11.15.0
+
+WORKDIR /spacy-io
+
+RUN npm install -g gatsby-cli@2.7.4
+
+COPY package.json .
+COPY package-lock.json .
+
+RUN npm install
+
+# This is so the installed node_modules will be up one directory
+# from where a user mounts files, so that they don't accidentally mount
+# their own node_modules from a different build
+# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
+WORKDIR /spacy-io/website/
diff --git a/website/README.md b/website/README.md
index 076032d92..db050cf03 100644
--- a/website/README.md
+++ b/website/README.md
@@ -554,6 +554,42 @@ extensions for your code editor. The
[`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
file in the root defines the settings used in this codebase.
+## Building & developing the site with Docker {#docker}
+Sometimes it's hard to get a local environment working due to rapid updates to node dependencies,
+so it may be easier to use Docker for building the docs.
+
+If you'd like to do this,
+**be sure you do *not* include your local `node_modules` folder**,
+since some dependencies need to be built for the image's environment.
+Rename the folder before running the command below.
+
+```bash
+docker run -it \
+ -v $(pwd):/spacy-io/website \
+ -p 8000:8000 \
+ ghcr.io/explosion/spacy-io \
+ gatsby develop -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:8000/
+in your browser, and still edit code in your editor while having the site
+reflect those changes.
+
+**Note**: If you're working on a Mac with an M1 processor,
+you might see segfault errors from `qemu` if you use the default image.
+To fix this, use the `arm64`-tagged image in the `docker run` command
+(`ghcr.io/explosion/spacy-io:arm64`).
+
+### Building the Docker image {#docker-build}
+
+If you'd like to build the image locally, you can do so like this:
+
+```bash
+docker build -t spacy-io .
+```
+
+This will take some time, so using the prebuilt image will save you a bit of time.
+
## Markdown reference {#markdown}
All page content and page meta lives in the `.md` files in the `/docs`
diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md
index d37c4561a..770bbde13 100644
--- a/website/UNIVERSE.md
+++ b/website/UNIVERSE.md
@@ -44,7 +44,7 @@ markup is correct.
"id": "unique-project-id",
"title": "Project title",
"slogan": "A short summary",
- "description": "A longer description – *Mardown allowed!*",
+ "description": "A longer description – *Markdown allowed!*",
"github": "user/repo",
"pip": "package-name",
"code_example": [
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index e90dc1183..07b76393f 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -82,7 +82,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
+| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ |
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
@@ -124,6 +124,14 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
argument that connects to the shared `tok2vec` component in the pipeline.
+Listeners work by caching the `Tok2Vec` output for a given batch of `Doc`s. This
+means that in order for a component to work with the listener, the batch of
+`Doc`s passed to the listener must be the same as the batch of `Doc`s passed to
+the `Tok2Vec`. As a result, any manipulation of the `Doc`s which would affect
+`Tok2Vec` output, such as to create special contexts or remove `Doc`s for which
+no prediction can be made, must happen inside the model, **after** the call to
+the `Tok2Vec` component.
+
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
@@ -150,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`,
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
subword information, without construction a fully character-based
representation. If pretrained vectors are available, they can be included in the
-representation as well, with the vectors table will be kept static (i.e. it's
+representation as well, with the vectors table kept static (i.e. it's
not updated).
| Name | Description |
@@ -288,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are
mapped to a zero vector. See the documentation on
[static vectors](/usage/embeddings-transformers#static-vectors) for details.
-| Name | Description |
+| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ |
| `nM` | The width of the static vectors. ~~Optional[int]~~ |
@@ -310,7 +318,7 @@ mapped to a zero vector. See the documentation on
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.
-| Name | Description |
+| Name | Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
@@ -332,15 +340,18 @@ for details and system requirements.
-### spacy-transformers.TransformerModel.v1 {#TransformerModel}
+### spacy-transformers.TransformerModel.v3 {#TransformerModel}
> #### Example Config
>
> ```ini
> [model]
-> @architectures = "spacy-transformers.TransformerModel.v1"
+> @architectures = "spacy-transformers.TransformerModel.v3"
> name = "roberta-base"
> tokenizer_config = {"use_fast": true}
+> transformer_config = {}
+> mixed_precision = true
+> grad_scaler_config = {"init_scale": 32768}
>
> [model.get_spans]
> @span_getters = "spacy-transformers.strided_spans.v1"
@@ -366,12 +377,31 @@ transformer weights across your pipeline. For a layer that's configured for use
in other components, see
[Tok2VecTransformer](/api/architectures#Tok2VecTransformer).
-| Name | Description |
-| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
-| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
-| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
+| Name | Description |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
+| `get_spans`          | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api) objects to be processed by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
+| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
+| `transformer_config` | Transformer settings passed to [`transformers.AutoConfig`](https://huggingface.co/transformers/model_doc/auto.html?highlight=autoconfig#transformers.AutoConfig) ~~Dict[str, Any]~~ |
+| `mixed_precision` | Replace whitelisted ops by half-precision counterparts. Speeds up training and prediction on GPUs with [Tensor Cores](https://developer.nvidia.com/tensor-cores) and reduces GPU memory use. ~~bool~~ |
+| `grad_scaler_config` | Configuration to pass to `thinc.api.PyTorchGradScaler` during training when `mixed_precision` is enabled. ~~Dict[str, Any]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
+
+
+Mixed-precision support is currently an experimental feature.
+
+
+
+
+- The `transformer_config` argument was added in
+ `spacy-transformers.TransformerModel.v2`.
+- The `mixed_precision` and `grad_scaler_config` arguments were added in
+ `spacy-transformers.TransformerModel.v3`.
+
+The other arguments are shared between all versions.
+
+
### spacy-transformers.TransformerListener.v1 {#TransformerListener}
@@ -403,16 +433,19 @@ a single token vector given zero or more wordpiece vectors.
| `upstream` | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
-### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
+### spacy-transformers.Tok2VecTransformer.v3 {#Tok2VecTransformer}
> #### Example Config
>
> ```ini
> [model]
-> @architectures = "spacy.Tok2VecTransformer.v1"
+> @architectures = "spacy-transformers.Tok2VecTransformer.v3"
> name = "albert-base-v2"
> tokenizer_config = {"use_fast": false}
+> transformer_config = {}
> grad_factor = 1.0
+> mixed_precision = true
+> grad_scaler_config = {"init_scale": 32768}
> ```
Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
@@ -421,13 +454,31 @@ Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
object, but it's a **simpler solution** if you only need the transformer within
one component.
-| Name | Description |
-| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
-| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
-| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
-| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+| Name | Description |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_spans`          | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api) objects to be processed by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
+| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
+| `transformer_config` | Settings to pass to the transformers forward pass. ~~Dict[str, Any]~~ |
+| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
+| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
+| `mixed_precision` | Replace whitelisted ops by half-precision counterparts. Speeds up training and prediction on GPUs with [Tensor Cores](https://developer.nvidia.com/tensor-cores) and reduces GPU memory use. ~~bool~~ |
+| `grad_scaler_config` | Configuration to pass to `thinc.api.PyTorchGradScaler` during training when `mixed_precision` is enabled. ~~Dict[str, Any]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+
+
+Mixed-precision support is currently an experimental feature.
+
+
+
+
+- The `transformer_config` argument was added in
+ `spacy-transformers.Tok2VecTransformer.v2`.
+- The `mixed_precision` and `grad_scaler_config` arguments were added in
+ `spacy-transformers.Tok2VecTransformer.v3`.
+
+The other arguments are shared between all versions.
+
+
## Pretraining architectures {#pretrain source="spacy/ml/models/multi_task.py"}
@@ -555,8 +606,8 @@ consists of either two or three subnetworks:
-[TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact same signature,
-but the `use_upper` argument was `True` by default.
+[TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
+same signature, but the `use_upper` argument was `True` by default.
diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md
index a253ca9f8..965bffbcc 100644
--- a/website/docs/api/attributeruler.md
+++ b/website/docs/api/attributeruler.md
@@ -48,12 +48,13 @@ Initialize the attribute ruler.
> ruler = nlp.add_pipe("attribute_ruler")
> ```
-| Name | Description |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
-| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
-| _keyword-only_ | |
-| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
+| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
+| _keyword-only_ | |
+| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
+| `scorer`       | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
@@ -175,21 +176,6 @@ Load attribute ruler patterns from morph rules.
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
-## AttributeRuler.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = ruler.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
-
## AttributeRuler.to_disk {#to_disk tag="method"}
Serialize the pipe to disk.
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 10ab2083e..89e2e87d9 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -148,8 +148,8 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
### init fill-config {#init-fill-config new="3"}
-Auto-fill a partial [`config.cfg` file](/usage/training#config) file with **all
-default values**, e.g. a config generated with the
+Auto-fill a partial [.cfg file](/usage/training#config) with **all default
+values**, e.g. a config generated with the
[quickstart widget](/usage/training#quickstart). Config files used for training
should always be complete and not contain any hidden defaults or missing values,
so this command helps you create your final training config. In order to find
@@ -175,7 +175,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
| Name | Description |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
-| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
+| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ |
| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
@@ -203,11 +203,12 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
+| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
+| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
@@ -260,16 +261,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| Name | Description |
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
-| `input_file` | Input file. ~~Path (positional)~~ |
+| `input_path` | Input file or directory. ~~Path (positional)~~ |
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
| `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ |
| `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
| `--seg-sents`, `-s` 2.2 | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ |
-| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
+| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
| `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ |
+| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ |
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ |
| `--lang`, `-l` 2.1 | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
+| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
@@ -817,6 +820,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final trained pipeline and the best trained pipeline. |
+### Calling the training function from Python {#train-function new="3.2"}
+
+The training CLI exposes a `train` helper function that lets you run the
+training just like `spacy train`. Usually it's easier to use the command line
+directly, but if you need to kick off training from code this is how to do it.
+
+> #### Example
+>
+> ```python
+> from spacy.cli.train import train
+>
+> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+>
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `config_path` | Path to the config to use for training. ~~Union[str, Path]~~ |
+| `output_path`  | Optional name of the directory to save output model in. If not provided, a model will not be saved. ~~Optional[Union[str, Path]]~~ |
+| _keyword-only_ | |
+| `use_gpu` | Which GPU to use. Defaults to -1 for no GPU. ~~int~~ |
+| `overrides` | Values to override config settings. ~~Dict[str, Any]~~ |
+
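A variant of the example above that also saves the trained pipeline and selects a GPU; the config and data paths are placeholders:

```python
from spacy.cli.train import train

# Save the best/last pipelines to ./output and train on GPU 0; the paths here
# are placeholders for your own config and .spacy files.
train(
    "./config.cfg",
    "./output",
    use_gpu=0,
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
)
```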
## pretrain {#pretrain new="2.1" tag="command,experimental"}
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 4ca5fb24d..b7aedc511 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -90,7 +90,6 @@ Defines the `nlp` object, its tokenizer and
> ```ini
> [components.textcat]
> factory = "textcat"
-> labels = ["POSITIVE", "NEGATIVE"]
>
> [components.textcat.model]
> @architectures = "spacy.TextCatBOW.v2"
@@ -182,25 +181,25 @@ single corpus once and then divide it up into `train` and `dev` partitions.
This section defines settings and controls for the training and evaluation
process that are used when you run [`spacy train`](/api/cli#train).
-| Name | Description |
-| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
-| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
-| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
-| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
-| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
-| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ |
-| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
-| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
-| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
-| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
-| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
-| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
-| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
-| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
-| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
+| Name | Description |
+| ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
+| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
+| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
+| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
+| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
+| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `annotating_components` 3.1 | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
+| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
+| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
+| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
+| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
+| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
+| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
### pretraining {#config-pretraining tag="section,optional"}
@@ -249,7 +248,7 @@ Also see the usage guides on the
| `after_init` | Optional callback to modify the `nlp` object after initialization. ~~Optional[Callable[[Language], Language]]~~ |
| `before_init` | Optional callback to modify the `nlp` object before initialization. ~~Optional[Callable[[Language], Language]]~~ |
| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
-| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. Ignored when actually running pretraining, as you're creating the file to be used later. ~~Optional[str]~~ |
| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
| `tokenizer` | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ |
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~ |
@@ -284,6 +283,10 @@ CLI [`train`](/api/cli#train) command. The built-in
of the `.conllu` format used by the
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
+Note that while this is the format used to save training data, you do not have
+to understand the internal details to use it or create training data. See the
+section on [preparing training data](/usage/training#training-data).
+
### JSON training format {#json-input tag="deprecated"}
@@ -297,7 +300,7 @@ objects to JSON, you can now serialize them directly using the
format:
```cli
-$ python -m spacy convert ./data.json ./output.spacy
+$ python -m spacy convert ./data.json .
```
@@ -532,7 +535,7 @@ As of spaCy v3.0, the `meta.json` **isn't** used to construct the language class
and pipeline anymore and only contains meta information for reference and for
creating a Python package with [`spacy package`](/api/cli#package). How to set
up the `nlp` object is now defined in the
-[`config.cfg`](/api/data-formats#config), which includes detailed information
+[config file](/api/data-formats#config), which includes detailed information
about the pipeline components and their model architectures, and all other
settings and hyperparameters used to train the pipeline. It's the **single
source of truth** used for loading a pipeline.
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index fa02a6f99..118cdc611 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -25,6 +25,20 @@ current state. The weights are updated such that the scores assigned to the set
of optimal actions is increased, while scores assigned to other actions are
decreased. Note that more than one action may be optimal for a given state.
+## Assigned Attributes {#assigned-attributes}
+
+Dependency predictions are assigned to the `Token.dep` and `Token.head` fields.
+Besides the dependencies themselves, the parser also decides sentence boundaries,
+which are saved in `Token.is_sent_start` and accessible via `Doc.sents`.
+
+| Location | Value |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Token.dep` | The type of dependency relation (hash). ~~int~~ |
+| `Token.dep_` | The type of dependency relation. ~~str~~ |
+| `Token.head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
+| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. After the parser runs this will be `True` or `False` for all tokens. ~~bool~~ |
+| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. ~~Iterator[Span]~~ |
+
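+As a quick illustration, here is a minimal sketch (assuming the trained English
+pipeline `en_core_web_sm` is installed) of inspecting the assigned attributes
+after the parser has run:
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
+for token in doc:
+    # dep_ is the dependency label, head is the syntactic parent
+    print(token.text, token.dep_, token.head.text, token.is_sent_start)
+# sentence boundaries decided by the parser
+print([sent.text for sent in doc.sents])
+```
+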
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -91,6 +105,7 @@ shortcut for this and instantiate the component using its string name and
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_deps`](/api/scorer#score_deps) for the attribute `"dep"` ignoring the labels `p` and `punct` and [`Scorer.score_spans`](/api/scorer/#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
## DependencyParser.\_\_call\_\_ {#call tag="method"}
@@ -259,21 +274,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-## DependencyParser.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = parser.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
-
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 9358507dc..9836b8c21 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
-| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
+| `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str, int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | |
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
@@ -212,7 +212,7 @@ alignment mode `"strict".
| Name | Description |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ |
-| `end` | The index of the last character after the span. ~int~~ |
+| `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
@@ -571,9 +571,9 @@ objects, if the entity recognizer has been applied.
> assert ents[0].text == "Mr. Best"
> ```
-| Name | Description |
-| ----------- | --------------------------------------------------------------------- |
-| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ |
+| Name | Description |
+| ----------- | ---------------------------------------------------------------- |
+| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span]~~ |
## Doc.spans {#spans tag="property"}
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index 3625ed790..b1d1798ba 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -16,7 +16,7 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:
```python
-### msgpack object structrue
+### msgpack object structure
{
"version": str, # DocBin version number
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 2994d934b..3d3372679 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -16,6 +16,16 @@ plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local
context of the mention.
+## Assigned Attributes {#assigned-attributes}
+
+Predictions, in the form of knowledge base IDs, will be assigned to
+`Token.ent_kb_id_`.
+
+| Location | Value |
+| ------------------ | --------------------------------- |
+| `Token.ent_kb_id` | Knowledge base ID (hash). ~~int~~ |
+| `Token.ent_kb_id_` | Knowledge base ID. ~~str~~ |
+
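+For illustration, a minimal sketch of reading these attributes, assuming a
+pipeline that contains a trained `entity_linker` component (the pipeline name
+`my_nel_pipeline` is only a placeholder):
+
+```python
+import spacy
+
+# placeholder: a pipeline with NER and a trained entity_linker component
+nlp = spacy.load("my_nel_pipeline")
+doc = nlp("Ada Lovelace was born in London.")
+for ent in doc.ents:
+    # every token of a linked entity carries the predicted KB ID
+    print(ent.text, [token.ent_kb_id_ for token in ent])
+```
+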
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -41,15 +51,17 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config)
> ```
-| Setting | Description |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
-| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
-| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
-| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
-| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| Setting | Description |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
+| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
+| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
+| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
+| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@@ -82,18 +94,20 @@ custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.
-| Name | Description |
-| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| _keyword-only_ | |
-| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
-| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
-| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
-| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
+| Name | Description |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
+| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
+| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
+| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
+| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
## EntityLinker.\_\_call\_\_ {#call tag="method"}
@@ -259,21 +273,6 @@ pipe's entity linking model and context encoder. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
-## EntityLinker.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = entity_linker.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | ---------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links) . ~~Dict[str, float]~~ |
-
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 601b644c1..14b6fece4 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -20,6 +20,24 @@ your entities will be close to their initial tokens. If your entities are long
and characterized by tokens in their middle, the component will likely not be a
good fit for your task.
+## Assigned Attributes {#assigned-attributes}
+
+Predictions will be saved to `Doc.ents` as a tuple. Each label will also be
+reflected in each underlying token, where it is saved in the `Token.ent_type`
+and `Token.ent_iob` fields. Note that by definition each token can only have one
+label.
+
+When setting `Doc.ents` to create training data, all the spans must be valid and
+non-overlapping, or an error will be thrown.
+
+| Location | Value |
+| ----------------- | ----------------------------------------------------------------- |
+| `Doc.ents` | The annotated spans. ~~Tuple[Span]~~ |
+| `Token.ent_iob` | An enum encoding of the IOB part of the named entity tag. ~~int~~ |
+| `Token.ent_iob_` | The IOB part of the named entity tag. ~~str~~ |
+| `Token.ent_type` | The label part of the named entity tag (hash). ~~int~~ |
+| `Token.ent_type_` | The label part of the named entity tag. ~~str~~ |
+
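+For illustration, a minimal sketch (assuming `en_core_web_sm` is installed) of
+reading the document- and token-level annotations, and of setting `Doc.ents`
+by hand, e.g. to create training data:
+
+```python
+import spacy
+from spacy.tokens import Span
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("Apple is opening its first big office in San Francisco.")
+print([(ent.text, ent.label_) for ent in doc.ents])
+print([(token.text, token.ent_iob_, token.ent_type_) for token in doc])
+
+# setting Doc.ents manually: the spans must be valid and non-overlapping
+doc2 = nlp.make_doc("I like London.")
+doc2.ents = [Span(doc2, 2, 3, label="GPE")]
+```
+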
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -47,7 +65,8 @@ architectures and their arguments and hyperparameters.
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |
+| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |
+| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/ner.pyx
@@ -251,21 +270,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-## EntityRecognizer.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = ner.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
-
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 66cb6d4e4..1ef283870 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -15,6 +15,27 @@ used on its own to implement a purely rule-based entity recognition system. For
usage examples, see the docs on
[rule-based entity recognition](/usage/rule-based-matching#entityruler).
+## Assigned Attributes {#assigned-attributes}
+
+This component assigns predictions in essentially the same way as the
+[`EntityRecognizer`](/api/entityrecognizer).
+
+Predictions can be accessed under `Doc.ents` as a tuple. Each label will also be
+reflected in each underlying token, where it is saved in the `Token.ent_type`
+and `Token.ent_iob` fields. Note that by definition each token can only have one
+label.
+
+When setting `Doc.ents` to create training data, all the spans must be valid and
+non-overlapping, or an error will be thrown.
+
+| Location | Value |
+| ----------------- | ----------------------------------------------------------------- |
+| `Doc.ents` | The annotated spans. ~~Tuple[Span]~~ |
+| `Token.ent_iob` | An enum encoding of the IOB part of the named entity tag. ~~int~~ |
+| `Token.ent_iob_` | The IOB part of the named entity tag. ~~str~~ |
+| `Token.ent_type` | The label part of the named entity tag (hash). ~~int~~ |
+| `Token.ent_type_` | The label part of the named entity tag. ~~str~~ |
+
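+For illustration, a minimal sketch that adds a single pattern to a blank
+pipeline and reads the resulting annotations:
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+ruler = nlp.add_pipe("entity_ruler")
+ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+doc = nlp("Apple is hiring.")
+print([(ent.text, ent.label_) for ent in doc.ents])
+print([(token.text, token.ent_iob_, token.ent_type_) for token in doc])
+```
+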
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -35,11 +56,12 @@ how the component should be configured. You can override its settings via the
> ```
| Setting | Description |
-| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ----------- |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
-| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `" | | "`. ~~str~~ |
+| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
+| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entityruler.py
@@ -64,22 +86,22 @@ be a token pattern (list) or a phrase pattern (string). For example:
> ```
| Name | Description |
-| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ----------- |
+| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
| `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | |
| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
-| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `" | | "`. ~~str~~ |
+| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
## EntityRuler.initialize {#initialize tag="method" new="3"}
Initialize the component with data and used before training to load in rules
-from a file. This method is typically called by
-[`Language.initialize`](/api/language#initialize) and lets you customize
-arguments it receives via the
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
+is typically called by [`Language.initialize`](/api/language#initialize) and
+lets you customize arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
@@ -188,6 +210,24 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on
| ---------- | ---------------------------------------------------------------- |
| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
+
+## EntityRuler.remove {#remove tag="method" new="3.2.1"}
+
+Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist.
+
+> #### Example
+>
+> ```python
+> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ruler.remove("apple")
+> ```
+
+| Name | Description |
+| ---------- | ---------------------------------------------------------------- |
+| `id` | The ID of the pattern rule. ~~str~~ |
+
## EntityRuler.to_disk {#to_disk tag="method"}
Save the entity ruler patterns to a directory. The patterns will be saved as
@@ -267,7 +307,7 @@ All labels present in the match patterns.
| ----------- | -------------------------------------- |
| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
-## EntityRuler.ent_ids {#labels tag="property" new="2.2.2"}
+## EntityRuler.ent_ids {#ent_ids tag="property" new="2.2.2"}
All entity IDs present in the `id` properties of the match patterns.
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index b09ae1aa2..8d7686243 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -446,7 +446,7 @@ component, adds it to the pipeline and returns it.
| `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ |
| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
-| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
+| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~ |
| `source` 3 | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
@@ -476,7 +476,7 @@ To create a component and add it to the pipeline, you should always use
| `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | |
-| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
+| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~ |
| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
@@ -1000,6 +1000,11 @@ subclasses like `English` or `German` to make language-specific functionality
like the [lexical attribute getters](/usage/linguistic-features#language-data)
available to the loaded object.
+Note that if you want to serialize and reload a whole pipeline, using this alone
+won't work; you also need to handle the config. See
+["Serializing the pipeline"](https://spacy.io/usage/saving-loading#pipeline) for
+details.
+
> #### Example
>
> ```python
@@ -1039,7 +1044,7 @@ available to the loaded object.
| Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
-| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
+| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
## Defaults {#defaults}
@@ -1077,9 +1082,9 @@ customize the default language data:
| --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `stop_words` | List of stop words, used for `Token.is_stop`. **Example:** [`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) ~~Set[str]~~ |
| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes. **Example:** [`de/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) ~~Dict[str, List[dict]]~~ |
-| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer. **Example:** [`puncutation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) ~~Optional[List[Union[str, Pattern]]]~~ |
-| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules. **Example:** [`fr/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/fr/tokenizer_exceptions.py) ~~Optional[Pattern]~~ |
-| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match. **Example:** [`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/tokenizer_exceptions.py) ~~Optional[Pattern]~~ |
+| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer. **Example:** [`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) ~~Optional[Sequence[Union[str, Pattern]]]~~ |
+| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules. **Example:** [`fr/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/fr/tokenizer_exceptions.py) ~~Optional[Callable]~~ |
+| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match. **Example:** [`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/tokenizer_exceptions.py) ~~Optional[Callable]~~ |
| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`. **Example:** [`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) ~~Dict[int, Callable[[str], Any]]~~ |
| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). **Example:** [`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py). ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}.`. **Example:** [`zh/__init__.py`](%%GITHUB_SPACY/spacy/lang/zh/__init__.py) ~~Dict[str, Any]~~ |
diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md
index 02b376780..916a5bf7f 100644
--- a/website/docs/api/legacy.md
+++ b/website/docs/api/legacy.md
@@ -105,7 +105,8 @@ and residual connections.
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1}
-Identical to [`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+Identical to
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except that `use_upper` was set to `True` by default.
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 995f890cd..2fa040917 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -4,7 +4,6 @@ tag: class
source: spacy/pipeline/lemmatizer.py
new: 3
teaser: 'Pipeline component for lemmatization'
-api_base_class: /api/pipe
api_string_name: lemmatizer
api_trainable: false
---
@@ -32,6 +31,15 @@ available in the pipeline and runs _before_ the lemmatizer.
+## Assigned Attributes {#assigned-attributes}
+
+Lemmas generated by rules or predicted will be saved to `Token.lemma`.
+
+| Location | Value |
+| -------------- | ------------------------- |
+| `Token.lemma` | The lemma (hash). ~~int~~ |
+| `Token.lemma_` | The lemma. ~~str~~ |
+
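+For illustration, a minimal sketch of reading the assigned lemmas, assuming a
+pipeline that includes a lemmatizer, such as `en_core_web_sm`:
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("The striped bats were hanging from the rafters.")
+print([(token.text, token.lemma_) for token in doc])
+```
+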
## Config and implementation
The default config is defined by the pipeline component factory and describes
@@ -48,11 +56,13 @@ data format used by the lookup and rule-based lemmatizers, see
> nlp.add_pipe("lemmatizer", config=config)
> ```
-| Setting | Description |
-| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ |
-| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
-| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
+| Setting | Description |
+| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ |
+| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
+| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
+| _keyword-only_ | |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
Many languages specify a default lemmatizer mode other than `lookup` if a better
lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 9c15f8797..3e7f9dc04 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -44,6 +44,9 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
+| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
+| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
@@ -77,13 +80,14 @@ it compares to another value.
> ]
> ```
-| Attribute | Description |
-| -------------------------- | ------------------------------------------------------------------------------------------------------- |
-| `IN` | Attribute value is member of a list. ~~Any~~ |
-| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
-| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
-| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
-| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
+| Attribute | Description |
+| -------------------------- | -------------------------------------------------------------------------------------------------------- |
+| `IN` | Attribute value is member of a list. ~~Any~~ |
+| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
+| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
+| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
+| `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ |
+| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
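+
+For illustration, a minimal sketch (assuming `en_core_web_sm` is installed)
+using `IS_SUPERSET` to match tokens whose morphological features include
+`Number=Plur`:
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.load("en_core_web_sm")
+matcher = Matcher(nlp.vocab)
+# match nouns whose MORPH values are a superset of ["Number=Plur"]
+pattern = [{"POS": "NOUN", "MORPH": {"IS_SUPERSET": ["Number=Plur"]}}]
+matcher.add("PLURAL_NOUN", [pattern])
+doc = nlp("The cats sat on the mats.")
+print([doc[start:end].text for _, start, end in matcher(doc)])
+```
+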
## Matcher.\_\_init\_\_ {#init tag="method"}
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index d2dd28ac2..434c56833 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -15,6 +15,16 @@ coarse-grained POS tags following the Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
annotation guidelines.
+## Assigned Attributes {#assigned-attributes}
+
+Predictions are saved to `Token.morph` and `Token.pos`.
+
+| Location | Value |
+| ------------- | ----------------------------------------- |
+| `Token.pos` | The UPOS part of speech (hash). ~~int~~ |
+| `Token.pos_` | The UPOS part of speech. ~~str~~ |
+| `Token.morph` | Morphological features. ~~MorphAnalysis~~ |
+
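+For illustration, a minimal sketch of reading the assigned attributes, assuming
+a pipeline that sets `pos` and `morph` (e.g. via a morphologizer or attribute
+ruler), such as `en_core_web_sm`:
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("She was reading the papers.")
+print([(token.text, token.pos_, str(token.morph)) for token in doc])
+```
+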
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -32,9 +42,12 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("morphologizer", config=config)
> ```
-| Setting | Description |
-| ------- | ------------------------------------------------------------------------------------------------------- |
-| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| Setting | Description |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
+| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
@@ -46,6 +59,19 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
+The `overwrite` and `extend` settings determine how existing annotation is
+handled (using the example of existing annotation `A=B|C=D` + predicted
+annotation `C=E|X=Y`):
+
+- `overwrite=True, extend=True`: overwrite values of existing features, add any
+ new features (`A=B|C=D` + `C=E|X=Y` → `A=B|C=E|X=Y`)
+- `overwrite=True, extend=False`: overwrite completely, removing any existing
+ features (`A=B|C=D` + `C=E|X=Y` → `C=E|X=Y`)
+- `overwrite=False, extend=True`: keep values of existing features, add any new
+ features (`A=B|C=D` + `C=E|X=Y` → `A=B|C=D|X=Y`)
+- `overwrite=False, extend=False`: do not modify the existing annotation if set
+ (`A=B|C=D` + `C=E|X=Y` → `A=B|C=D`)
+
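+For example, a minimal sketch of selecting the third combination via the
+component config:
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+# keep the values of existing features and only add new feature types
+nlp.add_pipe("morphologizer", config={"overwrite": False, "extend": True})
+```
+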
> #### Example
>
> ```python
@@ -61,11 +87,15 @@ shortcut for this and instantiate the component using its string name and
> morphologizer = Morphologizer(nlp.vocab, model)
> ```
-| Name | Description |
-| ------- | -------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| Name | Description |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
+| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
## Morphologizer.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md
index 565e520b5..20fcd1a40 100644
--- a/website/docs/api/morphology.md
+++ b/website/docs/api/morphology.md
@@ -105,11 +105,11 @@ representation.
## Attributes {#attributes}
-| Name | Description |
-| ------------- | ---------------------------------------------------------------------------------------------------------------------------- | ---------- |
-| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is ` | `. ~~str~~ |
-| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
-| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |
+| Name | Description |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `|`. ~~str~~ |
+| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
+| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |
## MorphAnalysis {#morphanalysis tag="class" source="spacy/tokens/morphanalysis.pyx"}
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index 4a5fb6042..2cef9ac2a 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -149,8 +149,8 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
| Name | Description |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --- |
-| `match_id` | An ID for the thing you're matching. ~~str~~ | |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `key` | An ID for the thing you're matching. ~~str~~ |
| `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ |
| _keyword-only_ | |
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple], Any]]~~ |
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 2f856c667..263942e3e 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -297,10 +297,12 @@ Score a batch of examples.
> scores = pipe.score(examples)
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------- |
+| `examples` | The examples to score. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `\*\*kwargs` | Any additional settings to pass on to the scorer. ~~Any~~ |
+| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## TrainablePipe.create_optimizer {#create_optimizer tag="method"}
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index a776eca9b..ff19d3e71 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -130,3 +130,25 @@ exceed the transformer model max length.
| `min_length` | The minimum length for a token to be split. Defaults to `25`. ~~int~~ |
| `split_length` | The length of the split tokens. Defaults to `5`. ~~int~~ |
| **RETURNS** | The modified `Doc` with the split tokens. ~~Doc~~ |
+
+## doc_cleaner {#doc_cleaner tag="function" new="3.2.1"}
+
+Clean up `Doc` attributes. Intended for use at the end of pipelines with
+`tok2vec` or `transformer` pipeline components that store tensors and other
+values that can require a lot of memory and frequently aren't needed after the
+whole pipeline has run.
+
+> #### Example
+>
+> ```python
+> config = {"attrs": {"tensor": None}}
+> nlp.add_pipe("doc_cleaner", config=config)
+> doc = nlp("text")
+> assert doc.tensor is None
+> ```
+
+| Setting | Description |
+| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `attrs` | A dict of the `Doc` attributes and the values to set them to. Defaults to `{"tensor": None, "_.trf_data": None}` to clean up after `tok2vec` and `transformer` components. ~~dict~~ |
+| `silent` | If `False`, show warnings if attributes aren't found or can't be set. Defaults to `True`. ~~bool~~ |
+| **RETURNS** | The modified `Doc` with the modified attributes. ~~Doc~~ |
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index ad908f204..8dbe3b276 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -27,9 +27,13 @@ Create a new `Scorer`.
> scorer = Scorer(nlp)
> ```
-| Name | Description |
-| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ |
+| Name | Description |
+| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
+| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
+| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[str]~~ |
+| _keyword-only_ | |
+| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
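+
+For illustration, a minimal sketch of constructing a default pipeline for
+scoring without passing an `nlp` object (the language and components shown are
+only an example):
+
+```python
+from spacy.scorer import Scorer
+
+# build the default scoring pipeline from default_lang and default_pipeline
+scorer = Scorer(default_lang="en", default_pipeline=("senter", "tagger"))
+```
+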
## Scorer.score {#score tag="method"}
@@ -37,15 +41,20 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the
scoring methods provided by the components in the pipeline.
The returned `Dict` contains the scores provided by the individual pipeline
-components. For the scoring methods provided by the `Scorer` and use by the core
-pipeline components, the individual score names start with the `Token` or `Doc`
-attribute being scored:
+components. For the scoring methods provided by the `Scorer` and used by the
+core pipeline components, the individual score names start with the `Token` or
+`Doc` attribute being scored:
-- `token_acc`, `token_p`, `token_r`, `token_f`,
+- `token_acc`, `token_p`, `token_r`, `token_f`
- `sents_p`, `sents_r`, `sents_f`
-- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`
+- `tag_acc`
+- `pos_acc`
+- `morph_acc`, `morph_micro_p`, `morph_micro_r`, `morph_micro_f`,
+ `morph_per_feat`
+- `lemma_acc`
- `dep_uas`, `dep_las`, `dep_las_per_type`
- `ents_p`, `ents_r` `ents_f`, `ents_per_type`
+- `spans_sc_p`, `spans_sc_r`, `spans_sc_f`
- `cats_score` (depends on config, description provided in `cats_score_desc`),
`cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`,
`cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`,
@@ -120,14 +129,14 @@ scoring.
> print(scores["morph_per_feat"])
> ```
-| Name | Description |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr` | The attribute to score. ~~str~~ |
-| _keyword-only_ | |
-| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
-| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
-| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
+| Name | Description |
+| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr` | The attribute to score. ~~str~~ |
+| _keyword-only_ | |
+| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
+| **RETURNS** | A dictionary containing the micro PRF scores under the key `{attr}_micro_p/r/f` and the per-feature PRF scores under `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
@@ -253,3 +262,11 @@ entities that overlap between the gold reference and the predictions.
| _keyword-only_ | |
| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ |
| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |
+
+## get_ner_prf {#get_ner_prf new="3"}
+
+Compute micro-PRF and per-entity PRF scores.
+
+| Name | Description |
+| ---------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
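+
+For illustration, a minimal sketch that scores a single hand-built
+[`Example`](/api/example):
+
+```python
+import spacy
+from spacy.scorer import get_ner_prf
+from spacy.tokens import Span
+from spacy.training import Example
+
+nlp = spacy.blank("en")
+pred = nlp.make_doc("I like London.")
+pred.ents = [Span(pred, 2, 3, label="GPE")]
+gold = nlp.make_doc("I like London.")
+gold.ents = [Span(gold, 2, 3, label="GPE")]
+scores = get_ner_prf([Example(pred, gold)])
+print(scores["ents_f"])  # 1.0 for a perfect match
+```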
diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index e82a4bef6..29bf10393 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -12,6 +12,16 @@ api_trainable: true
A trainable pipeline component for sentence segmentation. For a simpler,
rule-based strategy, see the [`Sentencizer`](/api/sentencizer).
+## Assigned Attributes {#assigned-attributes}
+
+Predicted values will be assigned to `Token.is_sent_start`. The resulting
+sentences can be accessed using `Doc.sents`.
+
+| Location | Value |
+| --------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. This will be either `True` or `False` for all tokens. ~~bool~~ |
+| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. ~~Iterator[Span]~~ |
+
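+For illustration, a minimal sketch (assuming `en_core_web_sm` is installed,
+which ships a `senter` component that is disabled by default):
+
+```python
+import spacy
+
+# exclude the parser and enable the senter for sentence boundaries
+nlp = spacy.load("en_core_web_sm", exclude=["parser"])
+nlp.enable_pipe("senter")
+doc = nlp("This is a sentence. This is another sentence.")
+print([sent.text for sent in doc.sents])
+print([token.is_sent_start for token in doc])
+```
+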
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -29,9 +39,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("senter", config=config)
> ```
-| Setting | Description |
-| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| Setting | Description |
+| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/senter.pyx
@@ -60,11 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-| Name | Description |
-| ------- | -------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| Name | Description |
+| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
## SentenceRecognizer.\_\_call\_\_ {#call tag="method"}
@@ -238,21 +253,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-## SentenceRecognizer.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = senter.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ |
-
## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index 75a253fc0..b75c7a2f1 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -13,6 +13,16 @@ performed by the [`DependencyParser`](/api/dependencyparser), so the
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
require a statistical model to be loaded.
+## Assigned Attributes {#assigned-attributes}
+
+Calculated values will be assigned to `Token.is_sent_start`. The resulting
+sentences can be accessed using `Doc.sents`.
+
+| Location | Value |
+| --------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. This will be either `True` or `False` for all tokens. ~~bool~~ |
+| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. ~~Iterator[Span]~~ |
+
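+As a minimal sketch of how these attributes can be inspected (no trained
+pipeline is needed, since the sentencizer is purely rule-based):
+
+```python
+from spacy.lang.en import English
+
+nlp = English()
+nlp.add_pipe("sentencizer")
+doc = nlp("This is a sentence. This is another one.")
+# Token.is_sent_start is set by the sentencizer, Doc.sents is derived from it
+print([token.is_sent_start for token in doc])
+print([sent.text for sent in doc.sents])
+```
+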
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -27,9 +37,11 @@ how the component should be configured. You can override its settings via the
> nlp.add_pipe("sentencizer", config=config)
> ```
-| Setting | Description |
-| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ |
-| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` |
+| Setting | Description |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/sentencizer.pyx
@@ -50,10 +62,12 @@ Initialize the sentencizer.
> sentencizer = Sentencizer()
> ```
-| Name | Description |
-| -------------- | ----------------------------------------------------------------------------------------------------------------------- |
-| _keyword-only_ | |
-| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ |
+| Name | Description |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
```python
### punct_chars defaults
@@ -112,21 +126,6 @@ applied to the `Doc` in order.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
-## Sentencizer.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = sentencizer.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]~~ |
-
## Sentencizer.to_disk {#to_disk tag="method"}
Save the sentencizer settings (punctuation characters) to a directory. Will
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 9212f957d..ff7905bc0 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -18,14 +18,15 @@ Create a `Span` object from the slice `doc[start : end]`.
> assert [t.text for t in span] == ["it", "back", "!"]
> ```
-| Name | Description |
-| -------- | --------------------------------------------------------------------------------------- |
-| `doc` | The parent document. ~~Doc~~ |
-| `start` | The index of the first token of the span. ~~int~~ |
-| `end` | The index of the first token after the span. ~~int~~ |
-| `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
-| `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
-| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+| Name | Description |
+| ------------- | --------------------------------------------------------------------------------------- |
+| `doc` | The parent document. ~~Doc~~ |
+| `start` | The index of the first token of the span. ~~int~~ |
+| `end` | The index of the first token after the span. ~~int~~ |
+| `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
+| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+| `vector_norm` | The L2 norm of the span's vector representation. ~~float~~ |
+| `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
## Span.\_\_getitem\_\_ {#getitem tag="method"}
@@ -256,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
## Span.ents {#ents tag="property" new="2.0.13" model="ner"}
-The named entities in the span. Returns a tuple of named entity `Span` objects,
-if the entity recognizer has been applied.
+The named entities that fall completely within the span. Returns a tuple of
+`Span` objects.
> #### Example
>
@@ -303,6 +304,10 @@ not been implemeted for the given language, a `NotImplementedError` is raised.
Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
+When calling this on many spans from the same doc, passing in a precomputed
+array representation of the doc using the `array_head` and `array` args can save
+time.
+
> #### Example
>
> ```python
@@ -312,10 +317,12 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> assert doc2.text == "New York"
> ```
-| Name | Description |
-| ---------------- | ------------------------------------------------------------- |
-| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ |
-| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ |
+| Name | Description |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
+| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ |
+| `array_head` | Precomputed array attributes (headers) of the original doc, as generated by `Doc._get_array_attrs()`. ~~Tuple~~ |
+| `array` | Precomputed array version of the original doc as generated by [`Doc.to_array`](/api/doc#to_array). ~~numpy.ndarray~~ |
+| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ |
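+
+As a rough sketch of the pattern described above (the calls follow the
+parameter table; `en_core_web_sm` is just an example pipeline with entities):
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("Apple is opening its first big office in San Francisco.")
+# Precompute the array representation once and reuse it for every span
+array_head = doc._get_array_attrs()
+array = doc.to_array(array_head)
+span_docs = [span.as_doc(array_head=array_head, array=array) for span in doc.ents]
+```
+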
## Span.root {#root tag="property" model="parser"}
@@ -511,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)]
| ----------- | ------------------------------------------------------- |
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
+## Span.sents {#sents tag="property" model="sentences" new="3.2.1"}
+
+Returns a generator over the sentences the span belongs to. This property is only available
+when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
+document by the `parser`, `senter`, `sentencizer` or some custom function. It
+will raise an error otherwise.
+
+If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Give it back! He pleaded.")
+> span = doc[2:5]
+> assert len(list(span.sents)) == 2
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------- |
+| **RETURNS** | A generator yielding sentences this `Span` is a part of. ~~Iterable[Span]~~ |
+
## Attributes {#attributes}
| Name | Description |
diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md
index f26dba149..26fcaefdf 100644
--- a/website/docs/api/spancategorizer.md
+++ b/website/docs/api/spancategorizer.md
@@ -13,6 +13,22 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)
that proposes candidate spans, which may or may not overlap, and a labeler model
that predicts zero or more labels for each candidate.
+Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
+Individual span scores can be found in `spangroup.attrs["scores"]`.
+
+## Assigned Attributes {#assigned-attributes}
+
+Predictions will be saved to `Doc.spans[spans_key]` as a
+[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
+be saved in `SpanGroup.attrs["scores"]`.
+
+`spans_key` defaults to `"sc"`, but can be passed as a parameter.
+
+| Location | Value |
+| -------------------------------------- | -------------------------------------------------------- |
+| `Doc.spans[spans_key]` | The annotated spans. ~~SpanGroup~~ |
+| `Doc.spans[spans_key].attrs["scores"]` | The score for each span in the `SpanGroup`. ~~Floats1d~~ |
+
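+As a minimal sketch of reading these attributes (assuming `nlp` contains a
+trained `spancat` component using the default `spans_key` `"sc"`; the input
+text and labels are hypothetical):
+
+```python
+doc = nlp("Welcome to the Bank of China.")
+spans = doc.spans["sc"]
+# Each span in the group has a matching score in the "scores" attr
+for span, score in zip(spans, spans.attrs["scores"]):
+    print(span.text, span.label_, score)
+```
+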
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -38,11 +54,12 @@ architectures and their arguments and hyperparameters.
| Setting | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[List[Doc], Ragged]~~ |
+| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ |
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/spancat.py
@@ -73,7 +90,7 @@ shortcut for this and instantiate the component using its string name and
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
-| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[List[Doc], Ragged]~~ |
+| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ |
@@ -235,27 +252,11 @@ predicted scores.
> loss, d_loss = spancat.get_loss(examples, scores)
> ```
-| Name | Description |
-| ----------- | --------------------------------------------------------------------------- |
-| `examples` | The batch of examples. ~~Iterable[Example]~~ |
-| `scores` | Scores representing the model's predictions. |
-| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-
-## SpanCategorizer.score {#score tag="method"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = spancat.score(examples)
-> ```
-
-| Name | Description |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| _keyword-only_ | |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| -------------- | --------------------------------------------------------------------------- |
+| `examples` | The batch of examples. ~~Iterable[Example]~~ |
+| `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~ |
+| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## SpanCategorizer.create_optimizer {#create_optimizer tag="method"}
@@ -450,4 +451,25 @@ integers. The array has two columns, indicating the start and end position.
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------- |
| `sizes` | The phrase lengths to suggest. For example, `[1, 2]` will suggest phrases consisting of 1 or 2 tokens. ~~List[int]~~ |
-| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |
+| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
+
+### spacy.ngram_range_suggester.v1 {#ngram_range_suggester}
+
+> #### Example Config
+>
+> ```ini
+> [components.spancat.suggester]
+> @misc = "spacy.ngram_range_suggester.v1"
+> min_size = 2
+> max_size = 4
+> ```
+
+Suggest all spans of at least length `min_size` and at most length `max_size`
+(both inclusive). Spans are returned as a ragged array of integers. The array
+has two columns, indicating the start and end position.
+
+| Name | Description |
+| ----------- | ---------------------------------------------------------------------------- |
+| `min_size`  | The minimal phrase length to suggest (inclusive). ~~int~~ |
+| `max_size`  | The maximal phrase length to suggest (inclusive). ~~int~~ |
+| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md
index ba248f376..654067eb1 100644
--- a/website/docs/api/spangroup.md
+++ b/website/docs/api/spangroup.md
@@ -46,6 +46,16 @@ Create a `SpanGroup`.
The [`Doc`](/api/doc) object the span group is referring to.
+
+
+When a `Doc` object is garbage collected, any related `SpanGroup` object won't
+be functional anymore, as these objects use a `weakref` to refer to the
+document. An error will be raised as the internal `doc` object will be `None`.
+To avoid this, make sure that the original `Doc` objects are still available in
+the scope of your function.
+
+
+
> #### Example
>
> ```python
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index 3002aff7b..b51864d3a 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -8,6 +8,21 @@ api_string_name: tagger
api_trainable: true
---
+A trainable pipeline component to predict part-of-speech tags for any
+part-of-speech tag set.
+
+In the pre-trained pipelines, the tag schemas vary by language; see the
+[individual model pages](/models) for details.
+
+## Assigned Attributes {#assigned-attributes}
+
+Predictions are assigned to `Token.tag`.
+
+| Location | Value |
+| ------------ | ---------------------------------- |
+| `Token.tag` | The part of speech (hash). ~~int~~ |
+| `Token.tag_` | The part of speech. ~~str~~ |
+
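+As a minimal sketch of reading these attributes (assuming a trained English
+pipeline such as `en_core_web_sm`):
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("I like green eggs and ham.")
+# Each token gets a fine-grained tag from the pipeline's tag set
+print([(token.text, token.tag_) for token in doc])
+```
+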
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -25,9 +40,12 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tagger", config=config)
> ```
-| Setting | Description |
-| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| Setting | Description |
+| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
+| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/tagger.pyx
@@ -54,11 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-| Name | Description |
-| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| Name | Description |
+| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
## Tagger.\_\_call\_\_ {#call tag="method"}
@@ -249,21 +270,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-## Tagger.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = tagger.score(examples)
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ |
-
## Tagger.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 923da0048..47f868637 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -29,6 +29,22 @@ only.
+## Assigned Attributes {#assigned-attributes}
+
+Predictions will be saved to `doc.cats` as a dictionary, where the key is the
+name of the category and the value is a score between 0 and 1 (inclusive). For
+`textcat` (exclusive categories), the scores will sum to 1, while for
+`textcat_multilabel` there is no particular guarantee about their sum.
+
+Note that when assigning values to create training data, the score of each
+category must be 0 or 1. Using other values, for example to create a document
+that is a little bit in category A and a little bit in category B, is not
+supported.
+
+| Location | Value |
+| ---------- | ------------------------------------- |
+| `Doc.cats` | Category scores. ~~Dict[str, float]~~ |
+
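+As a minimal sketch of reading these attributes (assuming `nlp` contains a
+trained `textcat` component; the category names shown are hypothetical):
+
+```python
+doc = nlp("This movie was a complete waste of time.")
+# e.g. {"POSITIVE": 0.01, "NEGATIVE": 0.99} for an exclusive textcat
+print(doc.cats)
+```
+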
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -96,13 +112,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).
-| Name | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| _keyword-only_ | |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 44c92d1ee..44a2ea9e8 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -474,8 +474,8 @@ The L2 norm of the token's vector representation.
| `like_email` | Does the token resemble an email address? ~~bool~~ |
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
-| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
-| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
+| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ |
+| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ |
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 8190d9f78..be19f9c3a 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
| Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
@@ -313,11 +313,12 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="ent", options=options)
> ```
-| Name | Description |
-| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ |
-| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
-| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
+| Name | Description |
+| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ |
+| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
+| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
+| `kb_url_template` 3.2.1 | Optional template to construct the KB URL for the entity to link to. Expects a Python f-string format with a single field to fill in. ~~Optional[str]~~ |
By default, displaCy comes with colors for all entity types used by
[spaCy's trained pipelines](/models). If you're using custom entity types, you
@@ -326,6 +327,14 @@ or pipeline package can also expose a
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
to add custom labels and their colors automatically.
+By default, displaCy links to `#` for entities without a `kb_id` set on their
+span. If you wish to link an entity to its URL, consider using the
+`kb_url_template` option described above. For example, if the `kb_id` on a span
+is `Q95` and this is a Wikidata identifier, this option can be set to
+`https://www.wikidata.org/wiki/{}`. Clicking on the entity in the rendered HTML
+will then take you to its Wikidata page, in this case
+`https://www.wikidata.org/wiki/Q95`.
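+
+For example, a minimal sketch (assuming `doc` is a processed `Doc` whose entity
+spans have Wikidata QIDs set as their `kb_id_`, e.g. by an entity linker or by
+custom code):
+
+```python
+from spacy import displacy
+
+options = {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
+displacy.serve(doc, style="ent", options=options)
+```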
+
## registry {#registry source="spacy/util.py" new="3"}
spaCy's function registry extends
@@ -373,6 +382,7 @@ factories.
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
+| `scorers` | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
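+
+As a rough sketch of the contract described for the `scorers` registry (the
+registry name `"my_custom_scorer.v1"` and the metric are made up):
+
+```python
+from typing import Any, Dict, Iterable
+
+import spacy
+from spacy.training import Example
+
+@spacy.registry.scorers("my_custom_scorer.v1")
+def make_custom_scorer():
+    def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+        # Made-up metric: fraction of docs with at least one predicted entity
+        total = 0
+        with_ents = 0
+        for example in examples:
+            total += 1
+            with_ents += bool(example.predicted.ents)
+        return {"custom_ents_coverage": with_ents / max(total, 1)}
+
+    return score
+```
+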
### spacy-transformers registry {#registry-transformers}
@@ -410,10 +420,13 @@ finished. To log each training step, a
[`spacy train`](/api/cli#train), including information such as the training loss
and the accuracy scores on the development set.
-There are two built-in logging functions: a logger printing results to the
-console in tabular format (which is the default), and one that also sends the
-results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
-using one of the built-in loggers listed here, you can also
+The built-in, default logger is the `ConsoleLogger`, which prints results to
+the console in tabular format. The
+[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
+a dependency of spaCy, enables other loggers: currently it provides one that
+sends results to a [Weights & Biases](https://www.wandb.com/) dashboard.
+
+Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
@@ -462,62 +475,6 @@ start decreasing across epochs.
-#### spacy.WandbLogger.v2 {#WandbLogger tag="registered function"}
-
-> #### Installation
->
-> ```bash
-> $ pip install wandb
-> $ wandb login
-> ```
-
-Built-in logger that sends the results of each training step to the dashboard of
-the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
-& Biases should be installed, and you should be logged in. The logger will send
-the full config file to W&B, as well as various system information such as
-memory utilization, network traffic, disk IO, GPU statistics, etc. This will
-also include information such as your hostname and operating system, as well as
-the location of your Python executable.
-
-
-
-Note that by default, the full (interpolated)
-[training config](/usage/training#config) is sent over to the W&B dashboard. If
-you prefer to **exclude certain information** such as path names, you can list
-those fields in "dot notation" in the `remove_config_values` parameter. These
-fields will then be removed from the config before uploading, but will otherwise
-remain in the config file stored on your local system.
-
-
-
-> #### Example config
->
-> ```ini
-> [training.logger]
-> @loggers = "spacy.WandbLogger.v2"
-> project_name = "monitor_spacy_training"
-> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
-> log_dataset_dir = "corpus"
-> model_log_interval = 1000
-> ```
-
-| Name | Description |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
-| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
-| `model_log_interval` | Steps to wait between logging model checkpoints to W&B dasboard (default: None). ~~Optional[int]~~ |
-| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ |
-
-
-
-Get started with tracking your spaCy training runs in Weights & Biases using our
-project template. It trains on the IMDB Movie Review Dataset and includes a
-simple config with the built-in `WandbLogger`, as well as a custom example of
-creating variants of the config for a simple hyperparameter grid search and
-logging the results.
-
-
-
## Readers {#readers}
### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
@@ -815,6 +772,26 @@ from the specified model. Intended for use in `[initialize.before_init]`.
| `vocab` | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~ |
| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~ |
+### spacy.models_with_nvtx_range.v1 {#models_with_nvtx_range tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [nlp]
+> after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
+> ```
+
+Recursively wrap the models in each pipe using
+[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
+profiling by attributing specific operations to a ~~Model~~'s forward or
+backprop passes.
+
+| Name | Description |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| `forward_color` | Color identifier for forward passes. Defaults to `-1`. ~~int~~ |
+| `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~ |
+| **CREATES** | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~ |
+
## Training data and alignment {#gold source="spacy/training"}
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index 569fcfbd4..b1673cdbe 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -38,12 +38,21 @@ attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
token, the spaCy token receives the sum of their values. To access the values,
-you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The
+you can use the custom [`Doc._.trf_data`](#assigned-attributes) attribute. The
package also adds the function registries [`@span_getters`](#span_getters) and
[`@annotation_setters`](#annotation_setters) with several built-in registered
functions. For more details, see the
[usage documentation](/usage/embeddings-transformers).
+## Assigned Attributes {#assigned-attributes}
+
+The component sets the following
+[custom extension attribute](/usage/processing-pipeline#custom-components-attributes):
+
+| Location | Value |
+| ---------------- | ------------------------------------------------------------------------ |
+| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
+
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -83,9 +92,12 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
> # Construction via add_pipe with custom config
> config = {
> "model": {
-> "@architectures": "spacy-transformers.TransformerModel.v1",
+> "@architectures": "spacy-transformers.TransformerModel.v3",
> "name": "bert-base-uncased",
-> "tokenizer_config": {"use_fast": True}
+> "tokenizer_config": {"use_fast": True},
+> "transformer_config": {"output_attentions": True},
+> "mixed_precision": True,
+> "grad_scaler_config": {"init_scale": 32768}
> }
> }
> trf = nlp.add_pipe("transformer", config=config)
@@ -98,7 +110,7 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
Construct a `Transformer` component. One or more subsequent spaCy components can
use the transformer outputs as features in its model, with gradients
backpropagated to the single shared weights. The activations from the
-transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension
+transformer are saved in the [`Doc._.trf_data`](#assigned-attributes) extension
attribute. You can also provide a callback to set additional annotations. In
your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
@@ -205,7 +217,7 @@ modifying them.
Assign the extracted features to the `Doc` objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the
-[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
+[`Doc._.trf_data`](#assigned-attributes) attribute. Your `set_extra_annotations`
callback is then called, if provided.
> #### Example
@@ -383,14 +395,15 @@ are wrapped into the
[FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
`FullTransformerBatch` then splits out the per-document data, which is handled
by this class. Instances of this class are typically assigned to the
-[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
+[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
-| Name | Description |
-| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
-| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
-| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
-| `width` | The width of the last hidden layer. ~~int~~ |
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
+| `model_output` | The model output from the transformer model, determined by the model and transformer config. New in `spacy-transformers` v1.1.0. ~~transformers.file_utils.ModelOutput~~ |
+| `tensors` | The `model_output` in the earlier `transformers` tuple format converted using [`ModelOutput.to_tuple()`](https://huggingface.co/transformers/main_classes/output.html#transformers.file_utils.ModelOutput.to_tuple). Returns `Tuple` instead of `List` as of `spacy-transformers` v1.1.0. ~~Tuple[Union[FloatsXd, List[FloatsXd]]]~~ |
+| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
+| `width` | The width of the last hidden layer. ~~int~~ |
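+
+As a rough sketch of accessing the newer `model_output` field (assuming `nlp`
+contains a transformer component and a BERT-like model whose output includes a
+`last_hidden_state`):
+
+```python
+doc = nlp("This is a sentence.")
+trf_data = doc._.trf_data
+# Which fields are present depends on the model and transformer config
+print(trf_data.model_output.last_hidden_state.shape)
+print(trf_data.align.lengths)  # wordpiece alignment per spaCy token
+```
+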
### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
@@ -400,19 +413,32 @@ Create an empty `TransformerData` container.
| ----------- | ---------------------------------- |
| **RETURNS** | The container. ~~TransformerData~~ |
+
+
+In `spacy-transformers` v1.0, the model output is stored in
+`TransformerData.tensors` as `List[Union[FloatsXd]]` and only includes the
+activations for the `Doc` from the transformer. Usually the last tensor that is
+3-dimensional will be the most important, as that will provide the final hidden
+state. Generally activations that are 2-dimensional will be attention weights.
+Details of this variable will differ depending on the underlying transformer
+model.
+
+
+
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
Holds a batch of input and output objects for a transformer model. The data can
then be split to a list of [`TransformerData`](/api/transformer#transformerdata)
objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
-| Name | Description |
-| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
-| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
-| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
-| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
-| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~ |
+| Name | Description |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
+| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
+| `model_output` | The model output from the transformer model, determined by the model and transformer config. New in `spacy-transformers` v1.1.0. ~~transformers.file_utils.ModelOutput~~ |
+| `tensors` | The `model_output` in the earlier `transformers` tuple format converted using [`ModelOutput.to_tuple()`](https://huggingface.co/transformers/main_classes/output.html#transformers.file_utils.ModelOutput.to_tuple). Returns `Tuple` instead of `List` as of `spacy-transformers` v1.1.0. ~~Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]~~ |
+| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
+| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~ |
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
@@ -435,6 +461,13 @@ Split a `TransformerData` object that represents a batch into a list with one
| ----------- | ------------------------------------------ |
| **RETURNS** | The split batch. ~~List[TransformerData]~~ |
+
+
+In `spacy-transformers` v1.0, the model output is stored in
+`FullTransformerBatch.tensors` as `List[torch.Tensor]`.
+
+
+
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
@@ -549,12 +582,3 @@ The following built-in functions are available:
| Name | Description |
| ---------------------------------------------- | ------------------------------------- |
| `spacy-transformers.null_annotation_setter.v1` | Don't set any additional annotations. |
-
-## Custom attributes {#custom-attributes}
-
-The component sets the following
-[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
-
-| Name | Description |
-| ---------------- | ------------------------------------------------------------------------ |
-| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md
index 598abe681..b3bee822c 100644
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@@ -8,15 +8,30 @@ new: 2
Vectors data is kept in the `Vectors.data` attribute, which should be an
instance of `numpy.ndarray` (for CPU vectors) or `cupy.ndarray` (for GPU
-vectors). Multiple keys can be mapped to the same vector, and not all of the
-rows in the table need to be assigned – so `vectors.n_keys` may be greater or
-smaller than `vectors.shape[0]`.
+vectors).
+
+As of spaCy v3.2, `Vectors` supports two types of vector tables:
+
+- `default`: A standard vector table (as in spaCy v3.1 and earlier) where each
+ key is mapped to one row in the vector table. Multiple keys can be mapped to
+ the same vector, and not all of the rows in the table need to be assigned – so
+ `vectors.n_keys` may be greater or smaller than `vectors.shape[0]`.
+- `floret`: Only supports vectors trained with
+ [floret](https://github.com/explosion/floret), an extended version of
+ [fastText](https://fasttext.cc) that produces compact vector tables by
+ combining fastText's subword ngrams with Bloom embeddings. The compact tables
+ are similar to the [`HashEmbed`](https://thinc.ai/docs/api-layers#hashembed)
+ embeddings already used in many spaCy components. Each word is represented as
+ the sum of one or more rows as determined by the settings related to character
+ ngrams and the hash table.
## Vectors.\_\_init\_\_ {#init tag="method"}
-Create a new vector store. You can set the vector values and keys directly on
-initialization, or supply a `shape` keyword argument to create an empty table
-you can add vectors to later.
+Create a new vector store. With the default mode, you can set the vector values
+and keys directly on initialization, or supply a `shape` keyword argument to
+create an empty table you can add vectors to later. In floret mode, the complete
+vector data and settings must be provided on initialization and cannot be
+modified later.
> #### Example
>
@@ -30,13 +45,21 @@ you can add vectors to later.
> vectors = Vectors(data=data, keys=keys)
> ```
-| Name | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| _keyword-only_ | |
-| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
-| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
-| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
-| `name` | A name to identify the vectors table. ~~str~~ |
+| Name | Description |
+| ----------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
+| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
+| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
+| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
+| `name` | A name to identify the vectors table. ~~str~~ |
+| `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
+| `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ |
+| `maxn` 3.2 | The floret char ngram maxn (default: `0`). ~~int~~ |
+| `hash_count` 3.2 | The floret hash count. Supported values: 1--4 (default: `1`). ~~int~~ |
+| `hash_seed` 3.2 | The floret hash seed (default: `0`). ~~int~~ |
+| `bow` 3.2 | The floret BOW string (default: `"<"`). ~~str~~ |
+| `eow` 3.2 | The floret EOW string (default: `">"`). ~~str~~ |
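+
+As a minimal sketch of constructing a floret-mode table (the shape and the
+floret settings below are placeholders; in practice they must match the
+settings the vectors were trained with):
+
+```python
+import numpy
+from spacy.strings import StringStore
+from spacy.vectors import Vectors
+
+data = numpy.zeros((10000, 300), dtype="f")  # placeholder vector data
+vectors = Vectors(
+    strings=StringStore(),
+    data=data,
+    mode="floret",
+    minn=4,
+    maxn=5,
+    hash_count=2,
+)
+```
+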
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@@ -53,12 +76,12 @@ raised.
| Name | Description |
| ----------- | ---------------------------------------------------------------- |
-| `key` | The key to get the vector for. ~~int~~ |
+| `key` | The key to get the vector for. ~~Union[int, str]~~ |
| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vectors.\_\_setitem\_\_ {#setitem tag="method"}
-Set a vector for the given key.
+Set a vector for the given key. Not supported for `floret` mode.
> #### Example
>
@@ -75,7 +98,8 @@ Set a vector for the given key.
## Vectors.\_\_iter\_\_ {#iter tag="method"}
-Iterate over the keys in the table.
+Iterate over the keys in the table. In `floret` mode, the keys table is not
+used.
> #### Example
>
@@ -105,7 +129,8 @@ Return the number of vectors in the table.
## Vectors.\_\_contains\_\_ {#contains tag="method"}
-Check whether a key has been mapped to a vector entry in the table.
+Check whether a key has been mapped to a vector entry in the table. In `floret`
+mode, returns `True` for all keys.
> #### Example
>
@@ -123,11 +148,8 @@ Check whether a key has been mapped to a vector entry in the table.
## Vectors.add {#add tag="method"}
Add a key to the table, optionally setting a vector value as well. Keys can be
-mapped to an existing vector by setting `row`, or a new vector can be added.
-When adding string keys, keep in mind that the `Vectors` class itself has no
-[`StringStore`](/api/stringstore), so you have to store the hash-to-string
-mapping separately. If you need to manage the strings, you should use the
-`Vectors` via the [`Vocab`](/api/vocab) class, e.g. `vocab.vectors`.
+mapped to an existing vector by setting `row`, or a new vector can be added. Not
+supported for `floret` mode.
> #### Example
>
@@ -152,7 +174,8 @@ Resize the underlying vectors array. If `inplace=True`, the memory is
reallocated. This may cause other references to the data to become invalid, so
only use `inplace=True` if you're sure that's what you want. If the number of
vectors is reduced, keys mapped to rows that have been deleted are removed.
-These removed items are returned as a list of `(key, row)` tuples.
+These removed items are returned as a list of `(key, row)` tuples. Not supported
+for `floret` mode.
> #### Example
>
@@ -168,7 +191,8 @@ These removed items are returned as a list of `(key, row)` tuples.
## Vectors.keys {#keys tag="method"}
-A sequence of the keys in the table.
+A sequence of the keys in the table. In `floret` mode, the keys table is not
+used.
> #### Example
>
@@ -185,7 +209,7 @@ A sequence of the keys in the table.
Iterate over vectors that have been assigned to at least one key. Note that some
vectors may be unassigned, so the number of vectors returned may be less than
-the length of the vectors table.
+the length of the vectors table. In `floret` mode, the keys table is not used.
> #### Example
>
@@ -200,7 +224,8 @@ the length of the vectors table.
## Vectors.items {#items tag="method"}
-Iterate over `(key, vector)` pairs, in order.
+Iterate over `(key, vector)` pairs, in order. In `floret` mode, the keys table
+is empty.
> #### Example
>
@@ -215,7 +240,7 @@ Iterate over `(key, vector)` pairs, in order.
## Vectors.find {#find tag="method"}
-Look up one or more keys by row, or vice versa.
+Look up one or more keys by row, or vice versa. Not supported for `floret` mode.
> #### Example
>
@@ -273,7 +298,8 @@ The vector size, i.e. `rows * dims`.
Whether the vectors table is full and has no slots available for new keys.
If a table is full, it can be resized using
-[`Vectors.resize`](/api/vectors#resize).
+[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always
+full and cannot be resized.
> #### Example
>
@@ -291,7 +317,7 @@ If a table is full, it can be resized using
Get the number of keys in the table. Note that this is the number of _all_ keys,
not just unique vectors. If several keys are mapped to the same vectors, they
-will be counted individually.
+will be counted individually. In `floret` mode, the keys table is not used.
> #### Example
>
@@ -311,7 +337,8 @@ For each of the given vectors, find the `n` most similar entries to it by
cosine. Queries are by vector. Results are returned as a
`(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are
performed in chunks to avoid consuming too much memory. You can set the
-`batch_size` to control the size/space trade-off during the calculations.
+`batch_size` to control the size/space trade-off during the calculations. Not
+supported for `floret` mode.
> #### Example
>
@@ -321,7 +348,7 @@ performed in chunks to avoid consuming too much memory. You can set the
> ```
| Name | Description |
-| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
+| -------------- | --------------------------------------------------------------------------- |
| `queries` | An array with one or more vectors. ~~numpy.ndarray~~ |
| _keyword-only_ | |
| `batch_size`   | The batch size to use. Defaults to `1024`. ~~int~~                           |
@@ -329,6 +356,38 @@ performed in chunks to avoid consuming too much memory. You can set the
| `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ |
| **RETURNS**    | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ |
+## Vectors.get_batch {#get_batch tag="method" new="3.2"}
+
+Get the vectors for the provided keys efficiently as a batch.
+
+> #### Example
+>
+> ```python
+> words = ["cat", "dog"]
+> vectors = nlp.vocab.vectors.get_batch(words)
+> ```
+
+| Name | Description |
+| ------ | --------------------------------------- |
+| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
+
+## Vectors.to_ops {#to_ops tag="method"}
+
+Change the embedding matrix to use different Thinc ops.
+
+> #### Example
+>
+> ```python
+> from thinc.api import NumpyOps
+>
+> vectors.to_ops(NumpyOps())
+> ```
+
+| Name  | Description                                               |
+| ----- | --------------------------------------------------------- |
+| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~  |
+
## Vectors.to_disk {#to_disk tag="method"}
Save the current state to a directory.
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index 8fe769cdd..c0a269d95 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -21,15 +21,15 @@ Create the vocabulary.
> vocab = Vocab(strings=["hello", "world"])
> ```
-| Name | Description |
-| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
-| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
-| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
-| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
-| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ |
-| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
-| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
+| Name | Description |
+| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
+| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
+| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
+| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
+| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ |
+| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
+| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
## Vocab.\_\_len\_\_ {#len tag="method"}
@@ -300,14 +300,14 @@ Load state from a binary string.
> assert type(PERSON) == int
> ```
-| Name | Description |
-| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
-| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ |
-| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
-| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
-| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
-| `get_noun_chunks` 3.0 | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
+| Name | Description |
+| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
+| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ |
+| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
+| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
+| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
+| `get_noun_chunks` 3.0 | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
## Serialization fields {#serialization-fields}
@@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument.
| Name | Description |
| --------- | ----------------------------------------------------- |
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
-| `lexemes` | The lexeme data. |
| `vectors` | The word vectors, if available. |
| `lookups` | The lookup tables, if available. |
diff --git a/website/docs/images/prodigy_train_curve.jpg b/website/docs/images/prodigy_train_curve.jpg
new file mode 100644
index 000000000..af22cd065
Binary files /dev/null and b/website/docs/images/prodigy_train_curve.jpg differ
diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md
index a531b245e..93ad0961a 100644
--- a/website/docs/usage/101/_pos-deps.md
+++ b/website/docs/usage/101/_pos-deps.md
@@ -25,7 +25,7 @@ for token in doc:
> - **Text:** The original word text.
> - **Lemma:** The base form of the word.
-> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/)
+> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/)
> part-of-speech tag.
> - **Tag:** The detailed part-of-speech tag.
> - **Dep:** Syntactic dependency, i.e. the relation between tokens.
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 9e3f140e4..708cdd8bf 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -351,7 +351,7 @@ factory = "transformer"
max_batch_items = 4096
[components.transformer.model]
-@architectures = "spacy-transformers.TransformerModel.v1"
+@architectures = "spacy-transformers.TransformerModel.v3"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
@@ -367,7 +367,7 @@ The `[components.transformer.model]` block describes the `model` argument passed
to the transformer component. It's a Thinc
[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the
component. Here, it references the function
-[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
+[spacy-transformers.TransformerModel.v3](/api/architectures#TransformerModel)
registered in the [`architectures` registry](/api/top-level#registry). If a key
in a block starts with `@`, it's **resolved to a function** and all other
settings are passed to the function as arguments. In this case, `name`,
@@ -379,6 +379,21 @@ of potentially overlapping `Span` objects to process by the transformer. Several
to process the whole document or individual sentences. When the config is
resolved, the function is created and passed into the model as an argument.
+The `name` value is the name of any [HuggingFace model][huggingface-models],
+which will be downloaded automatically the first time it's used. You can also
+use a local file path. For full details, see the
+[`TransformerModel` docs](/api/architectures#TransformerModel).
+
+[huggingface-models]:
+ https://huggingface.co/models?library=pytorch&sort=downloads
+
+A wide variety of PyTorch models are supported, but some might not work. If a
+model doesn't seem to work, feel free to open an
+[issue](https://github.com/explosion/spacy/issues). Additionally, note that
+Transformers loaded in spaCy can only be used for tensors, and pretrained
+task-specific heads or text generation features cannot be used as part of the
+`transformer` pipeline component.
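+
+As an illustrative sketch (the directory path here is hypothetical), pointing
+`name` at a local model instead of a model on the Hugging Face Hub could look
+like this:
+
+```ini
+### config.cfg (excerpt)
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v3"
+# Assumption: a local directory containing a saved HuggingFace model
+name = "/path/to/local-bert"
+tokenizer_config = {"use_fast": true}
+```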
+
Remember that the `config.cfg` used for training should contain **no missing
@@ -671,7 +686,7 @@ You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config
and pass in optional config overrides, like the path to the raw text file:
```cli
-$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
+$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw_text text.jsonl
```
The following defaults are used for the `[pretraining]` block and merged into
@@ -697,8 +712,10 @@ given you a 10% error reduction, pretraining with spaCy might give you another
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
subnetwork** within one of your components, and add additional layers to build a
network for a temporary task that forces the model to learn something about
-sentence structure and word cooccurrence statistics. Pretraining produces a
-**binary weights file** that can be loaded back in at the start of training. The
+sentence structure and word cooccurrence statistics.
+
+Pretraining produces a **binary weights file** that can be loaded back in at the
+start of training, using the configuration option `initialize.init_tok2vec`. The
weights file specifies an initial set of weights. Training then proceeds as
normal.
@@ -732,6 +749,37 @@ component = "textcat"
layer = "tok2vec"
```
+#### Connecting pretraining to training {#pretraining-training}
+
+To benefit from pretraining, your training step needs to know to initialize its
+`tok2vec` component with the weights learned from the pretraining step. You do
+this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
+that you want to use from pretraining.
+
+A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
+make use of the final output, you could fill in this value in your config file:
+
+```ini
+### config.cfg
+
+[paths]
+init_tok2vec = "pretrain/model4.bin"
+
+[initialize]
+init_tok2vec = ${paths.init_tok2vec}
+```
+
+
+
+The outputs of `spacy pretrain` are not the same data format as the pre-packaged
+static word vectors that would go into
+[`initialize.vectors`](/api/data-formats#config-initialize). The pretraining
+output consists of the weights that the `tok2vec` component should start with in
+an existing pipeline, so it goes in `initialize.init_tok2vec`.
+
+
+
#### Pretraining objectives {#pretraining-objectives}
> ```ini
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 665d334f8..54ab62467 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
> ```
-| Name | Description |
-| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
-| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
-| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). |
-| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
-| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
+| Name | Description |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
+| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
+| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). |
+| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
+| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
+| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
### conda {#conda}
@@ -284,7 +285,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test
## Troubleshooting guide {#troubleshooting}
This section collects some of the most common errors you may come across when
-installing, loading and using spaCy, as well as their solutions.
+installing, loading and using spaCy, as well as their solutions. Also see the
+[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226),
+which is updated more frequently and covers more transitory issues.
> #### Help us improve this guide
>
@@ -311,62 +314,6 @@ language's `Language` class instead, for example
-
-
-```
-no such option: --no-cache-dir
-```
-
-The `download` command uses pip to install the pipeline packages and sets the
-`--no-cache-dir` flag to prevent it from requiring too much memory.
-[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
-requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
-version of pip. To see which version you have installed, run `pip --version`.
-
-
-
-
-
-```
-sre_constants.error: bad character range
-```
-
-In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions
-for tokenization to make it up to 2-3 times faster. But this also means that
-it's very important now that you run spaCy with a wide unicode build of Python.
-This means that the build has 1114111 unicode characters available, instead of
-only 65535 in a narrow unicode build. You can check this by running the
-following command:
-
-```bash
-$ python -c "import sys; print(sys.maxunicode)"
-```
-
-If you're running a narrow unicode build, reinstall Python and use a wide
-unicode build instead. You can also rebuild Python and set the
-`--enable-unicode=ucs4` flag.
-
-
-
-
-
-```
-ValueError: unknown locale: UTF-8
-```
-
-This error can sometimes occur on OSX and is likely related to a still
-unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy
-to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then
-run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both
-lines** for `LC_ALL` and `LANG`.
-
-```bash
-$ export LC_ALL=en_US.UTF-8
-$ export LANG=en_US.UTF-8
-```
-
-
-
```
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 17043d599..2e23b3684 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -833,7 +833,7 @@ retrieve and add to them.
self.cfg = {"labels": []}
@property
- def labels(self) -> Tuple[str]:
+ def labels(self) -> Tuple[str, ...]:
"""Returns the labels currently added to the component."""
return tuple(self.cfg["labels"])
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index f8f47ab53..f8baf5588 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -105,7 +105,7 @@ coarse-grained part-of-speech tags and morphological features.
that the verb is past tense (e.g. `VBD` for a past tense verb in the Penn
Treebank) .
2. For words whose coarse-grained POS is not set by a prior process, a
- [mapping table](#mapping-exceptions) maps the fine-grained tags to a
+ [mapping table](#mappings-exceptions) maps the fine-grained tags to a
coarse-grained POS tags and morphological features.
```python
@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
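+            # Skip an infix match at the very start of the remaining substring,
+            # so no empty token is appended before it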
+ if offset == 0 and match.start() == 0:
+ continue
tokens.append(substring[offset : match.start()])
tokens.append(substring[match.start() : match.end()])
offset = match.end()
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index d1c9a0a81..3b79c4d0d 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -247,6 +247,10 @@ config can be used to configure the split mode to `A`, `B` or `C`.
split_mode = "A"
```
+Extra information, such as reading, inflection form, and the SudachiPy
+normalized form, is available in `Token.morph`. For `B` or `C` split modes,
+subtokens are stored in `Doc.user_data["sub_tokens"]`.
+
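+As a minimal sketch (assuming the Japanese dependencies such as `sudachipy` and
+`sudachidict_core` are installed), the split mode can also be set on a blank
+pipeline and the extra annotation inspected directly:
+
+```python
+import spacy
+
+nlp = spacy.blank("ja", config={"nlp": {"tokenizer": {"split_mode": "C"}}})
+doc = nlp("国家公務員")
+for token in doc:
+    print(token.text, token.morph)    # reading, inflection, normalized form
+print(doc.user_data["sub_tokens"])    # subtoken info for split modes B and C
+```
+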
If you run into errors related to `sudachipy`, which is currently under active
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 0109f24c9..11fd1459d 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -795,7 +795,7 @@ if there's no state to be passed through – spaCy can just take care of this fo
you. The following two code examples are equivalent:
```python
-# Statless component with @Language.factory
+# Stateless component with @Language.factory
@Language.factory("my_component")
def create_my_component():
def my_component(doc):
@@ -1479,7 +1479,7 @@ especially useful if you want to pass in a string instead of calling
### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3}
This example shows the implementation of a pipeline component that fetches
-country meta data via the [REST Countries API](https://restcountries.eu), sets
+country meta data via the [REST Countries API](https://restcountries.com), sets
entity annotations for countries and sets custom attributes on the `Doc` and
`Span` – for example, the capital, latitude/longitude coordinates and even the
country flag.
@@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token
@Language.factory("rest_countries")
class RESTCountriesComponent:
def __init__(self, nlp, name, label="GPE"):
- r = requests.get("https://restcountries.eu/rest/v2/all")
+ r = requests.get("https://restcountries.com/v2/all")
r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json()
# Convert API response to dict keyed by country name for easy lookup
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index cb71f361b..e0e787a1d 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -291,7 +291,7 @@ files you need and not the whole repo.
| Name | Description |
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
-| `git` | `repo`: The URL of the repo to download from. `path`: Path of the file or directory to download, relative to the repo root. `branch`: The branch to download from. Defaults to `"master"`. |
+| `git` | `repo`: The URL of the repo to download from. `path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory. `branch`: The branch to download from. Defaults to `"master"`. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
@@ -758,16 +758,6 @@ workflows, but only one can be tracked by DVC.
### Prodigy {#prodigy}
-
-
-The Prodigy integration will require a nightly version of Prodigy that supports
-spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by
-exporting your data with
-[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
-[`spacy convert`](/api/cli#convert) to convert it to the binary format.
-
-
-
[Prodigy](https://prodi.gy) is a modern annotation tool for creating training
data for machine learning models, developed by us. It integrates with spaCy
out-of-the-box and provides many different
@@ -776,17 +766,23 @@ with and without a model in the loop. If Prodigy is installed in your project,
you can start the annotation server from your `project.yml` for a tight feedback
loop between data development and training.
-The following example command starts the Prodigy app using the
-[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
-suggestions for the given entity labels produced by a pretrained model. You can
-then correct the suggestions manually in the UI. After you save and exit the
-server, the full dataset is exported in spaCy's format and split into a training
-and evaluation set.
+
+
+This integration requires [Prodigy v1.11](https://prodi.gy/docs/changelog#v1.11)
+or higher. If you're using an older version of Prodigy, you can still use your
+annotations in spaCy v3 by exporting your data with
+[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
+[`spacy convert`](/api/cli#convert) to convert it to the binary format.
+
+
+
+The following example shows a workflow for merging and exporting NER annotations
+collected with Prodigy and training a spaCy pipeline:
> #### Example usage
>
> ```cli
-> $ python -m spacy project run annotate
+> $ python -m spacy project run all
> ```
@@ -794,36 +790,71 @@ and evaluation set.
### project.yml
vars:
prodigy:
- dataset: 'ner_articles'
- labels: 'PERSON,ORG,PRODUCT'
- model: 'en_core_web_md'
+ train_dataset: "fashion_brands_training"
+ eval_dataset: "fashion_brands_eval"
+
+workflows:
+ all:
+ - data-to-spacy
+ - train_spacy
commands:
- - name: annotate
- - script:
- - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
- - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
- - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
- - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
- - deps:
- - 'assets/raw_data.jsonl'
- - outputs:
- - 'corpus/train.spacy'
- - 'corpus/eval.spacy'
+ - name: "data-to-spacy"
+ help: "Merge your annotations and create data in spaCy's binary format"
+ script:
+ - "python -m prodigy data-to-spacy corpus/ --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset}"
+ outputs:
+ - "corpus/train.spacy"
+ - "corpus/dev.spacy"
+ - name: "train_spacy"
+ help: "Train a named entity recognition model with spaCy"
+ script:
+ - "python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
+ deps:
+ - "corpus/train.spacy"
+ - "corpus/dev.spacy"
+ outputs:
+ - "training/model-best"
```
-You can use the same approach for other types of projects and annotation
+> #### Example train curve output
+>
+> [](https://prodi.gy/docs/recipes#train-curve)
+
+The [`train-curve`](https://prodi.gy/docs/recipes#train-curve) recipe is another
+cool workflow you can include in your project. It will run the training with
+different portions of the data, e.g. 25%, 50%, 75% and 100%. As a rule of thumb,
+if accuracy increases in the last segment, this could indicate that collecting
+more annotations of the same type might improve the model further.
+
+
+```yaml
+### project.yml (excerpt)
+- name: "train_curve"
+ help: "Train the model with Prodigy by using different portions of training examples to evaluate if more annotations can potentially improve the performance"
+ script:
+ - "python -m prodigy train-curve --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset} --config configs/${vars.config} --show-plot"
+```
+
+You can use the same approach for various types of projects and annotation
workflows, including
-[text classification](https://prodi.gy/docs/recipes#textcat),
-[dependency parsing](https://prodi.gy/docs/recipes#dep),
+[named entity recognition](https://prodi.gy/docs/named-entity-recognition),
+[span categorization](https://prodi.gy/docs/span-categorization),
+[text classification](https://prodi.gy/docs/text-classification),
+[dependency parsing](https://prodi.gy/docs/dependencies-relations),
[part-of-speech tagging](https://prodi.gy/docs/recipes#pos) or fully
-[custom recipes](https://prodi.gy/docs/custom-recipes) – for instance, an A/B
-evaluation workflow that lets you compare two different models and their
-results.
+[custom recipes](https://prodi.gy/docs/custom-recipes). You can also use spaCy
+project templates to quickly start the annotation server to collect more
+annotations and add them to your Prodigy dataset.
-
+Get started with spaCy and Prodigy using our project template. It includes
+commands to create a merged training corpus from your Prodigy annotations,
+train and package a spaCy pipeline, and analyze whether more annotations may
+improve performance.
+
+
---
@@ -985,20 +1016,22 @@ commands:
[Weights & Biases](https://www.wandb.com/) is a popular platform for experiment
tracking. spaCy integrates with it out-of-the-box via the
-[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the
-`[training.logger]` block of your training [config](/usage/training#config). The
-results of each step are then logged in your project, together with the full
-**training config**. This means that _every_ hyperparameter, registered function
-name and argument will be tracked and you'll be able to see the impact it has on
-your results.
+[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which
+you can add as the `[training.logger]` block of your training
+[config](/usage/training#config). The results of each step are then logged in
+your project, together with the full **training config**. This means that
+_every_ hyperparameter, registered function name and argument will be tracked
+and you'll be able to see the impact it has on your results.
> #### Example config
>
> ```ini
> [training.logger]
-> @loggers = "spacy.WandbLogger.v2"
+> @loggers = "spacy.WandbLogger.v3"
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
+> log_dataset_dir = "corpus"
+> model_log_interval = 1000
> ```

diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 037850154..74bb10304 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -232,15 +232,22 @@ following rich comparison attributes are available:
>
> # Matches tokens of length >= 10
> pattern2 = [{"LENGTH": {">=": 10}}]
+>
+> # Match based on morph attributes
+> pattern3 = [{"MORPH": {"IS_SUBSET": ["Number=Sing", "Gender=Neut"]}}]
+> # "", "Number=Sing" and "Number=Sing|Gender=Neut" will match as subsets
+> # "Number=Plur|Gender=Neut" will not match
+> # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
> ```
-| Attribute | Description |
-| -------------------------- | ------------------------------------------------------------------------------------------------------- |
-| `IN` | Attribute value is member of a list. ~~Any~~ |
-| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
-| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
-| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
-| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
+| Attribute | Description |
+| -------------------------- | --------------------------------------------------------------------------------------------------------- |
+| `IN` | Attribute value is member of a list. ~~Any~~ |
+| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
+| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
+| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
+| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
+| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
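+
+As an illustrative sketch, `INTERSECTS` matches if the attribute shares at least
+one value with the provided list:
+
+```python
+# Matches tokens whose MORPH contains "Number=Sing" or "Degree=Pos" (or both)
+pattern = [{"MORPH": {"INTERSECTS": ["Number=Sing", "Degree=Pos"]}}]
+```
+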
#### Regular expressions {#regex new="2.1"}
@@ -422,7 +429,7 @@ matcher.add("HelloWorld", [pattern])
# 🚨 Raises an error:
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0:
-# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2]
+# - [pattern -> 2 -> CASEINSENSITIVE] extra fields not permitted
```
@@ -431,7 +438,8 @@ matcher.add("HelloWorld", [pattern])
To move on to a more realistic example, let's say you're working with a large
corpus of blog articles, and you want to match all mentions of "Google I/O"
(which spaCy tokenizes as `['Google', 'I', '/', 'O']`). To be safe, you only
-match on the uppercase versions, in case someone has written it as "Google i/o".
+match on the uppercase versions, avoiding matches with phrases such as "Google
+i/o".
```python
### {executable="true"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 17fac05e5..f46f0052b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -6,6 +6,7 @@ menu:
- ['Introduction', 'basics']
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
+ - ['Training Data', 'training-data']
- ['Custom Training', 'config-custom']
- ['Custom Functions', 'custom-functions']
- ['Initialization', 'initialization']
@@ -300,8 +301,6 @@ fly without having to save to and load from disk.
$ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
```
-
-
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
@@ -355,6 +354,59 @@ that reference this variable.
+## Preparing Training Data {#training-data}
+
+Training data for NLP projects comes in many different formats. For some common
+formats such as CoNLL, spaCy provides [converters](/api/cli#convert) you can use
+from the command line. In other cases you'll have to prepare the training data
+yourself.
+
+When converting training data for use in spaCy, the main thing is to create
+[`Doc`](/api/doc) objects just like the results you want as output from the
+pipeline. For example, if you're creating an NER pipeline, loading your
+annotations and setting them as the `.ents` property on a `Doc` is all you need
+to worry about. On disk the annotations will be saved as a
+[`DocBin`](/api/docbin) in the
+[`.spacy` format](/api/data-formats#binary-training), but the details of that
+are handled automatically.
+
+Here's an example of creating a `.spacy` file from some NER annotations.
+
+```python
+### preprocess.py
+import spacy
+from spacy.tokens import DocBin
+
+nlp = spacy.blank("en")
+training_data = [
+ ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING")]),
+]
+# the DocBin will store the example documents
+db = DocBin()
+for text, annotations in training_data:
+ doc = nlp(text)
+ ents = []
+ for start, end, label in annotations:
+ span = doc.char_span(start, end, label=label)
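+        # Note: char_span returns None if the character offsets don't align
+        # with token boundaries, so real data may need extra handling here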
+ ents.append(span)
+ doc.ents = ents
+ db.add(doc)
+db.to_disk("./train.spacy")
+```
+
+For more examples of how to convert training data from a wide variety of formats
+for use with spaCy, look at the preprocessing steps in the
+[tutorial projects](https://github.com/explosion/projects/tree/v3/tutorials).
+
+
+
+In spaCy v2, the recommended way to store training data was in
+[a particular JSON format](/api/data-formats#json-input), but in v3 this format
+is deprecated. It's fine as a readable storage format, but there's no need to
+convert your data to JSON before creating a `.spacy` file.
+
+
+
## Customizing the pipeline and training {#config-custom}
### Defining pipeline components {#config-components}
@@ -426,7 +478,10 @@ as-is. They are also excluded when calling
> still impact your model's performance – for instance, a sentence boundary
> detector can impact what the parser or entity recognizer considers a valid
> parse. So the evaluation results should always reflect what your pipeline will
-> produce at runtime.
+> produce at runtime. If you want a frozen component to run (without updating)
+> during training as well, so that downstream components can use its
+> **predictions**, you can add it to the list of
+> [`annotating_components`](/usage/training#annotating-components).
```ini
[nlp]
@@ -513,6 +568,10 @@ frozen_components = ["ner"]
annotating_components = ["sentencizer", "ner"]
```
+Similarly, a pretrained `tok2vec` layer can be frozen and specified in the list
+of `annotating_components` to ensure that a downstream component can use the
+embedding layer without updating it.
+
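+A minimal sketch of this setup, assuming a pipeline with a shared `tok2vec`
+component:
+
+```ini
+### config.cfg (excerpt)
+[training]
+frozen_components = ["tok2vec"]
+annotating_components = ["tok2vec"]
+```
+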
Be aware that non-frozen annotating components with statistical models will
@@ -645,14 +704,14 @@ excluded from the logs and the score won't be weighted.
-| Name | Description |
-| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
-| **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. |
-| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. |
-| **Recall** (R) | Percentage of reference annotations recovered. Should increase. |
-| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. |
-| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. |
-| **Words per second** (WPS) | Prediction speed in words per second. Should stay stable. |
+| Name | Description |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------- |
+| **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. |
+| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. |
+| **Recall** (R) | Percentage of reference annotations recovered. Should increase. |
+| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. |
+| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. |
+| **Speed** | Prediction speed in words per second (WPS). Should stay stable. |
Note that if the development data has raw text, some of the gold-standard
entities might not align to the predicted tokenization. These tokenization
@@ -883,8 +942,8 @@ During training, the results of each step are passed to a logger function. By
default, these results are written to the console with the
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
-receives a **dictionary** with the following keys:
+[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each
+step, the logger function receives a **dictionary** with the following keys:
| Key | Value |
| -------------- | ----------------------------------------------------------------------------------------------------- |
@@ -1586,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.
## Internal training API {#api}
-
+
spaCy gives you full control over the training loop. However, for most use
cases, it's recommended to train your pipelines via the
@@ -1598,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with
+### Training from a Python script {#api-train new="3.2"}
+
+If you want to run the training from a Python script instead of using the
+[`spacy train`](/api/cli#train) CLI command, you can call into the
+[`train`](/api/cli#train-function) helper function directly. It takes the path
+to the config file, an optional output directory and an optional dictionary of
+[config overrides](#config-overrides).
+
+```python
+from spacy.cli.train import train
+
+train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+```
+
+### Internal training loop API {#api-loop}
+
+
+
+This section documents how the training loop and updates to the `nlp` object
+work internally. You typically shouldn't have to implement this in Python unless
+you're writing your own trainable components. To train a pipeline, use
+[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
+function instead.
+
+
+
The [`Example`](/api/example) object contains annotated training data, also
called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
that will hold the predictions, and another `Doc` object that holds the
diff --git a/website/docs/usage/v3-2.md b/website/docs/usage/v3-2.md
new file mode 100644
index 000000000..d1d45c7ba
--- /dev/null
+++ b/website/docs/usage/v3-2.md
@@ -0,0 +1,244 @@
+---
+title: What's New in v3.2
+teaser: New features and how to upgrade
+menu:
+ - ['New Features', 'features']
+ - ['Upgrading Notes', 'upgrading']
+---
+
+## New Features {#features hidden="true"}
+
+spaCy v3.2 adds support for [`floret`](https://github.com/explosion/floret)
+vectors, makes custom `Doc` creation and scoring easier, and includes many bug
+fixes and improvements. For the trained pipelines, there's a new transformer
+pipeline for Japanese and the Universal Dependencies training data has been
+updated across the board to the most recent release.
+
+
+
+spaCy is now up to **8 × faster on M1 Macs** by calling into Apple's
+native Accelerate library for matrix multiplication. For more details, see
+[`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops).
+
+```bash
+$ pip install spacy[apple]
+```
+
+
+
+### Registered scoring functions {#registered-scoring-functions}
+
+To customize the scoring, you can specify a scoring function for each component
+in your config from the new [`scorers` registry](/api/top-level#registry):
+
+```ini
+### config.cfg (excerpt) {highlight="3"}
+[components.tagger]
+factory = "tagger"
+scorer = {"@scorers":"spacy.tagger_scorer.v1"}
+```
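+
+A registered scorer is a function that returns a scoring callable. As a minimal
+sketch (the name `"my_tagger_scorer.v1"` is hypothetical), a scorer that
+delegates to the built-in token attribute scorer could look like this:
+
+```python
+from spacy import registry
+from spacy.scorer import Scorer
+
+@registry.scorers("my_tagger_scorer.v1")
+def make_my_tagger_scorer():
+    def score(examples, **kwargs):
+        # Score the Token.tag annotation on the predicted docs
+        return Scorer.score_token_attr(examples, "tag", **kwargs)
+    return score
+```
+
+You could then reference it in the config as
+`scorer = {"@scorers": "my_tagger_scorer.v1"}`.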
+
+### Overwrite settings {#overwrite}
+
+Most pipeline components now include an `overwrite` setting in the config that
+determines whether existing annotation in the `Doc` is preserved or overwritten:
+
+```ini
+### config.cfg (excerpt) {highlight="3"}
+[components.tagger]
+factory = "tagger"
+overwrite = false
+```
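+
+As a usage sketch (assuming the `tagger` factory shown above), the same setting
+can also be passed when adding the component in Python:
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+# Keep any existing Token.tag annotation instead of overwriting it
+tagger = nlp.add_pipe("tagger", config={"overwrite": False})
+```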
+
+### Doc input for pipelines {#doc-input}
+
+[`nlp`](/api/language#call) and [`nlp.pipe`](/api/language#pipe) accept
+[`Doc`](/api/doc) input, skipping the tokenizer if a `Doc` is provided instead
+of a string. This makes it easier to create a `Doc` with custom tokenization or
+to set custom extensions before processing:
+
+```python
+doc = nlp.make_doc("This is text 500.")
+doc._.text_id = 500
+doc = nlp(doc)
+```
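+
+Note that a custom extension like `text_id` above has to be registered on the
+`Doc` before it can be set:
+
+```python
+from spacy.tokens import Doc
+
+Doc.set_extension("text_id", default=None)
+```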
+
+### Support for floret vectors {#vectors}
+
+We recently published [`floret`](https://github.com/explosion/floret), an
+extended version of [fastText](https://fasttext.cc) that combines fastText's
+subwords with Bloom embeddings for compact, full-coverage vectors. The use of
+subwords means that there are no OOV words, and due to Bloom embeddings, the
+vector table can be kept very small at <100K entries. Bloom embeddings are
+already used by [HashEmbed](https://thinc.ai/docs/api-layers#hashembed) in
+[tok2vec](/api/architectures#tok2vec-arch) for compact spaCy models.
+
+For easy integration, floret includes a
+[Python wrapper](https://github.com/explosion/floret/blob/main/python/README.md):
+
+```bash
+$ pip install floret
+```
+
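+To import trained floret vectors into a spaCy pipeline, the
+[`init vectors`](/api/cli#init-vectors) CLI supports a floret mode. A minimal
+sketch, assuming a trained vectors file `vectors.floret`:
+
+```cli
+$ python -m spacy init vectors fi vectors.floret ./my_vectors_pipeline --mode floret
+```
+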
+A demo project shows how to train and import floret vectors:
+
+
+
+Train toy English floret vectors and import them into a spaCy pipeline.
+
+
+
+Two additional demo projects compare standard fastText vectors with floret
+vectors for full spaCy pipelines. For agglutinative languages like Finnish or
+Korean, there are large improvements in performance due to the use of subwords
+(no OOV words!), with a vector table containing merely 50K entries.
+
+
+
+Finnish UD+NER vector and pipeline training, comparing standard fastText vs.
+floret vectors.
+
+For the default project settings with 1M (2.6G) tokenized training texts and 50K
+300-dim vectors, ~300K keys for the standard vectors:
+
+| Vectors | TAG | POS | DEP UAS | DEP LAS | NER F |
+| -------------------------------------------- | -------: | -------: | -------: | -------: | -------: |
+| none | 93.3 | 92.3 | 79.7 | 72.8 | 61.0 |
+| standard (pruned: 50K vectors for 300K keys) | 95.9 | 94.7 | 83.3 | 77.9 | 68.5 |
+| standard (unpruned: 300K vectors/keys) | 96.0 | 95.0 | **83.8** | 78.4 | 69.1 |
+| floret (minn 4, maxn 5; 50K vectors, no OOV) | **96.6** | **95.5** | 83.5 | **78.5** | **70.9** |
+
+
+
+
+
+Korean UD vector and pipeline training, comparing standard fastText vs. floret
+vectors.
+
+For the default project settings with 1M (3.3G) tokenized training texts and 50K
+300-dim vectors, ~800K keys for the standard vectors:
+
+| Vectors | TAG | POS | DEP UAS | DEP LAS |
+| -------------------------------------------- | -------: | -------: | -------: | -------: |
+| none | 72.5 | 85.0 | 73.2 | 64.3 |
+| standard (pruned: 50K vectors for 800K keys) | 77.9 | 89.4 | 78.8 | 72.8 |
+| standard (unpruned: 800K vectors/keys) | 79.0 | 90.2 | 79.2 | 73.9 |
+| floret (minn 2, maxn 3; 50K vectors, no OOV) | **82.5** | **93.8** | **83.0** | **80.1** |
+
+
+
+### Updates for spacy-transformers v1.1 {#spacy-transformers}
+
+[`spacy-transformers`](https://github.com/explosion/spacy-transformers) v1.1 has
+been refactored to improve serialization and support of inline transformer
+components and replacing listeners. In addition, the transformer model output is
+provided as
+[`ModelOutput`](https://huggingface.co/transformers/main_classes/output.html?highlight=modeloutput#transformers.file_utils.ModelOutput)
+instead of tuples in `TransformerData.model_output` and
+`FullTransformerBatch.model_output`. For
+backwards compatibility, the tuple format remains available under
+`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
+in the [transformer API docs](/api/architectures#TransformerModel).
+
+`spacy-transformers` v1.1 also adds support for `transformer_config` settings
+such as `output_attentions`. Additional output is stored under
+`TransformerData.model_output`. More details are in the
+[TransformerModel docs](/api/architectures#TransformerModel). The training speed
+has been improved by streamlining allocations for tokenizer output and there is
+new support for [mixed-precision training](/api/architectures#TransformerModel).
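+
+A minimal sketch of these settings in the config (the values are for
+illustration only) could look like this:
+
+```ini
+### config.cfg (excerpt)
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v3"
+name = "bert-base-cased"
+tokenizer_config = {"use_fast": true}
+transformer_config = {"output_attentions": true}
+mixed_precision = true
+```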
+
+### New transformer package for Japanese {#pipeline-packages}
+
+spaCy v3.2 adds a new transformer pipeline package for Japanese
+[`ja_core_news_trf`](/models/ja#ja_core_news_trf), which uses the `basic`
+pretokenizer instead of `mecab` to limit the number of dependencies required for
+the pipeline. Thanks to Hiroshi Matsuda and the spaCy Japanese community for
+their contributions!
+
+### Pipeline and language updates {#pipeline-updates}
+
+- All Universal Dependencies training data has been updated to v2.8.
+- The Catalan data, tokenizer and lemmatizer have been updated, thanks to Carlos
+ Rodriguez, Carme Armentano and the Barcelona Supercomputing Center!
+- The transformer pipelines are trained using spacy-transformers v1.1, with
+ improved IO and more options for
+ [model config and output](/api/architectures#TransformerModel).
+- Trailing whitespace has been added as a `tok2vec` feature, improving the
+ performance for many components, especially fine-grained tagging and sentence
+ segmentation.
+- The English attribute ruler patterns have been overhauled to improve
+ `Token.pos` and `Token.morph`.
+
+spaCy v3.2 also features a new Irish lemmatizer, support for `noun_chunks` in
+Portuguese, improved `noun_chunks` for Spanish and additional updates for
+Bulgarian, Catalan, Sinhala, Tagalog, Tigrinya and Vietnamese.
+
+## Notes about upgrading from v3.1 {#upgrading}
+
+### Pipeline package version compatibility {#version-compat}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with spaCy v3.0 or v3.1, you will
+see a warning telling you that the pipeline may be incompatible. This doesn't
+necessarily have to be true, but we recommend running your pipelines against
+your test suite or evaluation data to make sure there are no unexpected results.
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.1.0,<3.2.0",
++ "spacy_version": ">=3.2.0,<3.3.0",
+```
+
+### Updating v3.1 configs
+
+To update a config from spaCy v3.1 with the new v3.2 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```cli
+$ python -m spacy init fill-config config-v3.1.cfg config-v3.2.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
+
+## Notes about upgrading from spacy-transformers v1.0 {#upgrading-transformers}
+
+When you're loading a transformer pipeline package trained with
+[`spacy-transformers`](https://github.com/explosion/spacy-transformers) v1.0
+after upgrading to `spacy-transformers` v1.1, you'll see a warning telling you
+that the pipeline may be incompatible. `spacy-transformers` v1.1 should be able
+to import v1.0 `transformer` components into the new internal format with no
+change in performance, but here we'd also recommend running your test suite to
+verify that the pipeline still performs as expected.
+
+If you save your pipeline with [`nlp.to_disk`](/api/language#to_disk), it will
+be saved in the new v1.1 format and should be fully compatible with
+`spacy-transformers` v1.1. Once you've confirmed the performance, you can update
+the requirements in [`meta.json`](/api/data-formats#meta):
+
+```diff
+ "requirements": [
+- "spacy-transformers>=1.0.3,<1.1.0"
++ "spacy-transformers>=1.1.2,<1.2.0"
+ ]
+```
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 8b4d2de7c..980f06172 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -854,6 +854,19 @@ pipeline component, the [`AttributeRuler`](/api/attributeruler). See the
you have tag maps and morph rules in the v2.x format, you can load them into the
attribute ruler before training using the `[initialize]` block of your config.
+### Using Lexeme Tables
+
+To use tables like `lexeme_prob` when training a model from scratch, you need
+to add an entry to the `initialize` block in your config. Here's what that
+looks like for the existing trained pipelines:
+
+```ini
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+```
+
> #### What does the initialization do?
>
> The `[initialize]` block is used when
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md
index cc73e7e67..072718f91 100644
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@@ -328,6 +328,15 @@ position.
}
```
+```python
+### ENT input with knowledge base links
+{
+ "text": "But Google is starting from behind.",
+ "ents": [{"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "https://www.wikidata.org/entity/Q95"}],
+ "title": None
+}
+```
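+
+As a usage sketch, this input can be rendered directly by passing `manual=True`:
+
+```python
+from spacy import displacy
+
+ex = {
+    "text": "But Google is starting from behind.",
+    "ents": [{"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95",
+              "kb_url": "https://www.wikidata.org/entity/Q95"}],
+    "title": None,
+}
+html = displacy.render(ex, style="ent", manual=True)
+```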
+
## Using displaCy in a web application {#webapp}
If you want to use the visualizers as part of a web application, for example to
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 2ba117d53..a7dda6482 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -192,17 +192,10 @@
"models": [
"ja_core_news_sm",
"ja_core_news_md",
- "ja_core_news_lg"
+ "ja_core_news_lg",
+ "ja_core_news_trf"
],
"dependencies": [
- {
- "name": "Unidic",
- "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj"
- },
- {
- "name": "Mecab",
- "url": "https://github.com/taku910/mecab"
- },
{
"name": "SudachiPy",
"url": "https://github.com/WorksApplications/SudachiPy"
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 6fe09f052..1054f7626 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -10,7 +10,8 @@
{ "text": "Facts & Figures", "url": "/usage/facts-figures" },
{ "text": "spaCy 101", "url": "/usage/spacy-101" },
{ "text": "New in v3.0", "url": "/usage/v3" },
- { "text": "New in v3.1", "url": "/usage/v3-1" }
+ { "text": "New in v3.1", "url": "/usage/v3-1" },
+ { "text": "New in v3.2", "url": "/usage/v3-2" }
]
},
{
diff --git a/website/meta/site.json b/website/meta/site.json
index b8f1a58ef..169680f86 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -22,7 +22,8 @@
"list": "89ad33e698"
},
"docSearch": {
- "apiKey": "371e26ed49d29a27bd36273dfdaf89af",
+ "appId": "Y1LB128RON",
+ "apiKey": "bb601a1daab73e2dc66faf2b79564807",
"indexName": "spacy"
},
"binderUrl": "explosion/spacy-io-binder",
diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json
index 8136b3e96..0ffcbfb33 100644
--- a/website/meta/type-annotations.json
+++ b/website/meta/type-annotations.json
@@ -43,6 +43,7 @@
"cymem.Pool": "https://github.com/explosion/cymem",
"preshed.BloomFilter": "https://github.com/explosion/preshed",
"transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding",
+ "transformers.file_utils.ModelOutput": "https://huggingface.co/transformers/main_classes/output.html#modeloutput",
"torch.Tensor": "https://pytorch.org/docs/stable/tensors.html",
"numpy.ndarray": "https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html",
"Match": "https://docs.python.org/3/library/re.html#match-objects",
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 537ba3eec..ba770a3fd 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,43 @@
{
"resources": [
+ {
+ "id": "spacypdfreader",
+      "title": "spacypdfreader",
+ "category": ["pipeline"],
+ "tags": ["PDF"],
+ "slogan": "Easy PDF to text to spaCy text extraction in Python.",
+      "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built-in parsers, or you can bring your own parser. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.",
+ "github": "SamEdwardes/spacypdfreader",
+ "pip": "spacypdfreader",
+ "url": "https://samedwardes.github.io/spacypdfreader/",
+ "code_language": "python",
+ "author": "Sam Edwardes",
+ "author_links": {
+ "twitter": "TheReaLSamlam",
+ "github": "SamEdwardes",
+ "website": "https://samedwardes.com"
+ },
+ "code_example": [
+ "import spacy",
+ "from spacypdfreader import pdf_reader",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
+ "",
+ "# Get the page number of any token.",
+ "print(doc[0]._.page_number) # 1",
+ "print(doc[-1]._.page_number) # 4",
+ "",
+ "# Get page meta data about the PDF document.",
+ "print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'",
+ "print(doc._.page_range) # (1, 4)",
+ "print(doc._.first_page) # 1",
+ "print(doc._.last_page) # 4",
+ "",
+ "# Get all of the text from a specific PDF page.",
+ "print(doc._.page(4)) # 'able to display the destination page (unless...'"
+ ]
+ },
{
"id": "nlpcloud",
"title": "NLPCloud.io",
@@ -26,32 +64,6 @@
"category": ["apis", "nonpython", "standalone"],
"tags": ["api", "deploy", "production"]
},
- {
- "id": "denomme",
- "title": "denomme : Multilingual Name Detector",
- "slogan": "Multilingual Name Detection",
- "description": "A SpaCy extension for Spans to extract multilingual names out of documents trained on XLM-roberta backbone",
- "github": "meghanabhange/denomme",
- "pip": "denomme https://denomme.s3.us-east-2.amazonaws.com/xx_denomme-0.3.1/dist/xx_denomme-0.3.1.tar.gz",
- "code_example": [
- "from spacy.lang.xx import MultiLanguage",
- "from denomme.name import person_name_component",
- "nlp = MultiLanguage()",
- "nlp.add_pipe('denomme')",
- "doc = nlp('Hi my name is Meghana S.R Bhange and I want to talk Asha')",
- "print(doc._.person_name)",
- "# ['Meghana S.R Bhange', 'Asha']"
- ],
- "thumb": "https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png",
- "code_language": "python",
- "author": "Meghana Bhange",
- "author_links": {
- "github": "meghanabhange",
- "twitter": "_aspiringcat"
- },
- "category": ["standalone"],
- "tags": ["person-name-detection"]
- },
{
"id": "eMFDscore",
"title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python",
@@ -129,7 +141,8 @@
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
- "tags": []
+ "tags": [],
+ "spacy_version": 3
},
{
"id": "numerizer",
@@ -437,10 +450,10 @@
"thumb": "https://i.imgur.com/myhLjMJ.png",
"code_example": [
"import stanza",
- "from spacy_stanza import StanzaLanguage",
+ "import spacy_stanza",
"",
- "snlp = stanza.Pipeline(lang=\"en\")",
- "nlp = StanzaLanguage(snlp)",
+ "stanza.download(\"en\")",
+ "nlp = spacy_stanza.load_pipeline(\"en\")",
"",
"doc = nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")",
"for token in doc:",
@@ -455,6 +468,32 @@
"website": "https://explosion.ai"
}
},
+ {
+ "id": "spacy-udpipe",
+ "title": "spacy-udpipe",
+ "slogan": "Use the latest UDPipe models directly in spaCy",
+ "description": "This package wraps the fast and efficient UDPipe language-agnostic NLP pipeline (via its Python bindings), so you can use UDPipe pre-trained models as a spaCy pipeline for 50+ languages out-of-the-box. Inspired by spacy-stanza, this package offers slightly less accurate models that are in turn much faster.",
+ "github": "TakeLab/spacy-udpipe",
+ "pip": "spacy-udpipe",
+ "code_example": [
+ "import spacy_udpipe",
+ "",
+ "spacy_udpipe.download(\"en\") # download English model",
+ "",
+ "text = \"Wikipedia is a free online encyclopedia, created and edited by volunteers around the world.\"",
+ "nlp = spacy_udpipe.load(\"en\")",
+ "",
+ "doc = nlp(text)",
+ "for token in doc:",
+ " print(token.text, token.lemma_, token.pos_, token.dep_)"
+ ],
+ "category": ["pipeline", "standalone", "models", "research"],
+ "author": "TakeLab",
+ "author_links": {
+ "github": "TakeLab",
+ "website": "https://takelab.fer.hr/"
+ }
+ },
{
"id": "spacy-server",
"title": "spaCy Server",
@@ -490,12 +529,12 @@
"title": "NeuroNER",
"slogan": "Named-entity recognition using neural networks",
"github": "Franck-Dernoncourt/NeuroNER",
+ "category": ["models"],
"pip": "pyneuroner[cpu]",
"code_example": [
"from neuroner import neuromodel",
"nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)"
],
- "category": ["ner"],
"tags": ["standalone"]
},
{
@@ -537,7 +576,7 @@
"trainer = ListTrainer(chatbot)",
"trainer.train([",
"'Hi, can I help you?',",
- "'Sure, I would like to book a flight to Iceland.",
+ "'Sure, I would like to book a flight to Iceland.',",
"'Your flight has been booked.'",
"])",
"",
@@ -587,7 +626,7 @@
"id": "spacymoji",
"slogan": "Emoji handling and meta data as a spaCy pipeline component",
"github": "ines/spacymoji",
- "description": "spaCy v2.0 extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.",
+ "description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.",
"pip": "spacymoji",
"category": ["pipeline"],
"tags": ["emoji", "unicode"],
@@ -616,6 +655,32 @@
"website": "https://ines.io"
}
},
+ {
+ "id": "spacyopentapioca",
+ "title": "spaCyOpenTapioca",
+ "slogan": "Named entity linking on Wikidata in spaCy via OpenTapioca",
+ "description": "A spaCy wrapper of OpenTapioca for named entity linking on Wikidata",
+ "github": "UB-Mannheim/spacyopentapioca",
+ "pip": "spacyopentapioca",
+ "code_example": [
+ "import spacy",
+ "nlp = spacy.blank('en')",
+ "nlp.add_pipe('opentapioca')",
+ "doc = nlp('Christian Drosten works in Germany.')",
+ "for span in doc.ents:",
+ " print((span.text, span.kb_id_, span.label_, span._.description, span._.score))",
+ "# ('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 3.6533377082098895)",
+ "# ('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.1099332471902863)",
+ "## Check also span._.types, span._.aliases, span._.rank"
+ ],
+ "category": ["models", "pipeline"],
+ "tags": ["NER", "NEL"],
+ "author": "Renat Shigapov",
+ "author_links": {
+ "twitter": "_shigapov",
+ "github": "shigapov"
+ }
+ },
{
"id": "spacy_hunspell",
"slogan": "Add spellchecking and spelling suggestions to your spaCy pipeline using Hunspell",
@@ -888,6 +953,54 @@
"category": ["pipeline"],
"tags": ["lemmatizer", "danish"]
},
+ {
+ "id": "dacy",
+ "title": "DaCy",
+ "slogan": "An efficient Pipeline for Danish NLP",
+ "description": "DaCy is a Danish preprocessing pipeline trained in SpaCy. It has achieved State-of-the-Art performance on Named entity recognition, part-of-speech tagging and dependency parsing for Danish. This repository contains material for using the DaCy, reproducing the results and guides on usage of the package. Furthermore, it also contains a series of behavioural test for biases and robustness of Danish NLP pipelines.",
+ "github": "centre-for-humanities-computing/DaCy",
+ "pip": "dacy",
+ "code_example": [
+ "import dacy",
+ "print(dacy.models()) # get a list of dacy models",
+ "nlp = dacy.load('medium') # load your spacy pipeline",
+ "",
+ "# DaCy also includes functionality for adding other Danish models to the pipeline",
+ "# For instance you can add the BertTone model for classification of sentiment polarity to the pipeline:",
+ "nlp = add_berttone_polarity(nlp)"
+ ],
+ "thumb": "https://github.com/centre-for-humanities-computing/DaCy/blob/main/img/icon_no_title.png?raw=true",
+ "author": "Centre for Humanities Computing Aarhus",
+ "author_links": {
+ "github": "centre-for-humanities-computing",
+ "website": "https://chcaa.io/#/"
+ },
+ "category": ["pipeline"],
+ "tags": ["pipeline", "danish"]
+ },
+ {
+ "id": "textdescriptives",
+ "title": "TextDescriptives",
+ "slogan": "Extraction of descriptive stats, readability, and syntactic complexity measures",
+ "description": "Pipeline component for spaCy v.3 that calculates descriptive statistics, readability metrics, and syntactic complexity (dependency distance).",
+ "github": "HLasse/TextDescriptives",
+ "pip": "textdescriptives",
+ "code_example": [
+ "import spacy",
+ "import textdescriptives as td",
+ "nlp = spacy.load('en_core_web_sm')",
+ "nlp.add_pipe('textdescriptives')",
+ "doc = nlp('This is a short test text')",
+ "doc._.readability # access some of the values",
+ "td.extract_df(doc) # extract all metrics to DataFrame"
+ ],
+ "author": "Lasse Hansen, Kenneth Enevoldsen, Ludvig Olsen",
+ "author_links": {
+ "github": "HLasse"
+ },
+ "category": ["pipeline"],
+ "tags": ["pipeline", "readability", "syntactic complexity", "descriptive statistics"]
+ },
{
"id": "wmd-relax",
"slogan": "Calculates word mover's distance insanely fast",
@@ -1035,6 +1148,26 @@
},
"category": ["visualizers"]
},
+ {
+ "id": "deplacy",
+ "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis",
+ "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
+ "github": "KoichiYasuoka/deplacy",
+ "image": "https://i.imgur.com/6uOI4Op.png",
+ "code_example": [
+ "import spacy",
+ "import deplacy",
+ "",
+ "nlp=spacy.load('en_core_web_sm')",
+ "doc=nlp('I saw a horse yesterday which had no name.')",
+ "deplacy.render(doc)"
+ ],
+ "author": "Koichi Yasuoka",
+ "author_links": {
+ "github": "KoichiYasuoka"
+ },
+ "category": ["visualizers"]
+ },
{
"id": "scattertext",
"slogan": "Beautiful visualizations of how language differs among document types",
@@ -1150,7 +1283,7 @@
"description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
"github": "chartbeat-labs/textacy",
"pip": "textacy",
- "url": "https://chartbeat-labs.github.io/textacy/",
+ "url": "https://github.com/chartbeat-labs/textacy",
"author": "Burton DeWilde",
"author_links": {
"github": "bdewilde",
@@ -1243,20 +1376,19 @@
"url": "https://explosion.ai/demos/sense2vec",
"code_example": [
"import spacy",
- "from sense2vec import Sense2VecComponent",
"",
- "nlp = spacy.load('en')",
- "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
- "nlp.add_pipe(s2v)",
+ "nlp = spacy.load(\"en_core_web_sm\")",
+ "s2v = nlp.add_pipe(\"sense2vec\")",
+ "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")",
"",
"doc = nlp(\"A sentence about natural language processing.\")",
- "assert doc[3].text == 'natural language processing'",
- "freq = doc[3]._.s2v_freq",
- "vector = doc[3]._.s2v_vec",
- "most_similar = doc[3]._.s2v_most_similar(3)",
- "# [(('natural language processing', 'NOUN'), 1.0),",
- "# (('machine learning', 'NOUN'), 0.8986966609954834),",
- "# (('computer vision', 'NOUN'), 0.8636297583580017)]"
+ "assert doc[3:6].text == \"natural language processing\"",
+ "freq = doc[3:6]._.s2v_freq",
+ "vector = doc[3:6]._.s2v_vec",
+ "most_similar = doc[3:6]._.s2v_most_similar(3)",
+ "# [(('machine learning', 'NOUN'), 0.8986967),",
+ "# (('computer vision', 'NOUN'), 0.8636297),",
+ "# (('deep learning', 'NOUN'), 0.8573361)]"
],
"category": ["pipeline", "standalone", "visualizers"],
"tags": ["vectors"],
@@ -1357,7 +1489,7 @@
},
"category": ["nonpython"],
"tags": ["ruby"]
- },
+ },
{
"id": "spacy_api",
"slogan": "Server/client to load models in a separate, dedicated process",
@@ -1563,6 +1695,38 @@
"author": "Bhargav Srinivasa-Desikan",
"category": ["books"]
},
+ {
+ "type": "education",
+ "id": "mastering-spacy",
+ "title": "Mastering spaCy",
+ "slogan": "Packt, 2021",
+ "description": "This is your ultimate spaCy book. Master the crucial skills to use spaCy components effectively to create real-world NLP applications with spaCy. Explaining linguistic concepts such as dependency parsing, POS-tagging and named entity extraction with many examples, this book will help you to conquer computational linguistics with spaCy. The book further focuses on ML topics with Keras and Tensorflow. You'll cover popular topics, including intent recognition, sentiment analysis and context resolution; and use them on popular datasets and interpret the results. A special hands-on section on chatbot design is included.",
+ "github": "PacktPublishing/Mastering-spaCy",
+ "cover": "https://tinyimg.io/i/aWEm0dh.jpeg",
+ "url": "https://www.amazon.com/Mastering-spaCy-end-end-implementing/dp/1800563353",
+ "author": "Duygu Altinok",
+ "author_links": {
+ "github": "DuyguA",
+ "website": "https://www.linkedin.com/in/duygu-altinok-4021389a"
+ },
+ "category": ["books"]
+ },
+ {
+ "type": "education",
+ "id": "applied-nlp-in-enterprise",
+ "title": "Applied Natural Language Processing in the Enterprise: Teaching Machines to Read, Write, and Understand",
+ "slogan": "O'Reilly, 2021",
+ "description": "Natural language processing (NLP) is one of the hottest topics in AI today. Having lagged behind other deep learning fields such as computer vision for years, NLP only recently gained mainstream popularity. Even though Google, Facebook, and OpenAI have open sourced large pretrained language models to make NLP easier, many organizations today still struggle with developing and productionizing NLP applications. This hands-on guide helps you learn the field quickly.",
+ "github": "nlpbook/nlpbook",
+ "cover": "https://i.imgur.com/6RxLBvf.jpg",
+ "url": "https://www.amazon.com/dp/149206257X",
+ "author": "Ankur A. Patel",
+ "author_links": {
+ "github": "aapatel09",
+ "website": "https://www.ankurapatel.io"
+ },
+ "category": ["books"]
+ },
{
"type": "education",
"id": "learning-path-spacy",
@@ -1574,6 +1738,16 @@
"author": "Aaron Kramer",
"category": ["courses"]
},
+ {
+ "type": "education",
+ "id": "introduction-into-spacy-3",
+ "title": "Introduction to spaCy 3",
+ "slogan": "A free course for beginners by Dr. W.J.B. Mattingly",
+ "url": "http://spacy.pythonhumanities.com/",
+ "thumb": "https://spacy.pythonhumanities.com/_static/freecodecamp_small.jpg",
+ "author": "Dr. W.J.B. Mattingly",
+ "category": ["courses"]
+ },
{
"type": "education",
"id": "spacy-course",
@@ -1591,6 +1765,23 @@
},
"category": ["courses"]
},
+ {
+ "type": "education",
+ "id": "applt-course",
+ "title": "Applied Language Technology",
+ "slogan": "NLP for newcomers using spaCy and Stanza",
+ "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.",
+ "url": "https://applied-language-technology.mooc.fi",
+ "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg",
+ "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png",
+ "author": "Tuomo Hiippala",
+ "author_links": {
+ "twitter": "tuomo_h",
+ "github": "thiippal",
+ "website": "https://www.mv.helsinki.fi/home/thiippal/"
+ },
+ "category": ["courses"]
+ },
{
"type": "education",
"id": "video-spacys-ner-model",
@@ -1974,11 +2165,9 @@
"github": "nikitakit/self-attentive-parser",
"pip": "benepar",
"code_example": [
- "import spacy",
- "from benepar.spacy_plugin import BeneparComponent",
- "",
- "nlp = spacy.load('en')",
- "nlp.add_pipe(BeneparComponent('benepar_en'))",
+ "import benepar, spacy",
+ "nlp = spacy.load('en_core_web_md')",
+ "nlp.add_pipe('benepar', config={'model': 'benepar_en3'})",
"doc = nlp('The time for action is now. It is never too late to do something.')",
"sent = list(doc.sents)[0]",
"print(sent._.parse_string)",
@@ -2442,6 +2631,75 @@
"website": "https://explosion.ai"
}
},
+ {
+ "id": "spacy-huggingface-hub",
+ "title": "spacy-huggingface-hub",
+ "slogan": "Push your spaCy pipelines to the Hugging Face Hub",
+ "description": "This package provides a CLI command for uploading any trained spaCy pipeline packaged with [`spacy package`](https://spacy.io/api/cli#package) to the [Hugging Face Hub](https://huggingface.co). It auto-generates all meta information for you, uploads a pretty README (requires spaCy v3.1+) and handles version control under the hood.",
+ "github": "explosion/spacy-huggingface-hub",
+ "thumb": "https://i.imgur.com/j6FO9O6.jpg",
+ "url": "https://github.com/explosion/spacy-huggingface-hub",
+ "pip": "spacy-huggingface-hub",
+ "category": ["pipeline", "models"],
+ "author": "Explosion",
+ "author_links": {
+ "twitter": "explosion_ai",
+ "github": "explosion",
+ "website": "https://explosion.ai"
+ }
+ },
+ {
+ "id": "spacy-clausie",
+ "title": "spacy-clausie",
+ "slogan": "Implementation of the ClausIE information extraction system for Python+spaCy",
+ "github": "mmxgn/spacy-clausie",
+ "url": "https://github.com/mmxgn/spacy-clausie",
+ "description": "ClausIE, a novel, clause-based approach to open information extraction, which extracts relations and their arguments from natural language text",
+ "category": ["pipeline", "scientific", "research"],
+ "code_example": [
+ "import spacy",
+ "import claucy",
+ "",
+ "nlp = spacy.load(\"en\")",
+ "claucy.add_to_pipe(nlp)",
+ "",
+ "doc = nlp(\"AE died in Princeton in 1955.\")",
+ "",
+ "print(doc._.clauses)",
+ "# Output:",
+ "# ",
+ "",
+ "propositions = doc._.clauses[0].to_propositions(as_text=True)",
+ "",
+ "print(propositions)",
+ "# Output:",
+ "# [AE died in Princeton in 1955, AE died in 1955, AE died in Princeton"
+ ],
+ "author": "Emmanouil Theofanis Chourdakis",
+ "author_links": {
+ "github": "mmxgn"
+ }
+ },
+ {
+ "id": "ipymarkup",
+ "slogan": "NER, syntax markup visualizations",
+ "description": "Collection of NLP visualizations for NER and syntax tree markup. Similar to [displaCy](https://explosion.ai/demos/displacy) and [displaCy ENT](https://explosion.ai/demos/displacy-ent).",
+ "github": "natasha/ipymarkup",
+ "image": "https://github.com/natasha/ipymarkup/blob/master/table.png?raw=true",
+ "pip":"pip install ipymarkup",
+ "code_example": [
+ "from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup",
+ "",
+ "text = 'В мероприятии примут участие не только российские учёные, но и зарубежные исследователи, в том числе, Крис Хелмбрехт - управляющий директор и совладелец креативного агентства Kollektiv (Германия, США), Ннека Угбома - руководитель проекта Mushroom works (Великобритания), Гергей Ковач - политик и лидер субкультурной партии «Dog with two tails» (Венгрия), Георг Жено - немецкий режиссёр, один из создателей экспериментального театра «Театр.doc», Театра им. Йозефа Бойса (Германия).'",
+ "spans = [(102, 116, 'PER'), (186, 194, 'LOC'), (196, 199, 'LOC'), (202, 214, 'PER'), (254, 268, 'LOC'), (271, 283, 'PER'), (324, 342, 'ORG'), (345, 352, 'LOC'), (355, 365, 'PER'), (445, 455, 'ORG'), (456, 468, 'PER'), (470, 478, 'LOC')]",
+ "show_span_ascii_markup(text, spans)"
+ ],
+ "author": "Alexander Kukushkin",
+ "author_links": {
+ "github": "kuk"
+ },
+ "category": ["visualizers"]
+ },
{
"id": "negspacy",
"title": "negspaCy",
@@ -2529,6 +2787,54 @@
"website": "https://yanaiela.github.io"
}
},
+ {
+ "id": "Healthsea",
+ "title": "Healthsea",
+ "slogan": "Healthsea: an end-to-end spaCy pipeline for exploring health supplement effects",
+ "description": "This spaCy project trains an NER model and a custom Text Classification model with Clause Segmentation and Blinding capabilities to analyze supplement reviews and their potential effects on health.",
+ "github": "explosion/healthsea",
+ "thumb": "https://github.com/explosion/healthsea/blob/main/img/Jellyfish.png",
+ "category": ["pipeline", "research"],
+ "code_example": [
+ "import spacy",
+ "",
+ "nlp = spacy.load(\"en_healthsea\")",
+ "doc = nlp(\"This is great for joint pain.\")",
+ "",
+ "# Clause Segmentation & Blinding",
+ "print(doc._.clauses)",
+ "",
+ "> {",
+ "> \"split_indices\": [0, 7],",
+ "> \"has_ent\": true,",
+ "> \"ent_indices\": [4, 6],",
+ "> \"blinder\": \"_CONDITION_\",",
+ "> \"ent_name\": \"joint pain\",",
+ "> \"cats\": {",
+ "> \"POSITIVE\": 0.9824668169021606,",
+ "> \"NEUTRAL\": 0.017364952713251114,",
+ "> \"NEGATIVE\": 0.00002889777533710003,",
+ "> \"ANAMNESIS\": 0.0001394189748680219",
+ "> \"prediction_text\": [\"This\", \"is\", \"great\", \"for\", \"_CONDITION_\", \"!\"]",
+ "> }",
+ "",
+ "# Aggregated results",
+ "> {",
+ "> \"joint_pain\": {",
+ "> \"effects\": [\"POSITIVE\"],",
+ "> \"effect\": \"POSITIVE\",",
+ "> \"label\": \"CONDITION\",",
+ "> \"text\": \"joint pain\"",
+ "> }",
+ "> }"
+ ],
+ "author": "Edward Schmuhl",
+ "author_links": {
+ "github": "thomashacker",
+ "twitter": "aestheticedwar1",
+ "website": "https://explosion.ai/"
+ }
+ },
{
"id": "presidio",
"title": "Presidio",
@@ -2741,11 +3047,10 @@
"github": "thomasthiebaud/spacy-fastlang",
"pip": "spacy_fastlang",
"code_example": [
- "import spacy",
- "from spacy_fastlang import LanguageDetector",
+ "import spacy_fastlang",
"",
- "nlp = spacy.load('en_core_web_sm')",
- "nlp.add_pipe(LanguageDetector())",
+ "nlp = spacy.load(\"en_core_web_sm\")",
+ "nlp.add_pipe(\"language_detector\")",
"doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')",
"",
"assert doc._.language == 'en'",
@@ -3116,6 +3421,65 @@
"category": ["research", "standalone", "scientific"],
"tags": ["Text Analytics", "Coherence", "Cohesion"]
},
+ {
+ "id": "lingfeat",
+ "title": "LingFeat",
+ "slogan": "A Linguistic Feature Extraction (Text Analysis) Tool for Readability Assessment and Text Simplification",
+ "description": "LingFeat is a feature extraction library which currently extracts 255 linguistic features from English string input. Categories include syntax, semantics, discourse, and also traditional readability formulas. Published in EMNLP 2021.",
+ "github": "brucewlee/lingfeat",
+ "pip": "lingfeat",
+ "code_example": [
+ "from lingfeat import extractor",
+ "",
+ "",
+ "text = 'TAEAN, South Chungcheong Province -- Just before sunup, Lee Young-ho, a seasoned fisherman with over 30 years of experience, silently waits for boats carrying blue crabs as the season for the seafood reaches its height. Soon afterward, small and big boats sail into Sinjin Port in Taean County, South Chungcheong Province, the second-largest source of blue crab after Incheon, accounting for 29 percent of total production of the country. A crane lifts 28 boxes filled with blue crabs weighing 40 kilograms each from the boat, worth about 10 million won ($8,500). “It has been a productive fall season for crabbing here. The water temperature is a very important factor affecting crab production. They hate cold water,” Lee said. The temperature of the sea off Taean appeared to have stayed at the level where crabs become active. If the sea temperature suddenly drops, crabs go into their winter dormancy mode, burrowing into the mud and sleeping through the cold months.'",
+ "",
+ "",
+ "#Pass text",
+ "LingFeat = extractor.pass_text(text)",
+ "",
+ "",
+ "#Preprocess text",
+ "LingFeat.preprocess()",
+ "",
+ "",
+ "#Extract features",
+ "#each method returns a dictionary of the corresponding features",
+ "#Advanced Semantic (AdSem) Features",
+ "WoKF = LingFeat.WoKF_() #Wikipedia Knowledge Features",
+ "WBKF = LingFeat.WBKF_() #WeeBit Corpus Knowledge Features",
+ "OSKF = LingFeat.OSKF_() #OneStopEng Corpus Knowledge Features",
+ "",
+ "#Discourse (Disco) Features",
+ "EnDF = LingFeat.EnDF_() #Entity Density Features",
+ "EnGF = LingFeat.EnGF_() #Entity Grid Features",
+ "",
+ "#Syntactic (Synta) Features",
+ "PhrF = LingFeat.PhrF_() #Noun/Verb/Adj/Adv/... Phrasal Features",
+ "TrSF = LingFeat.TrSF_() #(Parse) Tree Structural Features",
+ "POSF = LingFeat.POSF_() #Noun/Verb/Adj/Adv/... Part-of-Speech Features",
+ "",
+ "#Lexico Semantic (LxSem) Features",
+ "TTRF = LingFeat.TTRF_() #Type Token Ratio Features",
+ "VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features",
+ "PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)",
+ "WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)",
+ "",
+ "Shallow Traditional (ShTra) Features",
+ "ShaF = LingFeat.ShaF_() #Shallow Features (e.g. avg number of tokens)",
+ "TraF = LingFeat.TraF_() #Traditional Formulas"
+ ],
+ "code_language": "python",
+ "thumb": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo2.png",
+ "image": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo.png",
+ "author": "Bruce W. Lee (이웅성)",
+ "author_links": {
+ "github": "brucewlee",
+ "website": "https://brucewlee.github.io/"
+ },
+ "category": ["research", "scientific"],
+ "tags": ["Readability", "Simplification", "Feature Extraction", "Syntax", "Discourse", "Semantics", "Lexical"]
+ },
{
"id": "hmrb",
"title": "Hammurabi",
@@ -3124,33 +3488,61 @@
"github": "babylonhealth/hmrb",
"pip": "hmrb",
"code_example": [
- "import spacy # __version__ 3.0+",
+ "import spacy",
"from hmrb.core import SpacyCore",
"",
+ "nlp = spacy.load(\"en_core_web_sm\")",
+ "sentences = \"I love gorillas. Peter loves gorillas. Jane loves Tarzan.\"",
+ "",
+ "def conj_be(subj: str) -> str:",
+ " if subj == \"I\":",
+ " return \"am\"",
+ " elif subj == \"you\":",
+ " return \"are\"",
+ " else:",
+ " return \"is\"",
+ "",
+ "@spacy.registry.callbacks(\"gorilla_callback\")",
+ "def gorilla_clb(seq: list, span: slice, data: dict) -> None:",
+ " subj = seq[span.start].text",
+ " be = conj_be(subj)",
+ " print(f\"{subj} {be} a gorilla person.\")",
+ "@spacy.registry.callbacks(\"lover_callback\")",
+ "def lover_clb(seq: list, span: slice, data: dict) -> None:",
+ " print(f\"{seq[span][-1].text} is a love interest of {seq[span.start].text}.\")",
+ "",
"grammar = \"\"\"",
- "Var is_hurting:",
- "(",
- " optional (lemma: \"be\")",
- " (lemma: \"hurt\")",
- ")",
- "Law:",
- " - package: \"headache\"",
- " - callback: \"mark_headache\"",
- "(",
- " (lemma: \"head\", pos: \"NOUN\")",
- " $is_hurting",
- ")\"\"\"",
+ " Law:",
+ " - callback: \"loves_gorilla\"",
+ " (",
+ " ((pos: \"PROPN\") or (pos: \"PRON\"))",
+ " (lemma: \"love\")",
+ " (lemma: \"gorilla\")",
+ " )",
+ " Law:",
+ " - callback: \"loves_someone\"",
+ " (",
+ " (pos: \"PROPN\")",
+ " (lower: \"loves\")",
+ " (pos: \"PROPN\")",
+ " )",
+ "\"\"\"",
+ "",
+ "@spacy.registry.augmenters(\"jsonify_span\")",
+ "def jsonify_span(span):",
+ " return [{\"lemma\": token.lemma_, \"pos\": token.pos_, \"lower\": token.lower_} for token in span]",
"",
"conf = {",
- " \"rules\": grammar",
+ " \"rules\": grammar,",
" \"callbacks\": {",
- " \"mark_headache\": \"callbacks.headache_handler\",",
- " },",
+ " \"loves_gorilla\": \"callbacks.gorilla_callback\",",
+ " \"loves_someone\": \"callbacks.lover_callback\",",
+ " },",
" \"map_doc\": \"augmenters.jsonify_span\",",
" \"sort_length\": True,",
"}",
- "nlp = spacy.load(\"en_core_web_sm\")",
- "nlp.add_pipe(\"hammurabi\", config=conf)",
+ "",
+ "nlp.add_pipe(\"hmrb\", config=conf)",
"nlp(sentences)"
],
"code_language": "python",
@@ -3171,15 +3563,17 @@
"slogan": "Forte is a toolkit for building Natural Language Processing pipelines, featuring cross-task interaction, adaptable data-model interfaces and composable pipelines.",
"description": "Forte provides a platform to assemble state-of-the-art NLP and ML technologies in a highly-composable fashion, including a wide spectrum of tasks ranging from Information Retrieval, Natural Language Understanding to Natural Language Generation.",
"github": "asyml/forte",
- "pip": "forte.spacy torch",
+ "pip": "forte.spacy stave torch",
"code_example": [
- "from forte.spacy import SpacyProcessor",
+ "from fortex.spacy import SpacyProcessor",
+ "from forte.processors.stave import StaveProcessor",
"from forte import Pipeline",
"from forte.data.readers import StringReader",
"",
"pipeline = Pipeline()",
"pipeline.set_reader(StringReader())",
"pipeline.add(SpacyProcessor())",
+ "pipeline.add(StaveProcessor())",
"pipeline.run('Running SpaCy with Forte!')"
],
"code_language": "python",
@@ -3194,6 +3588,114 @@
},
"category": ["pipeline", "standalone"],
"tags": ["pipeline"]
+ },
+ {
+ "id": "spacy-api-docker-v3",
+ "slogan": "spaCy v3 REST API, wrapped in a Docker container",
+ "github": "bbieniek/spacy-api-docker",
+ "url": "https://hub.docker.com/r/bbieniek/spacyapi/",
+ "thumb": "https://i.imgur.com/NRnDKyj.jpg",
+ "code_example": [
+ "version: '3'",
+ "",
+ "services:",
+ " spacyapi:",
+ " image: bbieniek/spacyapi:en_v3",
+ " ports:",
+ " - \"127.0.0.1:8080:80\"",
+ " restart: always"
+ ],
+ "code_language": "docker",
+ "author": "Baltazar Bieniek",
+ "author_links": {
+ "github": "bbieniek"
+ },
+ "category": ["apis"]
+ },
+ {
+ "id": "phruzz_matcher",
+ "title": "phruzz-matcher",
+ "slogan": "Phrase matcher using RapidFuzz",
+ "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.",
+ "github": "mjvallone/phruzz-matcher",
+ "pip": "phruzz_matcher",
+ "code_example": [
+ "import spacy",
+ "from spacy.language import Language",
+ "from phruzz_matcher.phrase_matcher import PhruzzMatcher",
+ "",
+ "famous_people = [",
+ " \"Brad Pitt\",",
+ " \"Demi Moore\",",
+ " \"Bruce Willis\",",
+ " \"Jim Carrey\",",
+ "]",
+ "",
+ "@Language.factory(\"phrase_matcher\")",
+ "def phrase_matcher(nlp: Language, name: str):",
+ " return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)",
+ "",
+ "nlp = spacy.blank('es')",
+ "nlp.add_pipe(\"phrase_matcher\")",
+ "",
+ "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")",
+ "print(f\"doc.ents: {doc.ents}\")",
+ "",
+ "#OUTPUT",
+ "#doc.ents: (brad pit, Demi Moore)"
+ ],
+ "thumb": "https://avatars.githubusercontent.com/u/961296?v=4",
+ "image": "",
+ "code_language": "python",
+ "author": "Martin Vallone",
+ "author_links": {
+ "github": "mjvallone",
+ "twitter": "vallotin",
+ "website": "https://fiqus.coop/"
+ },
+ "category": ["pipeline", "research", "standalone"],
+ "tags": ["spacy", "python", "nlp", "ner"]
+ },
+ {
+ "id": "WordDumb",
+ "title": "WordDumb",
+ "slogan": "A calibre plugin that generates Word Wise and X-Ray files.",
+ "description": "A calibre plugin that generates Word Wise and X-Ray files then sends them to Kindle. Supports KFX, AZW3 and MOBI eBooks. X-Ray supports 18 languages.",
+ "github": "xxyzz/WordDumb",
+ "code_language": "python",
+ "thumb": "https://raw.githubusercontent.com/xxyzz/WordDumb/master/starfish.svg",
+ "image": "https://user-images.githubusercontent.com/21101839/130245435-b874f19a-7785-4093-9975-81596efc42bb.png",
+ "author": "xxyzz",
+ "author_links": {
+ "github": "xxyzz"
+ },
+ "category": ["standalone"]
+ },
+ {
+ "id": "eng_spacysentiment",
+ "title": "eng_spacysentiment",
+ "slogan": "Simple sentiment analysis using spaCy pipelines",
+ "description": "Sentiment analysis for simple english sentences using pre-trained spaCy pipelines",
+ "github": "vishnunkumar/spacysentiment",
+ "pip": "eng-spacysentiment",
+ "code_example": [
+ "import eng_spacysentiment",
+ "nlp = eng_spacysentiment.load()",
+ "text = \"Welcome to Arsenals official YouTube channel Watch as we take you closer and show you the personality of the club\"",
+ "doc = nlp(text)",
+ "print(doc.cats)",
+ "# {'positive': 0.29878824949264526, 'negative': 0.7012117505073547}"
+ ],
+ "thumb": "",
+ "image": "",
+ "code_language": "python",
+ "author": "Vishnu Nandakumar",
+ "author_links": {
+ "github": "Vishnunkumar",
+ "twitter": "vishnun_uchiha"
+ },
+ "category": ["pipeline"],
+ "tags": ["pipeline", "nlp", "sentiment"]
}
],
@@ -3226,6 +3728,11 @@
"title": "Scientific",
"description": "Frameworks and utilities for scientific text processing"
},
+ {
+ "id": "biomedical",
+ "title": "Biomedical",
+ "description": "Frameworks and utilities for processing biomedical text"
+ },
{
"id": "visualizers",
"title": "Visualizers",
diff --git a/website/src/components/embed.js b/website/src/components/embed.js
index 8d82bfaae..9f959bc99 100644
--- a/website/src/components/embed.js
+++ b/website/src/components/embed.js
@@ -3,6 +3,7 @@ import PropTypes from 'prop-types'
import classNames from 'classnames'
import Link from './link'
+import Button from './button'
import { InlineCode } from './code'
import { markdownToReact } from './util'
@@ -104,4 +105,23 @@ const Image = ({ src, alt, title, ...props }) => {
)
}
-export { YouTube, SoundCloud, Iframe, Image }
+const GoogleSheet = ({ id, link, height, button = 'View full table' }) => {
+    return (
+        <figure className={classes['google-sheet']}>
+            <iframe
+                title={id}
+                src={`https://docs.google.com/spreadsheets/d/e/${id}/pubhtml?widget=true&headers=false`}
+                frameBorder={0}
+                height={height}
+            />
+            {link && (
+                <Button to={link} variant="primary">
+                    {button}
+                </Button>
+            )}
+        </figure>
+    )
+}
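+
+// Example MDX usage (illustrative values; the props mirror the signature above):
+// <GoogleSheet id="<published-sheet-id>" link="https://docs.google.com/spreadsheets/d/<sheet-id>" height={480} />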
+
+export { YouTube, SoundCloud, Iframe, Image, GoogleSheet }
diff --git a/website/src/components/search.js b/website/src/components/search.js
index eeab9ef40..65d6f235a 100644
--- a/website/src/components/search.js
+++ b/website/src/components/search.js
@@ -6,13 +6,14 @@ import Icon from './icon'
import classes from '../styles/search.module.sass'
export default function Search({ id = 'docsearch', placeholder = 'Search docs', settings = {} }) {
- const { apiKey, indexName } = settings
+ const { apiKey, indexName, appId } = settings
if (!apiKey && !indexName) return null
const [initialized, setInitialized] = useState(false)
useEffect(() => {
if (!initialized) {
setInitialized(true)
window.docsearch({
+ appId,
apiKey,
indexName,
inputSelector: `#${id}`,
diff --git a/website/src/pages/404.js b/website/src/pages/404.js
index 4bdd43af6..53baebab9 100644
--- a/website/src/pages/404.js
+++ b/website/src/pages/404.js
@@ -41,6 +41,7 @@ export const pageQuery = graphql`
docSearch {
apiKey
indexName
+ appId
}
}
}
diff --git a/website/src/styles/embed.module.sass b/website/src/styles/embed.module.sass
index ba8a896c8..1eaf7b8d2 100644
--- a/website/src/styles/embed.module.sass
+++ b/website/src/styles/embed.module.sass
@@ -32,3 +32,7 @@
.image-link
display: block
+
+.google-sheet
+ width: 100%
+ margin-bottom: 1rem
diff --git a/website/src/templates/index.js b/website/src/templates/index.js
index 2c68ff056..dfd59e424 100644
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@@ -29,7 +29,7 @@ import Aside from '../components/aside'
import Button from '../components/button'
import Tag from '../components/tag'
import Grid from '../components/grid'
-import { YouTube, SoundCloud, Iframe, Image } from '../components/embed'
+import { YouTube, SoundCloud, Iframe, Image, GoogleSheet } from '../components/embed'
import Alert from '../components/alert'
import Search from '../components/search'
import Project from '../widgets/project'
@@ -72,6 +72,7 @@ const scopeComponents = {
YouTube,
SoundCloud,
Iframe,
+ GoogleSheet,
Abbr,
Tag,
Accordion,
@@ -119,8 +120,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
-    <Link to="/usage/v3-1" hidden>
-        💥 Out now: spaCy v3.1
+    <Link to="/usage/v3-2" hidden>
+        💥 Out now: spaCy v3.2
     </Link>
)
@@ -234,6 +235,7 @@ export const pageQuery = graphql`
docSearch {
apiKey
indexName
+ appId
}
}
}
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 21ade5e36..69cec3376 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -31,23 +31,26 @@ const COMPONENT_LINKS = {
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
- core_sm: 'Vocabulary, syntax, entities',
+ core_no_vectors: 'Vocabulary, syntax, entities',
dep: 'Vocabulary, syntax',
ent: 'Named entities',
+ sent: 'Sentence boundaries',
pytt: 'PyTorch Transformers',
trf: 'Transformers',
vectors: 'Word vectors',
web: 'written text (blogs, news, comments)',
news: 'written text (news, media)',
wiki: 'Wikipedia',
- uas: 'Unlabelled dependencies',
- las: 'Labelled dependencies',
- dep_uas: 'Unlabelled dependencies',
- dep_las: 'Labelled dependencies',
+ uas: 'Unlabeled dependencies',
+ las: 'Labeled dependencies',
+ dep_uas: 'Unlabeled dependencies',
+ dep_las: 'Labeled dependencies',
token_acc: 'Tokenization',
tok: 'Tokenization',
lemma: 'Lemmatization',
morph: 'Morphological analysis',
+ lemma_acc: 'Lemmatization',
+ morph_acc: 'Morphological analysis',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
tag_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
@@ -114,8 +117,8 @@ function formatVectors(data) {
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
}
-function formatAccuracy(data) {
- const exclude = ['speed']
+function formatAccuracy(data, lang) {
+ const exclude = (lang !== "ja") ? ['speed'] : ['speed', 'morph_acc']
if (!data) return []
return Object.keys(data)
.map(label => {
@@ -146,8 +149,7 @@ function formatModelMeta(data) {
license: data.license,
labels: isEmptyObj(data.labels) ? null : data.labels,
vectors: formatVectors(data.vectors),
- // TODO: remove accuracy fallback
- accuracy: formatAccuracy(data.accuracy || data.performance),
+ accuracy: formatAccuracy(data.performance, data.lang),
}
}
@@ -195,6 +197,7 @@ const Model = ({
const [isError, setIsError] = useState(true)
const [meta, setMeta] = useState({})
const { type, genre, size } = getModelComponents(name)
+ const display_type = type === 'core' && (size === 'sm' || size === 'trf') ? 'core_no_vectors' : type
const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
name,
compatibility,
@@ -231,7 +234,7 @@ const Model = ({
const rows = [
{ label: 'Language', tag: langId, content: langName },
- { label: 'Type', tag: type, content: MODEL_META[type] },
+ { label: 'Type', tag: type, content: MODEL_META[display_type] },
{ label: 'Genre', tag: genre, content: MODEL_META[genre] },
{ label: 'Size', tag: size, content: meta.sizeFull },
{ label: 'Components', content: components, help: MODEL_META.components },
diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js
index cfc8fdd0e..10f2520d9 100644
--- a/website/src/templates/universe.js
+++ b/website/src/templates/universe.js
@@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
+import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
-import Section from '../components/section'
+import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
)}
+                    <Hr />
+                    <H3>Found a mistake or something isn't working?</H3>
+                    <p>
+                        If you've come across a universe project that isn't working or is
+                        incompatible with the reported spaCy version, let us know by{' '}
+                        <Link to="https://github.com/explosion/spaCy/discussions">
+                            opening a discussion thread
+                        </Link>
+                        .
+                    </p>
+