mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-18 20:22:25 +03:00
Merge branch 'master' into feature/coref
This brings coref up to date, in particular giving access to 3.2 features.
This commit is contained in:
commit
c7f586c4ba
5
.github/ISSUE_TEMPLATE/config.yml
vendored
5
.github/ISSUE_TEMPLATE/config.yml
vendored
|
@ -1,8 +1,11 @@
|
||||||
blank_issues_enabled: false
|
blank_issues_enabled: false
|
||||||
contact_links:
|
contact_links:
|
||||||
|
- name: ⚠️ Python 3.10 Support
|
||||||
|
url: https://github.com/explosion/spaCy/discussions/9418
|
||||||
|
about: Python 3.10 wheels haven't been released yet, see the link for details.
|
||||||
- name: 🗯 Discussions Forum
|
- name: 🗯 Discussions Forum
|
||||||
url: https://github.com/explosion/spaCy/discussions
|
url: https://github.com/explosion/spaCy/discussions
|
||||||
about: Usage questions, general discussion and anything else that isn't a bug report.
|
about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
|
||||||
- name: 📖 spaCy FAQ & Troubleshooting
|
- name: 📖 spaCy FAQ & Troubleshooting
|
||||||
url: https://github.com/explosion/spaCy/discussions/8226
|
url: https://github.com/explosion/spaCy/discussions/8226
|
||||||
about: Before you post, check out the FAQ for answers to common community questions!
|
about: Before you post, check out the FAQ for answers to common community questions!
|
||||||
|
|
2
.github/PULL_REQUEST_TEMPLATE.md
vendored
2
.github/PULL_REQUEST_TEMPLATE.md
vendored
|
@ -14,6 +14,6 @@ or new feature, or a change to the documentation? -->
|
||||||
## Checklist
|
## Checklist
|
||||||
<!--- Before you submit the PR, go over this checklist and make sure you can
|
<!--- Before you submit the PR, go over this checklist and make sure you can
|
||||||
tick off all the boxes. [] -> [x] -->
|
tick off all the boxes. [] -> [x] -->
|
||||||
- [ ] I have submitted the spaCy Contributor Agreement.
|
- [ ] I confirm that I have the right to submit this contribution under the project's MIT license.
|
||||||
- [ ] I ran the tests, and all new and existing tests passed.
|
- [ ] I ran the tests, and all new and existing tests passed.
|
||||||
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
|
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
|
||||||
|
|
15
.github/azure-steps.yml
vendored
15
.github/azure-steps.yml
vendored
|
@ -25,6 +25,10 @@ steps:
|
||||||
${{ parameters.prefix }} python setup.py sdist --formats=gztar
|
${{ parameters.prefix }} python setup.py sdist --formats=gztar
|
||||||
displayName: "Compile and build sdist"
|
displayName: "Compile and build sdist"
|
||||||
|
|
||||||
|
- script: python -m mypy spacy
|
||||||
|
displayName: 'Run mypy'
|
||||||
|
condition: ne(variables['python_version'], '3.10')
|
||||||
|
|
||||||
- task: DeleteFiles@1
|
- task: DeleteFiles@1
|
||||||
inputs:
|
inputs:
|
||||||
contents: "spacy"
|
contents: "spacy"
|
||||||
|
@ -100,3 +104,14 @@ steps:
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
displayName: 'Test assemble CLI vectors warning'
|
displayName: 'Test assemble CLI vectors warning'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
|
- script: |
|
||||||
|
python .github/validate_universe_json.py website/meta/universe.json
|
||||||
|
displayName: 'Test website/meta/universe.json'
|
||||||
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
|
- script: |
|
||||||
|
${{ parameters.prefix }} python -m pip install thinc-apple-ops
|
||||||
|
${{ parameters.prefix }} python -m pytest --pyargs spacy
|
||||||
|
displayName: "Run CPU tests with thinc-apple-ops"
|
||||||
|
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
|
||||||
|
|
106
.github/contributors/Jette16.md
vendored
Normal file
106
.github/contributors/Jette16.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Henriette Behr |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 23.09.2021 |
|
||||||
|
| GitHub username | Jette16 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/KennethEnevoldsen.md
vendored
Normal file
106
.github/contributors/KennethEnevoldsen.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------------- |
|
||||||
|
| Name | Kenneth Enevoldsen |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021-07-13 |
|
||||||
|
| GitHub username | KennethEnevoldsen |
|
||||||
|
| Website (optional) | www.kennethenevoldsen.com |
|
106
.github/contributors/Pantalaymon.md
vendored
Normal file
106
.github/contributors/Pantalaymon.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name |Valentin-Gabriel Soumah|
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021-11-23 |
|
||||||
|
| GitHub username | Pantalaymon |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/avi197.md
vendored
Normal file
106
.github/contributors/avi197.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Son Pham |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 09/10/2021 |
|
||||||
|
| GitHub username | Avi197 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/bbieniek.md
vendored
Normal file
106
.github/contributors/bbieniek.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Baltazar Bieniek |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021.08.19 |
|
||||||
|
| GitHub username | bbieniek |
|
||||||
|
| Website (optional) | https://baltazar.bieniek.org.pl/ |
|
106
.github/contributors/connorbrinton.md
vendored
Normal file
106
.github/contributors/connorbrinton.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Connor Brinton |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | July 20th, 2021 |
|
||||||
|
| GitHub username | connorbrinton |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/ezorita.md
vendored
Normal file
106
.github/contributors/ezorita.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Eduard Zorita |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 06/17/2021 |
|
||||||
|
| GitHub username | ezorita |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/fgaim.md
vendored
Normal file
106
.github/contributors/fgaim.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Fitsum Gaim |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021-08-07 |
|
||||||
|
| GitHub username | fgaim |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/hlasse.md
vendored
Normal file
106
.github/contributors/hlasse.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------------- |
|
||||||
|
| Name | Lasse Hansen |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021-08-11 |
|
||||||
|
| GitHub username | HLasse |
|
||||||
|
| Website (optional) | www.lassehansen.me |
|
106
.github/contributors/jmyerston.md
vendored
Normal file
106
.github/contributors/jmyerston.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
| ----------------------------- | ----------------------------------- |
|
||||||
|
| Name | Jacobo Myerston |
|
||||||
|
| Company name (if applicable) | University of California, San Diego |
|
||||||
|
| Title or role (if applicable) | Academic |
|
||||||
|
| Date | 07/05/2021 |
|
||||||
|
| GitHub username | jmyerston |
|
||||||
|
| Website (optional) | diogenet.ucsd.edu |
|
106
.github/contributors/mariosasko.md
vendored
Normal file
106
.github/contributors/mariosasko.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Mario Šaško |
|
||||||
|
| Company name (if applicable) | TakeLab FER |
|
||||||
|
| Title or role (if applicable) | R&D Intern |
|
||||||
|
| Date | 2021-07-12 |
|
||||||
|
| GitHub username | mariosasko |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/nsorros.md
vendored
Normal file
106
.github/contributors/nsorros.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Nick Sorros |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2/8/2021 |
|
||||||
|
| GitHub username | nsorros |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/philipvollet.md
vendored
Normal file
106
.github/contributors/philipvollet.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Philip Vollet |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 22.09.2021 |
|
||||||
|
| GitHub username | philipvollet |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/shigapov.md
vendored
Normal file
106
.github/contributors/shigapov.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Renat Shigapov |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021-09-09 |
|
||||||
|
| GitHub username | shigapov |
|
||||||
|
| Website (optional) | |
|
88
.github/contributors/swfarnsworth.md
vendored
Normal file
88
.github/contributors/swfarnsworth.md
vendored
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Steele Farnsworth |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 13 August, 2021 |
|
||||||
|
| GitHub username | swfarnsworth |
|
||||||
|
| Website (optional) | |
|
||||||
|
|
106
.github/contributors/syrull.md
vendored
Normal file
106
.github/contributors/syrull.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Dimitar Ganev |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021/8/2 |
|
||||||
|
| GitHub username | syrull |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/thomashacker.md
vendored
Normal file
106
.github/contributors/thomashacker.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Edward Schmuhl |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 09.07.2021 |
|
||||||
|
| GitHub username | thomashacker |
|
||||||
|
| Website (optional) | |
|
19
.github/lock.yml
vendored
19
.github/lock.yml
vendored
|
@ -1,19 +0,0 @@
|
||||||
# Configuration for lock-threads - https://github.com/dessant/lock-threads
|
|
||||||
|
|
||||||
# Number of days of inactivity before a closed issue or pull request is locked
|
|
||||||
daysUntilLock: 30
|
|
||||||
|
|
||||||
# Issues and pull requests with these labels will not be locked. Set to `[]` to disable
|
|
||||||
exemptLabels: []
|
|
||||||
|
|
||||||
# Label to add before locking, such as `outdated`. Set to `false` to disable
|
|
||||||
lockLabel: false
|
|
||||||
|
|
||||||
# Comment to post before locking. Set to `false` to disable
|
|
||||||
lockComment: >
|
|
||||||
This thread has been automatically locked since there has not been
|
|
||||||
any recent activity after it was closed. Please open a new issue for
|
|
||||||
related bugs.
|
|
||||||
|
|
||||||
# Limit to only `issues` or `pulls`
|
|
||||||
only: issues
|
|
19
.github/validate_universe_json.py
vendored
Normal file
19
.github/validate_universe_json.py
vendored
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def validate_json(document):
|
||||||
|
universe_file = Path(document)
|
||||||
|
with universe_file.open() as f:
|
||||||
|
universe_data = json.load(f)
|
||||||
|
for entry in universe_data["resources"]:
|
||||||
|
if "github" in entry:
|
||||||
|
assert not re.match(
|
||||||
|
r"^(http:)|^(https:)", entry["github"]
|
||||||
|
), "Github field should be user/repo, not a url"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
validate_json(str(sys.argv[1]))
|
2
.github/workflows/autoblack.yml
vendored
2
.github/workflows/autoblack.yml
vendored
|
@ -9,7 +9,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
autoblack:
|
autoblack:
|
||||||
if: github.repository_owner = 'explosion'
|
if: github.repository_owner == 'explosion'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
|
27
.github/workflows/explosionbot.yml
vendored
Normal file
27
.github/workflows/explosionbot.yml
vendored
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
name: Explosion Bot
|
||||||
|
|
||||||
|
on:
|
||||||
|
issue_comment:
|
||||||
|
types:
|
||||||
|
- created
|
||||||
|
- edited
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
explosion-bot:
|
||||||
|
runs-on: ubuntu-18.04
|
||||||
|
steps:
|
||||||
|
- name: Dump GitHub context
|
||||||
|
env:
|
||||||
|
GITHUB_CONTEXT: ${{ toJson(github) }}
|
||||||
|
run: echo "$GITHUB_CONTEXT"
|
||||||
|
- uses: actions/checkout@v1
|
||||||
|
- uses: actions/setup-python@v1
|
||||||
|
- name: Install and run explosion-bot
|
||||||
|
run: |
|
||||||
|
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
|
||||||
|
python -m explosionbot
|
||||||
|
env:
|
||||||
|
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
|
||||||
|
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
|
||||||
|
ENABLED_COMMANDS: "test_gpu,test_slow"
|
||||||
|
ALLOWED_TEAMS: "spaCy"
|
25
.github/workflows/lock.yml
vendored
Normal file
25
.github/workflows/lock.yml
vendored
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
name: 'Lock Threads'
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * *' # check every day
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
issues: write
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: lock
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
action:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: dessant/lock-threads@v3
|
||||||
|
with:
|
||||||
|
process-only: 'issues'
|
||||||
|
issue-inactive-days: '30'
|
||||||
|
issue-comment: >
|
||||||
|
This thread has been automatically locked since there
|
||||||
|
has not been any recent activity after it was closed.
|
||||||
|
Please open a new issue for related bugs.
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,6 +9,7 @@ keys/
|
||||||
spacy/tests/package/setup.cfg
|
spacy/tests/package/setup.cfg
|
||||||
spacy/tests/package/pyproject.toml
|
spacy/tests/package/pyproject.toml
|
||||||
spacy/tests/package/requirements.txt
|
spacy/tests/package/requirements.txt
|
||||||
|
spacy/tests/universe/universe.json
|
||||||
|
|
||||||
# Website
|
# Website
|
||||||
website/.cache/
|
website/.cache/
|
||||||
|
|
8
CITATION
8
CITATION
|
@ -1,8 +0,0 @@
|
||||||
@software{spacy,
|
|
||||||
author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
|
|
||||||
title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
|
|
||||||
year = 2020,
|
|
||||||
publisher = {Zenodo},
|
|
||||||
doi = {10.5281/zenodo.1212303},
|
|
||||||
url = {https://doi.org/10.5281/zenodo.1212303}
|
|
||||||
}
|
|
16
CITATION.cff
Normal file
16
CITATION.cff
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
cff-version: 1.2.0
|
||||||
|
preferred-citation:
|
||||||
|
type: article
|
||||||
|
message: "If you use spaCy, please cite it as below."
|
||||||
|
authors:
|
||||||
|
- family-names: "Honnibal"
|
||||||
|
given-names: "Matthew"
|
||||||
|
- family-names: "Montani"
|
||||||
|
given-names: "Ines"
|
||||||
|
- family-names: "Van Landeghem"
|
||||||
|
given-names: "Sofie"
|
||||||
|
- family-names: "Boyd"
|
||||||
|
given-names: "Adriane"
|
||||||
|
title: "spaCy: Industrial-strength Natural Language Processing in Python"
|
||||||
|
doi: "10.5281/zenodo.1212303"
|
||||||
|
year: 2020
|
|
@ -140,29 +140,28 @@ Changes to `.py` files will be effective immediately.
|
||||||
|
|
||||||
📖 **For more details and instructions, see the documentation on [compiling spaCy from source](https://spacy.io/usage/#source) and the [quickstart widget](https://spacy.io/usage/#section-quickstart) to get the right commands for your platform and Python version.**
|
📖 **For more details and instructions, see the documentation on [compiling spaCy from source](https://spacy.io/usage/#source) and the [quickstart widget](https://spacy.io/usage/#section-quickstart) to get the right commands for your platform and Python version.**
|
||||||
|
|
||||||
### Contributor agreement
|
|
||||||
|
|
||||||
If you've made a contribution to spaCy, you should fill in the
|
|
||||||
[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
|
|
||||||
your contribution can be used across the project. If you agree to be bound by
|
|
||||||
the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
|
|
||||||
and include it with your pull request, or submit it separately to
|
|
||||||
[`.github/contributors/`](/.github/contributors). The name of the file should be
|
|
||||||
your GitHub username, with the extension `.md`. For example, the user
|
|
||||||
example_user would create the file `.github/contributors/example_user.md`.
|
|
||||||
|
|
||||||
### Fixing bugs
|
### Fixing bugs
|
||||||
|
|
||||||
When fixing a bug, first create an
|
When fixing a bug, first create an
|
||||||
[issue](https://github.com/explosion/spaCy/issues) if one does not already exist.
|
[issue](https://github.com/explosion/spaCy/issues) if one does not already
|
||||||
The description text can be very short – we don't want to make this too
|
exist. The description text can be very short – we don't want to make this too
|
||||||
bureaucratic.
|
bureaucratic.
|
||||||
|
|
||||||
Next, create a test file named `test_issue[ISSUE NUMBER].py` in the
|
Next, add a test to the relevant file in the
|
||||||
[`spacy/tests/regression`](spacy/tests/regression) folder. Test for the bug
|
[`spacy/tests`](spacy/tests)folder. Then add a [pytest
|
||||||
you're fixing, and make sure the test fails. Next, add and commit your test file
|
mark](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers),
|
||||||
referencing the issue number in the commit message. Finally, fix the bug, make
|
`@pytest.mark.issue(NUMBER)`, to reference the issue number.
|
||||||
sure your test passes and reference the issue in your commit message.
|
|
||||||
|
```python
|
||||||
|
# Assume you're fixing Issue #1234
|
||||||
|
@pytest.mark.issue(1234)
|
||||||
|
def test_issue1234():
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
Test for the bug you're fixing, and make sure the test fails. Next, add and
|
||||||
|
commit your test file. Finally, fix the bug, make sure your test passes and
|
||||||
|
reference the issue number in your pull request description.
|
||||||
|
|
||||||
📖 **For more information on how to add tests, check out the [tests README](spacy/tests/README.md).**
|
📖 **For more information on how to add tests, check out the [tests README](spacy/tests/README.md).**
|
||||||
|
|
||||||
|
@ -185,7 +184,6 @@ Each time a `git commit` is initiated, `black` and `flake8` will run automatical
|
||||||
In case of error, or when `black` modified a file, the modified file needs to be `git add` once again and a new
|
In case of error, or when `black` modified a file, the modified file needs to be `git add` once again and a new
|
||||||
`git commit` has to be issued.
|
`git commit` has to be issued.
|
||||||
|
|
||||||
|
|
||||||
### Code formatting
|
### Code formatting
|
||||||
|
|
||||||
[`black`](https://github.com/ambv/black) is an opinionated Python code
|
[`black`](https://github.com/ambv/black) is an opinionated Python code
|
||||||
|
@ -414,14 +412,7 @@ all test files and test functions need to be prefixed with `test_`.
|
||||||
When adding tests, make sure to use descriptive names, keep the code short and
|
When adding tests, make sure to use descriptive names, keep the code short and
|
||||||
concise and only test for one behavior at a time. Try to `parametrize` test
|
concise and only test for one behavior at a time. Try to `parametrize` test
|
||||||
cases wherever possible, use our pre-defined fixtures for spaCy components and
|
cases wherever possible, use our pre-defined fixtures for spaCy components and
|
||||||
avoid unnecessary imports.
|
avoid unnecessary imports. Extensive tests that take a long time should be marked with `@pytest.mark.slow`.
|
||||||
|
|
||||||
Extensive tests that take a long time should be marked with `@pytest.mark.slow`.
|
|
||||||
Tests that require the model to be loaded should be marked with
|
|
||||||
`@pytest.mark.models`. Loading the models is expensive and not necessary if
|
|
||||||
you're not actually testing the model performance. If all you need is a `Doc`
|
|
||||||
object with annotations like heads, POS tags or the dependency parse, you can
|
|
||||||
use the `Doc` constructor to construct it manually.
|
|
||||||
|
|
||||||
📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
|
📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
|
||||||
|
|
||||||
|
@ -438,7 +429,7 @@ simply click on the "Suggest edits" button at the bottom of a page.
|
||||||
## Publishing spaCy extensions and plugins
|
## Publishing spaCy extensions and plugins
|
||||||
|
|
||||||
We're very excited about all the new possibilities for **community extensions**
|
We're very excited about all the new possibilities for **community extensions**
|
||||||
and plugins in spaCy v2.0, and we can't wait to see what you build with it!
|
and plugins in spaCy v3.0, and we can't wait to see what you build with it!
|
||||||
|
|
||||||
- An extension or plugin should add substantial functionality, be
|
- An extension or plugin should add substantial functionality, be
|
||||||
**well-documented** and **open-source**. It should be available for users to download
|
**well-documented** and **open-source**. It should be available for users to download
|
||||||
|
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
||||||
The MIT License (MIT)
|
The MIT License (MIT)
|
||||||
|
|
||||||
Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
|
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
@ -1,11 +1,8 @@
|
||||||
recursive-include include *.h
|
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
|
||||||
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
|
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.md
|
include README.md
|
||||||
include pyproject.toml
|
include pyproject.toml
|
||||||
include spacy/py.typed
|
include spacy/py.typed
|
||||||
recursive-exclude spacy/lang *.json
|
recursive-include spacy/cli *.yml
|
||||||
recursive-include spacy/lang *.json.gz
|
|
||||||
recursive-include spacy/cli *.json *.yml
|
|
||||||
recursive-include licenses *
|
recursive-include licenses *
|
||||||
recursive-exclude spacy *.cpp
|
recursive-exclude spacy *.cpp
|
||||||
|
|
|
@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
|
||||||
model packaging, deployment and workflow management. spaCy is commercial
|
model packaging, deployment and workflow management. spaCy is commercial
|
||||||
open-source software, released under the MIT license.
|
open-source software, released under the MIT license.
|
||||||
|
|
||||||
💫 **Version 3.0 out now!**
|
💫 **Version 3.2 out now!**
|
||||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||||
|
|
||||||
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||||
|
|
|
@ -14,15 +14,16 @@ trigger:
|
||||||
pr:
|
pr:
|
||||||
paths:
|
paths:
|
||||||
exclude:
|
exclude:
|
||||||
- "website/*"
|
|
||||||
- "*.md"
|
- "*.md"
|
||||||
|
- "website/docs/*"
|
||||||
|
- "website/src/*"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||||
# defined in .flake8 and overwrites the selected codes.
|
# defined in .flake8 and overwrites the selected codes.
|
||||||
- job: "Validate"
|
- job: "Validate"
|
||||||
pool:
|
pool:
|
||||||
vmImage: "ubuntu-18.04"
|
vmImage: "ubuntu-latest"
|
||||||
steps:
|
steps:
|
||||||
- task: UsePythonVersion@0
|
- task: UsePythonVersion@0
|
||||||
inputs:
|
inputs:
|
||||||
|
@ -38,41 +39,50 @@ jobs:
|
||||||
matrix:
|
matrix:
|
||||||
# We're only running one platform per Python version to speed up builds
|
# We're only running one platform per Python version to speed up builds
|
||||||
Python36Linux:
|
Python36Linux:
|
||||||
imageName: "ubuntu-18.04"
|
imageName: "ubuntu-latest"
|
||||||
python.version: "3.6"
|
python.version: "3.6"
|
||||||
# Python36Windows:
|
# Python36Windows:
|
||||||
# imageName: "vs2017-win2016"
|
# imageName: "windows-latest"
|
||||||
# python.version: "3.6"
|
# python.version: "3.6"
|
||||||
# Python36Mac:
|
# Python36Mac:
|
||||||
# imageName: "macos-10.14"
|
# imageName: "macos-latest"
|
||||||
# python.version: "3.6"
|
# python.version: "3.6"
|
||||||
# Python37Linux:
|
# Python37Linux:
|
||||||
# imageName: "ubuntu-18.04"
|
# imageName: "ubuntu-latest"
|
||||||
# python.version: "3.7"
|
# python.version: "3.7"
|
||||||
Python37Windows:
|
Python37Windows:
|
||||||
imageName: "vs2017-win2016"
|
imageName: "windows-latest"
|
||||||
python.version: "3.7"
|
python.version: "3.7"
|
||||||
# Python37Mac:
|
# Python37Mac:
|
||||||
# imageName: "macos-10.14"
|
# imageName: "macos-latest"
|
||||||
# python.version: "3.7"
|
# python.version: "3.7"
|
||||||
# Python38Linux:
|
# Python38Linux:
|
||||||
# imageName: "ubuntu-18.04"
|
# imageName: "ubuntu-latest"
|
||||||
# python.version: "3.8"
|
# python.version: "3.8"
|
||||||
# Python38Windows:
|
# Python38Windows:
|
||||||
# imageName: "vs2017-win2016"
|
# imageName: "windows-latest"
|
||||||
# python.version: "3.8"
|
# python.version: "3.8"
|
||||||
Python38Mac:
|
Python38Mac:
|
||||||
imageName: "macos-10.14"
|
imageName: "macos-latest"
|
||||||
python.version: "3.8"
|
python.version: "3.8"
|
||||||
Python39Linux:
|
Python39Linux:
|
||||||
imageName: "ubuntu-18.04"
|
imageName: "ubuntu-latest"
|
||||||
python.version: "3.9"
|
|
||||||
Python39Windows:
|
|
||||||
imageName: "vs2017-win2016"
|
|
||||||
python.version: "3.9"
|
|
||||||
Python39Mac:
|
|
||||||
imageName: "macos-10.14"
|
|
||||||
python.version: "3.9"
|
python.version: "3.9"
|
||||||
|
# Python39Windows:
|
||||||
|
# imageName: "windows-latest"
|
||||||
|
# python.version: "3.9"
|
||||||
|
# Python39Mac:
|
||||||
|
# imageName: "macos-latest"
|
||||||
|
# python.version: "3.9"
|
||||||
|
Python310Linux:
|
||||||
|
imageName: "ubuntu-latest"
|
||||||
|
python.version: "3.10"
|
||||||
|
Python310Windows:
|
||||||
|
imageName: "windows-latest"
|
||||||
|
python.version: "3.10"
|
||||||
|
Python310Mac:
|
||||||
|
imageName: "macos-latest"
|
||||||
|
python.version: "3.10"
|
||||||
maxParallel: 4
|
maxParallel: 4
|
||||||
pool:
|
pool:
|
||||||
vmImage: $(imageName)
|
vmImage: $(imageName)
|
||||||
|
|
|
@ -2,4 +2,5 @@
|
||||||
numpy==1.15.0; python_version<='3.7'
|
numpy==1.15.0; python_version<='3.7'
|
||||||
numpy==1.17.3; python_version=='3.8'
|
numpy==1.17.3; python_version=='3.8'
|
||||||
numpy==1.19.3; python_version=='3.9'
|
numpy==1.19.3; python_version=='3.9'
|
||||||
numpy; python_version>='3.10'
|
numpy==1.21.3; python_version=='3.10'
|
||||||
|
numpy; python_version>='3.11'
|
||||||
|
|
546
extra/DEVELOPER_DOCS/Code Conventions.md
Normal file
546
extra/DEVELOPER_DOCS/Code Conventions.md
Normal file
|
@ -0,0 +1,546 @@
|
||||||
|
# Code Conventions
|
||||||
|
|
||||||
|
For a general overview of code conventions for contributors, see the [section in the contributing guide](https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md#code-conventions).
|
||||||
|
|
||||||
|
1. [Code compatibility](#code-compatibility)
|
||||||
|
2. [Auto-formatting](#auto-formatting)
|
||||||
|
3. [Linting](#linting)
|
||||||
|
4. [Documenting code](#documenting-code)
|
||||||
|
5. [Type hints](#type-hints)
|
||||||
|
6. [Structuring logic](#structuring-logic)
|
||||||
|
7. [Naming](#naming)
|
||||||
|
8. [Error handling](#error-handling)
|
||||||
|
9. [Writing tests](#writing-tests)
|
||||||
|
|
||||||
|
## Code compatibility
|
||||||
|
|
||||||
|
spaCy supports **Python 3.6** and above, so all code should be written compatible with 3.6. This means that there are certain new syntax features that we won't be able to use until we drop support for older Python versions. Some newer features provide backports that we can conditionally install for older versions, although we only want to do this if it's absolutely necessary. If we need to use conditional imports based on the Python version or other custom compatibility-specific helpers, those should live in `compat.py`.
|
||||||
|
|
||||||
|
## Auto-formatting
|
||||||
|
|
||||||
|
spaCy uses `black` for auto-formatting (which is also available as a pre-commit hook). It's recommended to configure your editor to perform this automatically, either triggered manually or whenever you save a file. We also have a GitHub action that regularly formats the code base and submits a PR if changes are available. Note that auto-formatting is currently only available for `.py` (Python) files, not for `.pyx` (Cython).
|
||||||
|
|
||||||
|
As a rule of thumb, if the auto-formatting produces output that looks messy, it can often indicate that there's a better way to structure the code to make it more concise.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")(
|
||||||
|
- min_size=1, max_size=3
|
||||||
|
- )
|
||||||
|
+ suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1")
|
||||||
|
+ range_suggester = suggester_factory(min_size=1, max_size=3)
|
||||||
|
```
|
||||||
|
|
||||||
|
In some specific cases, e.g. in the tests, it can make sense to disable auto-formatting for a specific block. You can do this by wrapping the code in `# fmt: off` and `# fmt: on`:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ # fmt: off
|
||||||
|
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
||||||
|
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
||||||
|
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
||||||
|
"poss", "nsubj", "ccomp", "punct"]
|
||||||
|
+ # fmt: on
|
||||||
|
```
|
||||||
|
|
||||||
|
## Linting
|
||||||
|
|
||||||
|
[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code style. It scans one or more files and outputs errors and warnings. This feedback can help you stick to general standards and conventions, and can be very useful for spotting potential mistakes and inconsistencies in your code. Code you write should be compatible with our flake8 rules and not cause any warnings.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
flake8 spacy
|
||||||
|
```
|
||||||
|
|
||||||
|
The most common problems surfaced by linting are:
|
||||||
|
|
||||||
|
- **Trailing or missing whitespace.** This is related to formatting and should be fixed automatically by running `black`.
|
||||||
|
- **Unused imports.** Those should be removed if the imports aren't actually used. If they're required, e.g. to expose them so they can be imported from the given module, you can add a comment and `# noqa: F401` exception (see details below).
|
||||||
|
- **Unused variables.** This can often indicate bugs, e.g. a variable that's declared and not correctly passed on or returned. To prevent ambiguity here, your code shouldn't contain unused variables. If you're unpacking a list of tuples and end up with variables you don't need, you can call them `_` to indicate that they're unused.
|
||||||
|
- **Redefinition of function.** This can also indicate bugs, e.g. a copy-pasted function that you forgot to rename and that now replaces the original function.
|
||||||
|
- **Repeated dictionary keys.** This either indicates a bug or unnecessary duplication.
|
||||||
|
- **Comparison with `True`, `False`, `None`**. This is mostly a stylistic thing: when checking whether a value is `True`, `False` or `None`, you should be using `is` instead of `==`. For example, `if value is None`.
|
||||||
|
|
||||||
|
### Ignoring linter rules for special cases
|
||||||
|
|
||||||
|
To ignore a given line, you can add a comment like `# noqa: F401`, specifying the code of the error or warning we want to ignore. It's also possible to ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. In general, you should always **specify the code(s)** you want to ignore – otherwise, you may end up missing actual problems.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# The imported class isn't used in this file, but imported here, so it can be
|
||||||
|
# imported *from* here by another module.
|
||||||
|
from .submodule import SomeClass # noqa: F401
|
||||||
|
|
||||||
|
try:
|
||||||
|
do_something()
|
||||||
|
except: # noqa: E722
|
||||||
|
# This bare except is justified, for some specific reason
|
||||||
|
do_something_else()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Documenting code
|
||||||
|
|
||||||
|
All functions and methods you write should be documented with a docstring inline. The docstring can contain a simple summary, and an overview of the arguments and their (simplified) types. Modern editors will show this information to users when they call the function or method in their code.
|
||||||
|
|
||||||
|
If it's part of the public API and there's a documentation section available, we usually add the link as `DOCS:` at the end. This allows us to keep the docstrings simple and concise, while also providing additional information and examples if necessary.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def has_pipe(self, name: str) -> bool:
|
||||||
|
"""Check if a component name is present in the pipeline. Equivalent to
|
||||||
|
`name in nlp.pipe_names`.
|
||||||
|
|
||||||
|
name (str): Name of the component.
|
||||||
|
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/language#has_pipe
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
We specifically chose this approach of maintaining the docstrings and API reference separately, instead of auto-generating the API docs from the docstrings like other packages do. We want to be able to provide extensive explanations and examples in the documentation and use our own custom markup for it that would otherwise clog up the docstrings. We also want to be able to update the documentation independently of the code base. It's slightly more work, but it's absolutely worth it in terms of user and developer experience.
|
||||||
|
|
||||||
|
### Inline code comments
|
||||||
|
|
||||||
|
We don't expect you to add inline comments for everything you're doing – this should be obvious from reading the code. If it's not, the first thing to check is whether your code can be improved to make it more explicit. That said, if your code includes complex logic or aspects that may be unintuitive at first glance (or even included a subtle bug that you ended up fixing), you should leave a quick comment that provides more context.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
token_index = indices[value]
|
||||||
|
+ # Index describes Token.i of last token but Span indices are inclusive
|
||||||
|
span = doc[prev_token_index:token_index + 1]
|
||||||
|
```
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ # To create the components we need to use the final interpolated config
|
||||||
|
+ # so all values are available (if component configs use variables).
|
||||||
|
+ # Later we replace the component config with the raw config again.
|
||||||
|
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||||||
|
```
|
||||||
|
|
||||||
|
Don't be shy about including comments for tricky parts that _you_ found hard to implement or get right – those may come in handy for the next person working on this code, or even future you!
|
||||||
|
|
||||||
|
If your change implements a fix to a specific issue, it can often be helpful to include the issue number in the comment, especially if it's a relatively straightforward adjustment:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ # Ensure object is a Span, not a Doc (#1234)
|
||||||
|
if isinstance(obj, Doc):
|
||||||
|
obj = obj[obj.start:obj.end]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Including TODOs
|
||||||
|
|
||||||
|
It's fine to include code comments that indicate future TODOs, using the `TODO:` prefix. Modern editors typically format this in a different color, so it's easy to spot. TODOs don't necessarily have to be things that are absolutely critical to fix right now – those should already be addressed in your pull request once it's ready for review. But they can include notes about potential future improvements.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ # TODO: this is currently pretty slow
|
||||||
|
dir_checksum = hashlib.md5()
|
||||||
|
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||||
|
dir_checksum.update(sub_file.read_bytes())
|
||||||
|
```
|
||||||
|
|
||||||
|
If any of the TODOs you've added are important and should be fixed soon, you should add a task for this on Explosion's internal Ora board or an issue on the public issue tracker to make sure we don't forget to address it.
|
||||||
|
|
||||||
|
## Type hints
|
||||||
|
|
||||||
|
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
|
||||||
|
|
||||||
|
If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return value.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- def func(some_arg: dict) -> None:
|
||||||
|
+ def func(some_arg: Dict[str, Any]) -> None:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
|
||||||
|
def callback(arg1: str, arg2: int) -> List[str]:
|
||||||
|
...
|
||||||
|
|
||||||
|
return callback
|
||||||
|
```
|
||||||
|
|
||||||
|
For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
|
||||||
|
|
||||||
|
```python
|
||||||
|
def build_tagger_model(
|
||||||
|
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
|
||||||
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
If you need to use a type hint that refers to something later declared in the same module, or the class that a method belongs to, you can use a string value instead:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class SomeClass:
|
||||||
|
def from_bytes(self, data: bytes) -> "SomeClass":
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
In some cases, you won't be able to import a class from a different module to use it as a type hint because it'd cause circular imports. For instance, `spacy/util.py` includes various helper functions that return an instance of `Language`, but we couldn't import it, because `spacy/language.py` imports `util` itself. In this case, we can provide `"Language"` as a string and make the import conditional on `typing.TYPE_CHECKING` so it only runs when the code is evaluated by a type checker:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .language import Language
|
||||||
|
|
||||||
|
def load_model(name: str) -> "Language":
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Structuring logic
|
||||||
|
|
||||||
|
### Positional and keyword arguments
|
||||||
|
|
||||||
|
We generally try to avoid writing functions and methods with too many arguments, and use keyword-only arguments wherever possible. Python lets you define arguments as keyword-only by separating them with a `, *`. If you're writing functions with additional arguments that customize the behavior, you typically want to make those arguments keyword-only, so their names have to be provided explicitly.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- def do_something(name: str, validate: bool = False):
|
||||||
|
+ def do_something(name: str, *, validate: bool = False):
|
||||||
|
...
|
||||||
|
|
||||||
|
- do_something("some_name", True)
|
||||||
|
+ do_something("some_name", validate=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
This makes the function calls easier to read, because it's immediately clear what the additional values mean. It also makes it easier to extend arguments or change their order later on, because you don't end up with any function calls that depend on a specific positional order.
|
||||||
|
|
||||||
|
### Avoid mutable default arguments
|
||||||
|
|
||||||
|
A common Python gotcha are [mutable default arguments](https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments): if your argument defines a mutable default value like `[]` or `{}` and then goes and mutates it, the default value is created _once_ when the function is created and the same object is then mutated every time the function is called. This can be pretty unintuitive when you first encounter it. We therefore avoid writing logic that does this.
|
||||||
|
|
||||||
|
If your arguments need to default to an empty list or dict, you can use the `SimpleFrozenList` and `SimpleFrozenDict` helpers provided by spaCy. They are simple frozen implementations that raise an error if they're being mutated to prevent bugs and logic that accidentally mutates default arguments.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- def to_bytes(self, *, exclude: List[str] = []):
|
||||||
|
+ def to_bytes(self, *, exclude: List[str] = SimpleFrozenList()):
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
```diff
|
||||||
|
def do_something(values: List[str] = SimpleFrozenList()):
|
||||||
|
if some_condition:
|
||||||
|
- values.append("foo") # raises an error
|
||||||
|
+ values = [*values, "foo"]
|
||||||
|
return values
|
||||||
|
```
|
||||||
|
|
||||||
|
### Don't use `try`/`except` for control flow
|
||||||
|
|
||||||
|
We strongly discourage using `try`/`except` blocks for anything that's not third-party error handling or error handling that we otherwise have little control over. There's typically always a way to anticipate the _actual_ problem and **check for it explicitly**, which makes the code easier to follow and understand, and prevents bugs:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- try:
|
||||||
|
- token = doc[i]
|
||||||
|
- except IndexError:
|
||||||
|
- token = doc[-1]
|
||||||
|
|
||||||
|
+ if i < len(doc):
|
||||||
|
+ token = doc[i]
|
||||||
|
+ else:
|
||||||
|
+ token = doc[-1]
|
||||||
|
```
|
||||||
|
|
||||||
|
Even if you end up having to check for multiple conditions explicitly, this is still preferred over a catch-all `try`/`except`. It can be very helpful to think about the exact scenarios you need to cover, and what could go wrong at each step, which often leads to better code and fewer bugs. `try/except` blocks can also easily mask _other_ bugs and problems that raise the same errors you're catching, which is obviously bad.
|
||||||
|
|
||||||
|
If you have to use `try`/`except`, make sure to only include what's **absolutely necessary** in the `try` block and define the exception(s) explicitly. Otherwise, you may end up masking very different exceptions caused by other bugs.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- try:
|
||||||
|
- value1 = get_some_value()
|
||||||
|
- value2 = get_some_other_value()
|
||||||
|
- score = external_library.compute_some_score(value1, value2)
|
||||||
|
- except:
|
||||||
|
- score = 0.0
|
||||||
|
|
||||||
|
+ value1 = get_some_value()
|
||||||
|
+ value2 = get_some_other_value()
|
||||||
|
+ try:
|
||||||
|
+ score = external_library.compute_some_score(value1, value2)
|
||||||
|
+ except ValueError:
|
||||||
|
+ score = 0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
### Avoid lambda functions
|
||||||
|
|
||||||
|
`lambda` functions can be useful for defining simple anonymous functions in a single line, but they also introduce problems: for instance, they require [additional logic](https://stackoverflow.com/questions/25348532/can-python-pickle-lambda-functions) in order to be pickled and are pretty ugly to type-annotate. So we typically avoid them in the code base and only use them in the serialization handlers and within tests for simplicity. Instead of `lambda`s, check if your code can be refactored to not need them, or use helper functions instead.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- split_string: Callable[[str], List[str]] = lambda value: [v.strip() for v in value.split(",")]
|
||||||
|
|
||||||
|
+ def split_string(value: str) -> List[str]:
|
||||||
|
+ return [v.strip() for v in value.split(",")]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Iteration and comprehensions
|
||||||
|
|
||||||
|
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- filtered = filter(lambda x: x in ["foo", "bar"], values)
|
||||||
|
+ filtered = (x for x in values if x in ["foo", "bar"])
|
||||||
|
- filtered = list(filter(lambda x: x in ["foo", "bar"], values))
|
||||||
|
+ filtered = [x for x in values if x in ["foo", "bar"]]
|
||||||
|
|
||||||
|
- result = map(lambda x: { x: x in ["foo", "bar"]}, values)
|
||||||
|
+ result = ({x: x in ["foo", "bar"]} for x in values)
|
||||||
|
- result = list(map(lambda x: { x: x in ["foo", "bar"]}, values))
|
||||||
|
+ result = [{x: x in ["foo", "bar"]} for x in values]
|
||||||
|
```
|
||||||
|
|
||||||
|
If your logic is more complex, it's often better to write a loop instead, even if it adds more lines of code in total. The result will be much easier to follow and understand.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- result = [{"key": key, "scores": {f"{i}": score for i, score in enumerate(scores)}} for key, scores in values]
|
||||||
|
|
||||||
|
+ result = []
|
||||||
|
+ for key, scores in values:
|
||||||
|
+ scores_dict = {f"{i}": score for i, score in enumerate(scores)}
|
||||||
|
+ result.append({"key": key, "scores": scores_dict})
|
||||||
|
```
|
||||||
|
|
||||||
|
### Composition vs. inheritance
|
||||||
|
|
||||||
|
Although spaCy uses a lot of classes, **inheritance is viewed with some suspicion** — it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. Unless you're implementing a new data structure or pipeline component, you typically shouldn't have to use classes at all.
|
||||||
|
|
||||||
|
### Don't use `print`
|
||||||
|
|
||||||
|
The core library never `print`s anything. While we encourage using `print` statements for simple debugging (it's the most straightforward way of looking at what's happening), make sure to clean them up once you're ready to submit your pull request. If you want to output warnings or debugging information for users, use the respective dedicated mechanisms for this instead (see sections on warnings and logging for details).
|
||||||
|
|
||||||
|
The only exceptions are the CLI functions, which pretty-print messages for the user, and methods that are explicitly intended for printing things, e.g. `Language.analyze_pipes` with `pretty=True` enabled. For this, we use our lightweight helper library [`wasabi`](https://github.com/ines/wasabi).
|
||||||
|
|
||||||
|
## Naming
|
||||||
|
|
||||||
|
Naming is hard and often a topic of long internal discussions. We don't expect you to come up with the perfect names for everything you write – finding the right names is often an iterative and collaborative process. That said, we do try to follow some basic conventions.
|
||||||
|
|
||||||
|
Consistent with general Python conventions, we use `CamelCase` for class names including dataclasses, `snake_case` for methods, functions and variables, and `UPPER_SNAKE_CASE` for constants, typically defined at the top of a module. We also avoid using variable names that shadow the names of built-in functions, e.g. `input`, `help` or `list`.
|
||||||
|
|
||||||
|
### Naming variables
|
||||||
|
|
||||||
|
Variable names should always make it clear _what exactly_ the variable is and what it's used for. Instances of common classes should use the same consistent names. For example, you should avoid naming a text string (or anything else that's not a `Doc` object) `doc`. The most common class-to-variable mappings are:
|
||||||
|
|
||||||
|
| Class | Variable | Example |
|
||||||
|
| ---------- | --------------------- | ------------------------------------------- |
|
||||||
|
| `Language` | `nlp` | `nlp = spacy.blank("en")` |
|
||||||
|
| `Doc` | `doc` | `doc = nlp("Some text")` |
|
||||||
|
| `Span` | `span`, `ent`, `sent` | `span = doc[1:4]`, `ent = doc.ents[0]` |
|
||||||
|
| `Token` | `token` | `token = doc[0]` |
|
||||||
|
| `Lexeme` | `lexeme`, `lex` | `lex = nlp.vocab["foo"]` |
|
||||||
|
| `Vocab` | `vocab` | `vocab = Vocab()` |
|
||||||
|
| `Example` | `example`, `eg` | `example = Example.from_dict(doc, gold)` |
|
||||||
|
| `Config` | `config`, `cfg` | `config = Config().from_disk("config.cfg")` |
|
||||||
|
|
||||||
|
We try to avoid introducing too many temporary variables, as these clutter your namespace. It's okay to re-assign to an existing variable, but only if the value has the same type.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
ents = get_a_list_of_entities()
|
||||||
|
ents = [ent for ent in doc.ents if ent.label_ == "PERSON"]
|
||||||
|
- ents = {(ent.start, ent.end): ent.label_ for ent in ents}
|
||||||
|
+ ent_mappings = {(ent.start, ent.end): ent.label_ for ent in ents}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Naming methods and functions
|
||||||
|
|
||||||
|
Try choosing short and descriptive names wherever possible and imperative verbs for methods that do something, e.g. `disable_pipes`, `add_patterns` or `get_vector`. Private methods and functions that are not intended to be part of the user-facing API should be prefixed with an underscore `_`. It's often helpful to look at the existing classes for inspiration.
|
||||||
|
|
||||||
|
Objects that can be serialized, e.g. data structures and pipeline components, should implement the same consistent methods for serialization. Those usually include at least `to_disk`, `from_disk`, `to_bytes` and `from_bytes`. Some objects can also implement more specific methods like `{to/from}_dict` or `{to/from}_str`.
|
||||||
|
|
||||||
|
## Error handling
|
||||||
|
|
||||||
|
We always encourage writing helpful and detailed custom error messages for everything we can anticipate going wrong, and including as much detail as possible. spaCy provides a directory of error messages in `errors.py` with unique codes for each message. This allows us to keep the code base more concise and avoids long and nested blocks of texts throughout the code that disrupt the reading flow. The codes make it easy to find references to the same error in different places, and also helps identify problems reported by users (since we can just search for the error code).
|
||||||
|
|
||||||
|
Errors can be referenced via their code, e.g. `Errors.E123`. Messages can also include placeholders for values, that can be populated by formatting the string with `.format()`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class Errors:
|
||||||
|
E123 = "Something went wrong"
|
||||||
|
E456 = "Unexpected value: {value}"
|
||||||
|
```
|
||||||
|
|
||||||
|
```diff
|
||||||
|
if something_went_wrong:
|
||||||
|
- raise ValueError("Something went wrong!")
|
||||||
|
+ raise ValueError(Errors.E123)
|
||||||
|
|
||||||
|
if not isinstance(value, int):
|
||||||
|
- raise ValueError(f"Unexpected value: {value}")
|
||||||
|
+ raise ValueError(Errors.E456.format(value=value))
|
||||||
|
```
|
||||||
|
|
||||||
|
As a general rule of thumb, all error messages raised within the **core library** should be added to `Errors`. The only place where we write errors and messages as strings is `spacy.cli`, since these functions typically pretty-print and generate a lot of output that'd otherwise be very difficult to separate from the actual logic.
|
||||||
|
|
||||||
|
### Re-raising exceptions
|
||||||
|
|
||||||
|
If we anticipate possible errors in third-party code that we don't control, or our own code in a very different context, we typically try to provide custom and more specific error messages if possible. If we need to re-raise an exception within a `try`/`except` block, we can re-raise a custom exception.
|
||||||
|
|
||||||
|
[Re-raising `from`](https://docs.python.org/3/tutorial/errors.html#exception-chaining) the original caught exception lets us chain the exceptions, so the user sees both the original error, as well as the custom message with a note "The above exception was the direct cause of the following exception".
|
||||||
|
|
||||||
|
```diff
|
||||||
|
try:
|
||||||
|
run_third_party_code_that_might_fail()
|
||||||
|
except ValueError as e:
|
||||||
|
+ raise ValueError(Errors.E123) from e
|
||||||
|
```
|
||||||
|
|
||||||
|
In some cases, it makes sense to suppress the original exception, e.g. if we know what it is and know that it's not particularly helpful. In that case, we can raise `from None`. This prevents clogging up the user's terminal with multiple and irrelevant chained exceptions.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
try:
|
||||||
|
run_our_own_code_that_might_fail_confusingly()
|
||||||
|
except ValueError:
|
||||||
|
+ raise ValueError(Errors.E123) from None
|
||||||
|
```
|
||||||
|
|
||||||
|
### Avoid using naked `assert`
|
||||||
|
|
||||||
|
During development, it can sometimes be helpful to add `assert` statements throughout your code to make sure that the values you're working with are what you expect. However, as you clean up your code, those should either be removed or replaced by more explicit error handling:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- assert score >= 0.0
|
||||||
|
+ if score < 0.0:
|
||||||
|
+ raise ValueError(Errors.E789.format(score=score))
|
||||||
|
```
|
||||||
|
|
||||||
|
Otherwise, the user will get to see a naked `AssertionError` with no further explanation, which is very unhelpful. Instead of adding an error message to `assert`, it's always better to `raise` more explicit errors for specific conditions. If you're checking for something that _has to be right_ and would otherwise be a bug in spaCy, you can express this in the error message:
|
||||||
|
|
||||||
|
```python
|
||||||
|
E161 = ("Found an internal inconsistency when predicting entity links. "
|
||||||
|
"This is likely a bug in spaCy, so feel free to open an issue: "
|
||||||
|
"https://github.com/explosion/spaCy/issues")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Warnings
|
||||||
|
|
||||||
|
Instead of raising an error, some parts of the code base can raise warnings to notify the user of a potential problem. This is done using Python's `warnings.warn` and the messages defined in `Warnings` in the `errors.py`. Whether or not warnings are shown can be controlled by the user, including custom filters for disabling specific warnings using a regular expression matching our internal codes, e.g. `W123`.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- print("Warning: No examples provided for validation")
|
||||||
|
+ warnings.warn(Warnings.W123)
|
||||||
|
```
|
||||||
|
|
||||||
|
When adding warnings, make sure you're not calling `warnings.warn` repeatedly, e.g. in a loop, which will clog up the terminal output. Instead, you can collect the potential problems first and then raise a single warning. If the problem is critical, consider raising an error instead.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ n_empty = 0
|
||||||
|
for spans in lots_of_annotations:
|
||||||
|
if len(spans) == 0:
|
||||||
|
- warnings.warn(Warnings.W456)
|
||||||
|
+ n_empty += 1
|
||||||
|
+ warnings.warn(Warnings.W456.format(count=n_empty))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Logging
|
||||||
|
|
||||||
|
Log statements can be added via spaCy's `logger`, which uses Python's native `logging` module under the hood. We generally only use logging for debugging information that **the user may choose to see** in debugging mode or that's **relevant during training** but not at runtime.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ logger.info("Set up nlp object from config")
|
||||||
|
config = nlp.config.interpolate()
|
||||||
|
```
|
||||||
|
|
||||||
|
`spacy train` and similar CLI commands will enable all log statements of level `INFO` by default (which is not the case at runtime). This allows outputting specific information within certain parts of the core library during training, without having it shown at runtime. `DEBUG`-level logs are only shown if the user enables `--verbose` logging during training. They can be used to provide more specific and potentially more verbose details, especially in areas that can indicate bugs or problems, or to surface more details about what spaCy does under the hood. You should only use logging statements if absolutely necessary and important.
|
||||||
|
|
||||||
|
## Writing tests
|
||||||
|
|
||||||
|
spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests for spaCy modules and classes live in their own directories of the same name and all test files should be prefixed with `test_`. Tests included in the core library only cover the code and do not depend on any trained pipelines. When implementing a new feature or fixing a bug, it's usually good to start by writing some tests that describe what _should_ happen. As you write your code, you can then keep running the relevant tests until all of them pass.
|
||||||
|
|
||||||
|
### Test suite structure
|
||||||
|
|
||||||
|
When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`.
|
||||||
|
|
||||||
|
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
|
||||||
|
|
||||||
|
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
|
||||||
|
|
||||||
|
### Constructing objects and state
|
||||||
|
|
||||||
|
Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
|
||||||
|
|
||||||
|
Tests should focus on exactly what they're testing and avoid dependencies on other unrelated library functionality wherever possible. If all your test needs is a `Doc` object with certain annotations set, you should always construct it manually:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_doc_creation_with_pos():
|
||||||
|
doc = Doc(Vocab(), words=["hello", "world"], pos=["NOUN", "VERB"])
|
||||||
|
assert doc[0].pos_ == "NOUN"
|
||||||
|
assert doc[1].pos_ == "VERB"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parametrizing tests
|
||||||
|
|
||||||
|
If you need to run the same test function over different input examples, you usually want to parametrize the test cases instead of using a loop within your test. This lets you keep a better separation between test cases and test logic, and it'll result in more useful output because `pytest` will be able to tell you which exact test case failed.
|
||||||
|
|
||||||
|
The `@pytest.mark.parametrize` decorator takes two arguments: a string defining one or more comma-separated arguments that should be passed to the test function and a list of corresponding test cases (or a list of tuples to provide multiple arguments).
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.mark.parametrize("words", [["hello", "world"], ["this", "is", "a", "test"]])
|
||||||
|
def test_doc_length(words):
|
||||||
|
doc = Doc(Vocab(), words=words)
|
||||||
|
assert len(doc) == len(words)
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.mark.parametrize("text,expected_len", [("hello world", 2), ("I can't!", 4)])
|
||||||
|
def test_token_length(en_tokenizer, text, expected_len): # en_tokenizer is a fixture
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
assert len(doc) == expected_len
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also stack `@pytest.mark.parametrize` decorators, although this is not recommended unless it's absolutely needed or required for the test. When stacking decorators, keep in mind that this will run the test with all possible combinations of the respective parametrized values, which is often not what you want and can slow down the test suite.
|
||||||
|
|
||||||
|
### Handling failing tests
|
||||||
|
|
||||||
|
`xfail` means that a test **should pass but currently fails**, i.e. is expected to fail. You can mark a test as currently xfailing by adding the `@pytest.mark.xfail` decorator. This should only be used for tests that don't yet work, not for logic that causes errors we raise on purpose (see the section on testing errors for this). It's often very helpful to implement tests for edge cases that we don't yet cover and mark them as `xfail`. You can also provide a `reason` keyword argument to the decorator with an explanation of why the test currently fails.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ @pytest.mark.xfail(reason="Issue #225 - not yet implemented")
|
||||||
|
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
|
||||||
|
doc = en_tokenizer("Will this road take me to Puddleton?\u2014No.")
|
||||||
|
assert doc[8].text == "\u2014"
|
||||||
|
```
|
||||||
|
|
||||||
|
When you run the test suite, you may come across tests that are reported as `xpass`. This means that they're marked as `xfail` but didn't actually fail. This is worth looking into: sometimes, it can mean that we have since fixed a bug that caused the test to previously fail, so we can remove the decorator. In other cases, especially when it comes to machine learning model implementations, it can also indicate that the **test is flaky**: it sometimes passes and sometimes fails. This can be caused by a bug, or by constraints being too narrowly defined. If a test shows different behavior depending on whether it's run in isolation or not, this can indicate that it reacts to global state set in a previous test, which is not ideal and should be avoided.
|
||||||
|
|
||||||
|
### Writing slow tests
|
||||||
|
|
||||||
|
If a test is useful but potentially quite slow, you can mark it with the `@pytest.mark.slow` decorator. This is a special marker we introduced and tests decorated with it only run if you run the test suite with `--slow`, but not as part of the main CI process. Before introducing a slow test, double-check that there isn't another and more efficient way to test for the behavior. You should also consider adding a simpler test with maybe only a subset of the test cases that can always run, so we at least have some coverage.
|
||||||
|
|
||||||
|
### Skipping tests
|
||||||
|
|
||||||
|
The `@pytest.mark.skip` decorator lets you skip tests entirely. You only want to do this for failing tests that may be slow to run or cause memory errors or segfaults, which would otherwise terminate the entire process and wouldn't be caught by `xfail`. We also sometimes use the `skip` decorator for old and outdated regression tests that we want to keep around but that don't apply anymore. When using the `skip` decorator, make sure to provide the `reason` keyword argument with a quick explanation of why you chose to skip this test.
|
||||||
|
|
||||||
|
### Testing errors and warnings
|
||||||
|
|
||||||
|
`pytest` lets you check whether a given error is raised by using the `pytest.raises` contextmanager. This is very useful when implementing custom error handling, so make sure you're not only testing for the correct behavior but also for errors resulting from incorrect inputs. If you're testing errors, you should always check for `pytest.raises` explicitly and not use `xfail`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
words = ["a", "b", "c", "d", "e"]
|
||||||
|
ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
Doc(Vocab(), words=words, ents=ents)
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also use the `pytest.warns` contextmanager to check that a given warning type is raised. The first argument is the warning type or `None` (which will capture a list of warnings that you can `assert` is empty).
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_phrase_matcher_validation(en_vocab):
|
||||||
|
doc1 = Doc(en_vocab, words=["Test"], deps=["ROOT"])
|
||||||
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
|
matcher = PhraseMatcher(en_vocab, validate=True)
|
||||||
|
with pytest.warns(UserWarning):
|
||||||
|
# Warn about unnecessarily parsed document
|
||||||
|
matcher.add("TEST1", [doc1])
|
||||||
|
with pytest.warns(None) as record:
|
||||||
|
matcher.add("TEST2", [doc2])
|
||||||
|
assert not record.list
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep in mind that your tests will fail if you're using the `pytest.warns` contextmanager with a given warning and the warning is _not_ shown. So you should only use it to check that spaCy handles and outputs warnings correctly. If your test outputs a warning that's expected but not relevant to what you're testing, you can use the `@pytest.mark.filterwarnings` decorator and ignore specific warnings starting with a given code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.mark.filterwarnings("ignore:\\[W036")
|
||||||
|
def test_matcher_empty(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher(Doc(en_vocab, words=["test"]))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing trained pipelines
|
||||||
|
|
||||||
|
Our regular test suite does not depend on any of the trained pipelines, since their outputs can vary and aren't generally required to test the library functionality. We test pipelines separately using the tests included in the [`spacy-models`](https://github.com/explosion/spacy-models) repository, which run whenever we train a new suite of models. The tests here mostly focus on making sure that the packages can be loaded and that the predictions seem reasonable, and they include checks for common bugs we encountered previously. If your test does not primarily focus on verifying a model's predictions, it should be part of the core library tests and construct the required objects manually, instead of being added to the models tests.
|
||||||
|
|
||||||
|
Keep in mind that specific predictions may change, and we can't test for all incorrect predictions reported by users. Different models make different mistakes, so even a model that's significantly more accurate overall may end up making wrong predictions that it previously didn't. However, some surprising incorrect predictions may indicate deeper bugs that we definitely want to investigate.
|
150
extra/DEVELOPER_DOCS/Language.md
Normal file
150
extra/DEVELOPER_DOCS/Language.md
Normal file
|
@ -0,0 +1,150 @@
|
||||||
|
# Language
|
||||||
|
|
||||||
|
> Reference: `spacy/language.py`
|
||||||
|
|
||||||
|
1. [Constructing the `nlp` object from a config](#1-constructing-the-nlp-object-from-a-config)
|
||||||
|
- [A. Overview of `Language.from_config`](#1a-overview)
|
||||||
|
- [B. Component factories](#1b-how-pipeline-component-factories-work-in-the-config)
|
||||||
|
- [C. Sourcing a component](#1c-sourcing-a-pipeline-component)
|
||||||
|
- [D. Tracking components as they're modified](#1d-tracking-components-as-theyre-modified)
|
||||||
|
- [E. spaCy's config utility function](#1e-spacys-config-utility-functions)
|
||||||
|
2. [Initialization](#initialization)
|
||||||
|
- [A. Initialization for training](#2a-initialization-for-training): `init_nlp`
|
||||||
|
- [B. Initializing the `nlp` object](#2b-initializing-the-nlp-object): `Language.initialize`
|
||||||
|
- [C. Initializing the vocab](#2c-initializing-the-vocab): `init_vocab`
|
||||||
|
|
||||||
|
## 1. Constructing the `nlp` object from a config
|
||||||
|
|
||||||
|
### 1A. Overview
|
||||||
|
|
||||||
|
Most of the functions referenced in the config are regular functions with arbitrary arguments registered via the function registry. However, the pipeline components are a bit special: they don't only receive arguments passed in via the config file, but also the current `nlp` object and the string `name` of the individual component instance (so a user can have multiple components created with the same factory, e.g. `ner_one` and `ner_two`). This name can then be used by the components to add to the losses and scores. This special requirement means that pipeline components can't just be resolved via the config the "normal" way: we need to retrieve the component functions manually and pass them their arguments, plus the `nlp` and `name`.
|
||||||
|
|
||||||
|
The `Language.from_config` classmethod takes care of constructing the `nlp` object from a config. It's the single place where this happens and what `spacy.load` delegates to under the hood. Its main responsibilities are:
|
||||||
|
|
||||||
|
- **Load and validate the config**, and optionally **auto-fill** all missing values that we either have defaults for in the config template or that registered function arguments define defaults for. This helps ensure backwards-compatibility, because we're able to add a new argument `foo: str = "bar"` to an existing function, without breaking configs that don't specify it.
|
||||||
|
- **Execute relevant callbacks** for pipeline creation, e.g. optional functions called before and after creation of the `nlp` object and pipeline.
|
||||||
|
- **Initialize language subclass and create tokenizer**. The `from_config` classmethod will always be called on a language subclass, e.g. `English`, not on `Language` directly. Initializing the subclass takes a callback to create the tokenizer.
|
||||||
|
- **Set up the pipeline components**. Components can either refer to a component factory or a `source`, i.e. an existing pipeline that's loaded and that the component is then copied from. We also need to ensure that we update the information about which components are disabled.
|
||||||
|
- **Manage listeners.** If sourced components "listen" to other components (`tok2vec`, `transformer`), we need to ensure that the references are valid. If the config specifies that listeners should be replaced by copies (e.g. to give the `ner` component its own `tok2vec` model instead of listening to the shared `tok2vec` component in the pipeline), we also need to take care of that.
|
||||||
|
|
||||||
|
Note that we only resolve and load **selected sections** in `Language.from_config`, i.e. only the parts that are relevant at runtime, which is `[nlp]` and `[components]`. We don't want to be resolving anything related to training or initialization, since this would mean loading and constructing unnecessary functions, including functions that require information that isn't necessarily available at runtime, like `paths.train`.
|
||||||
|
|
||||||
|
### 1B. How pipeline component factories work in the config
|
||||||
|
|
||||||
|
As opposed to regular registered functions that refer to a registry and function name (e.g. `"@misc": "foo.v1"`), pipeline components follow a different format and refer to their component `factory` name. This corresponds to the name defined via the `@Language.component` or `@Language.factory` decorator. We need this decorator to define additional meta information for the components, like their default config and score weights.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[components.my_component]
|
||||||
|
factory = "foo"
|
||||||
|
some_arg = "bar"
|
||||||
|
other_arg = ${paths.some_path}
|
||||||
|
```
|
||||||
|
|
||||||
|
This means that we need to create and resolve the `config["components"]` separately from the rest of the config. There are some important considerations and things we need to manage explicitly to avoid unexpected behavior:
|
||||||
|
|
||||||
|
#### Variable interpolation
|
||||||
|
|
||||||
|
When a config is resolved, references to variables are replaced, so that the functions receive the correct value instead of just the variable name. To interpolate a config, we need it in its entirety: we couldn't just interpolate a subsection that refers to variables defined in a different subsection. So we first interpolate the entire config.
|
||||||
|
|
||||||
|
However, the `nlp.config` should include the original config with variables intact – otherwise, loading a pipeline and saving it to disk will destroy all logic implemented via variables and hard-code the values all over the place. This means that when we create the components, we need to keep two versions of the config: the interpolated config with the "real" values and the `raw_config` including the variable references.
|
||||||
|
|
||||||
|
#### Factory registry
|
||||||
|
|
||||||
|
Component factories are special and use the `@Language.factory` or `@Language.component` decorator to register themselves and their meta. When the decorator runs, it performs some basic validation, stores the meta information for the factory on the `Language` class (default config, scores etc.) and then adds the factory function to `registry.factories`. The `component` decorator can be used for registering simple functions that just take a `Doc` object and return it so in that case, we create the factory for the user automatically.
|
||||||
|
|
||||||
|
There's one important detail to note about how factories are registered via entry points: A package that wants to expose spaCy components still needs to register them via the `@Language` decorators so we have the component meta information and can perform required checks. All we care about here is that the decorated function is **loaded and imported**. When it is, the `@Language` decorator takes care of everything, including actually registering the component factory.
|
||||||
|
|
||||||
|
Normally, adding to the registry via an entry point will just add the function to the registry under the given name. But for `spacy_factories`, we don't actually want that: all we care about is that the function decorated with `@Language` is imported so the decorator runs. So we only exploit Python's entry point system to automatically import the function, and the `spacy_factories` entry point group actually adds to a **separate registry**, `registry._factories`, under the hood. Its only purpose is that the functions are imported. The decorator then runs, creates the factory if needed and adds it to the `registry.factories` registry.
|
||||||
|
|
||||||
|
#### Language-specific factories
|
||||||
|
|
||||||
|
spaCy supports registering factories on the `Language` base class, as well as language-specific subclasses like `English` or `German`. This allows providing different factories depending on the language, e.g. a different default lemmatizer. The `Language.get_factory_name` classmethod constructs the factory name as `{lang}.{name}` if a language is available (i.e. if it's a subclass) and falls back to `{name}` otherwise. So `@German.factory("foo")` will add a factory `de.foo` under the hood. If you add `nlp.add_pipe("foo")`, we first check if there's a factory for `{nlp.lang}.foo` and if not, we fall back to checking for a factory `foo`.
|
||||||
|
|
||||||
|
#### Creating a pipeline component from a factory
|
||||||
|
|
||||||
|
`Language.add_pipe` takes care of adding a pipeline component, given its factory name and its config. If no source pipeline to copy the component from is provided, it delegates to `Language.create_pipe`, which sets up the actual component function.
|
||||||
|
|
||||||
|
- Validate the config and make sure that the factory was registered via the decorator and that we have meta for it.
|
||||||
|
- Update the component config with any defaults specified by the component's `default_config`, if available. This is done by merging the values we receive into the defaults. It ensures that you can still add a component without having to specify its _entire_ config including more complex settings like `model`. If no `model` is defined, we use the default.
|
||||||
|
- Check if we have a language-specific factory for the given `nlp.lang` and if not, fall back to the global factory.
|
||||||
|
- Construct the component config, consisting of whatever arguments were provided, plus the current `nlp` object and `name`, which are default expected arguments of all factories. We also add a reference to the `@factories` registry, so we can resolve the config via the registry, like any other config. With the added `nlp` and `name`, it should now include all expected arguments of the given function.
|
||||||
|
- Fill the config to make sure all unspecified defaults from the function arguments are added and update the `raw_config` (uninterpolated with variables intact) with that information, so the component config we store in `nlp.config` is up to date. We do this by adding the `raw_config` _into_ the filled config – otherwise, the references to variables would be overwritten.
|
||||||
|
- Resolve the config and create all functions it refers to (e.g. `model`). This gives us the actual component function that we can insert into the pipeline.
|
||||||
|
|
||||||
|
### 1C. Sourcing a pipeline component
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[components.ner]
|
||||||
|
source = "en_core_web_sm"
|
||||||
|
```
|
||||||
|
|
||||||
|
spaCy also allows ["sourcing" a component](https://spacy.io/usage/processing-pipelines#sourced-components), which will copy it over from an existing pipeline. In this case, `Language.add_pipe` will delegate to `Language.create_pipe_from_source`. In order to copy a component effectively and validate it, the source pipeline first needs to be loaded. This is done in `Language.from_config`, so a source pipeline only has to be loaded once if multiple components source from it. Sourcing a component will perform the following checks and modifications:
|
||||||
|
|
||||||
|
- For each sourced pipeline component loaded in `Language.from_config`, a hash of the vectors data from the source pipeline is stored in the pipeline meta so we're able to check whether the vectors match and warn if not (since different vectors that are used as features in components can lead to degraded performance). Because the vectors are not loaded at the point when components are sourced, the check is postponed to `init_vocab` as part of `Language.initialize`.
|
||||||
|
- If the sourced pipeline component is loaded through `Language.add_pipe(source=)`, the vectors are already loaded and can be compared directly. The check compares the shape and keys first and finally falls back to comparing the actual byte representation of the vectors (which is slower).
|
||||||
|
- Ensure that the component is available in the pipeline.
|
||||||
|
- Interpolate the entire config of the source pipeline so all variables are replaced and the component's config that's copied over doesn't include references to variables that are not available in the destination config.
|
||||||
|
- Add the source `vocab.strings` to the destination's `vocab.strings` so we don't end up with unavailable strings in the final pipeline (which would also include labels used by the sourced component).
|
||||||
|
|
||||||
|
Note that there may be other incompatibilities that we're currently not checking for and that could cause a sourced component to not work in the destination pipeline. We're interested in adding more checks here but there'll always be a small number of edge cases we'll never be able to catch, including a sourced component depending on other pipeline state that's not available in the destination pipeline.
|
||||||
|
|
||||||
|
### 1D. Tracking components as they're modified
|
||||||
|
|
||||||
|
The `Language` class implements methods for removing, replacing or renaming pipeline components. Whenever we make these changes, we need to update the information stored on the `Language` object to ensure that it matches the current state of the pipeline. If a user just writes to `nlp.config` manually, we obviously can't ensure that the config matches the reality – but since we offer modification via the pipe methods, it's expected that spaCy keeps the config in sync under the hood. Otherwise, saving a modified pipeline to disk and loading it back wouldn't work. The internal attributes we need to keep in sync here are:
|
||||||
|
|
||||||
|
| Attribute | Type | Description |
|
||||||
|
| ------------------------ | ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `Language._components` | `List[Tuple[str, Callable]]` | All pipeline components as `(name, func)` tuples. This is used as the source of truth for `Language.pipeline`, `Language.pipe_names` and `Language.components`. |
|
||||||
|
| `Language._pipe_meta` | `Dict[str, FactoryMeta]` | The meta information of a component's factory, keyed by component name. This can include multiple components referring to the same factory meta. |
|
||||||
|
| `Language._pipe_configs` | `Dict[str, Config]` | The component's config, keyed by component name. |
|
||||||
|
| `Language._disabled` | `Set[str]` | Names of components that are currently disabled. |
|
||||||
|
| `Language._config` | `Config` | The underlying config. This is only internals and will be used as the basis for constructing the config in the `Language.config` property. |
|
||||||
|
|
||||||
|
In addition to the actual component settings in `[components]`, the config also allows specifying component-specific arguments via the `[initialize.components]` block, which are passed to the component's `initialize` method during initialization if it's available. So we also need to keep this in sync in the underlying config.
|
||||||
|
|
||||||
|
### 1E. spaCy's config utility functions
|
||||||
|
|
||||||
|
When working with configs in spaCy, make sure to use the utility functions provided by spaCy if available, instead of calling the respective `Config` methods. The utilities take care of providing spaCy-specific error messages and ensure a consistent order of config sections by setting the `section_order` argument. This ensures that exported configs always have the same consistent format.
|
||||||
|
|
||||||
|
- `util.load_config`: load a config from a file
|
||||||
|
- `util.load_config_from_str`: load a config from a string representation
|
||||||
|
- `util.copy_config`: deepcopy a config
|
||||||
|
|
||||||
|
## 2. Initialization
|
||||||
|
|
||||||
|
Initialization is a separate step of the [config lifecycle](https://spacy.io/usage/training#config-lifecycle) that's not performed at runtime. It's implemented via the `training.initialize.init_nlp` helper and calls into `Language.initialize` method, which sets up the pipeline and component models before training. The `initialize` method takes a callback that returns a sample of examples, which is used to initialize the component models, add all required labels and perform shape inference if applicable.
|
||||||
|
|
||||||
|
Components can also define custom initialization setting via the `[initialize.components]` block, e.g. if they require external data like lookup tables to be loaded in. All config settings defined here will be passed to the component's `initialize` method, if it implements one. Components are expected to handle their own serialization after they're initialized so that any data or settings they require are saved with the pipeline and will be available from disk when the pipeline is loaded back at runtime.
|
||||||
|
|
||||||
|
### 2A. Initialization for training
|
||||||
|
|
||||||
|
The `init_nlp` function is called before training and returns an initialized `nlp` object that can be updated with the examples. It only needs the config and does the following:
|
||||||
|
|
||||||
|
- Load and validate the config. In order to validate certain settings like the `seed`, we also interpolate the config to get the final value (because in theory, a user could provide this via a variable).
|
||||||
|
- Set up the GPU allocation, if required.
|
||||||
|
- Create the `nlp` object from the raw, uninterpolated config, which delegates to `Language.from_config`. Since this method may modify and auto-fill the config and pipeline component settings, we then use the interpolated version of `nlp.config` going forward, to ensure that what we're training with is up to date.
|
||||||
|
- Resolve the `[training]` block of the config and perform validation, e.g. to check that the corpora are available.
|
||||||
|
- Determine the components that should be frozen (not updated during training) or resumed (sourced components from a different pipeline that should be updated from the examples and not reset and re-initialized). To resume training, we can call the `nlp.resume_training` method.
|
||||||
|
- Initialize the `nlp` object via `nlp.initialize` and pass it a `get_examples` callback that returns the training corpus (used for shape inference, setting up labels etc.). If the training corpus is streamed, we only provide a small sample of the data, which can potentially be infinite. `nlp.initialize` will delegate to the components as well and pass the data sample forward.
|
||||||
|
- Check the listeners and warn about components dependencies, e.g. if a frozen component listens to a component that is retrained, or vice versa (which can degrade results).
|
||||||
|
|
||||||
|
### 2B. Initializing the `nlp` object
|
||||||
|
|
||||||
|
The `Language.initialize` method does the following:
|
||||||
|
|
||||||
|
- **Resolve the config** defined in the `[initialize]` block separately (since everything else is already available in the loaded `nlp` object), based on the fully interpolated config.
|
||||||
|
- **Execute callbacks**, i.e. `before_init` and `after_init`, if they're defined.
|
||||||
|
- **Initialize the vocab**, including vocab data, lookup tables and vectors.
|
||||||
|
- **Initialize the tokenizer** if it implements an `initialize` method. This is not the case for the default tokenizers, but it allows custom tokenizers to depend on external data resources that are loaded in on initialization.
|
||||||
|
- **Initialize all pipeline components** if they implement an `initialize` method and pass them the `get_examples` callback, the current `nlp` object as well as additional initialization config settings provided in the component-specific block.
|
||||||
|
- **Initialize pretraining** if a `[pretraining]` block is available in the config. This allows loading pretrained tok2vec weights in `spacy pretrain`.
|
||||||
|
- **Register listeners** if token-to-vector embedding layers of a component model "listen" to a previous component (`tok2vec`, `transformer`) in the pipeline.
|
||||||
|
- **Create an optimizer** on the `Language` class, either by adding the optimizer passed as `sgd` to `initialize`, or by creating the optimizer defined in the config's training settings.
|
||||||
|
|
||||||
|
### 2C. Initializing the vocab
|
||||||
|
|
||||||
|
Vocab initialization is handled in the `training.initialize.init_vocab` helper. It takes the relevant loaded functions and values from the config and takes care of the following:
|
||||||
|
|
||||||
|
- Add lookup tables defined in the config initialization, e.g. custom lemmatization tables. Those will be added to `nlp.vocab.lookups` from where they can be accessed by components.
|
||||||
|
- Add JSONL-formatted [vocabulary data](https://spacy.io/api/data-formats#vocab-jsonl) to pre-populate the lexical attributes.
|
||||||
|
- Load vectors into the pipeline. Vectors are defined as a name or path to a saved `nlp` object containing the vectors, e.g. `en_vectors_web_lg`. It's loaded and the vectors are ported over, while ensuring that all source strings are available in the destination strings. We also warn if there's a mismatch between sourced vectors, since this can lead to problems.
|
220
extra/DEVELOPER_DOCS/Listeners.md
Normal file
220
extra/DEVELOPER_DOCS/Listeners.md
Normal file
|
@ -0,0 +1,220 @@
|
||||||
|
# Listeners
|
||||||
|
|
||||||
|
1. [Overview](#1-overview)
|
||||||
|
2. [Initialization](#2-initialization)
|
||||||
|
- [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
|
||||||
|
- [B. Shape inference](#2b-shape-inference)
|
||||||
|
3. [Internal communication](#3-internal-communication)
|
||||||
|
- [A. During prediction](#3a-during-prediction)
|
||||||
|
- [B. During training](#3b-during-training)
|
||||||
|
- [C. Frozen components](#3c-frozen-components)
|
||||||
|
4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
|
||||||
|
|
||||||
|
## 1. Overview
|
||||||
|
|
||||||
|
Trainable spaCy components typically use some sort of `tok2vec` layer as part of the `model` definition.
|
||||||
|
This `tok2vec` layer produces embeddings and is either a standard `Tok2Vec` layer, or a Transformer-based one.
|
||||||
|
Both versions can be used either inline/standalone, which means that they are defined and used
|
||||||
|
by only one specific component (e.g. NER), or
|
||||||
|
[shared](https://spacy.io/usage/embeddings-transformers#embedding-layers),
|
||||||
|
in which case the embedding functionality becomes a separate component that can
|
||||||
|
feed embeddings to multiple components downstream, using a listener-pattern.
|
||||||
|
|
||||||
|
| Type | Usage | Model Architecture |
|
||||||
|
| ------------- | ---------- | -------------------------------------------------------------------------------------------------- |
|
||||||
|
| `Tok2Vec` | standalone | [`spacy.Tok2Vec`](https://spacy.io/api/architectures#Tok2Vec) |
|
||||||
|
| `Tok2Vec` | listener | [`spacy.Tok2VecListener`](https://spacy.io/api/architectures#Tok2VecListener) |
|
||||||
|
| `Transformer` | standalone | [`spacy-transformers.Tok2VecTransformer`](https://spacy.io/api/architectures#Tok2VecTransformer) |
|
||||||
|
| `Transformer` | listener | [`spacy-transformers.TransformerListener`](https://spacy.io/api/architectures#TransformerListener) |
|
||||||
|
|
||||||
|
Here we discuss the listener pattern and its implementation in code in more detail.
|
||||||
|
|
||||||
|
## 2. Initialization
|
||||||
|
|
||||||
|
### 2A. Linking listeners to the embedding component
|
||||||
|
|
||||||
|
To allow sharing a `tok2vec` layer, a separate `tok2vec` component needs to be defined in the config:
|
||||||
|
|
||||||
|
```
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
```
|
||||||
|
|
||||||
|
A listener can then be set up by making sure the correct `upstream` name is defined, referring to the
|
||||||
|
name of the `tok2vec` component (which equals the factory name by default), or `*` as a wildcard:
|
||||||
|
|
||||||
|
```
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
upstream = "tok2vec"
|
||||||
|
```
|
||||||
|
|
||||||
|
When an [`nlp`](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Language.md) object is
|
||||||
|
initialized or deserialized, it will make sure to link each `tok2vec` component to its listeners. This is
|
||||||
|
implemented in the method `nlp._link_components()` which loops over each
|
||||||
|
component in the pipeline and calls `find_listeners()` on a component if it's defined.
|
||||||
|
The [`tok2vec` component](https://github.com/explosion/spaCy/blob/master/spacy/pipeline/tok2vec.py)'s implementation
|
||||||
|
of this `find_listeners()` method will specifically identify sublayers of a model definition that are of type
|
||||||
|
`Tok2VecListener` with a matching upstream name and will then add that listener to the internal `self.listener_map`.
|
||||||
|
|
||||||
|
If it's a Transformer-based pipeline, a
|
||||||
|
[`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py)
|
||||||
|
has a similar implementation but its `find_listeners()` function will specifically look for `TransformerListener`
|
||||||
|
sublayers of downstream components.
|
||||||
|
|
||||||
|
### 2B. Shape inference
|
||||||
|
|
||||||
|
Typically, the output dimension `nO` of a listener's model equals the `nO` (or `width`) of the upstream embedding layer.
|
||||||
|
For a standard `Tok2Vec`-based component, this is typically known up-front and defined as such in the config:
|
||||||
|
|
||||||
|
```
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
```
|
||||||
|
|
||||||
|
A `transformer` component however only knows its `nO` dimension after the HuggingFace transformer
|
||||||
|
is set with the function `model.attrs["set_transformer"]`,
|
||||||
|
[implemented](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py)
|
||||||
|
by `set_pytorch_transformer`.
|
||||||
|
This is why, upon linking of the transformer listeners, the `transformer` component also makes sure to set
|
||||||
|
the listener's output dimension correctly.
|
||||||
|
|
||||||
|
This shape inference mechanism also needs to happen with resumed/frozen components, which means that for some CLI
|
||||||
|
commands (`assemble` and `train`), we need to call `nlp._link_components` even before initializing the `nlp`
|
||||||
|
object. To cover all use-cases and avoid negative side effects, the code base ensures that performing the
|
||||||
|
linking twice is not harmful.
|
||||||
|
|
||||||
|
## 3. Internal communication
|
||||||
|
|
||||||
|
The internal communication between a listener and its downstream components is organized by sending and
|
||||||
|
receiving information across the components - either directly or implicitly.
|
||||||
|
The details are different depending on whether the pipeline is currently training, or predicting.
|
||||||
|
Either way, the `tok2vec` or `transformer` component always needs to run before the listener.
|
||||||
|
|
||||||
|
### 3A. During prediction
|
||||||
|
|
||||||
|
When the `Tok2Vec` pipeline component is called, its `predict()` method is executed to produce the results,
|
||||||
|
which are then stored by `set_annotations()` in the `doc.tensor` field of the document(s).
|
||||||
|
Similarly, the `Transformer` component stores the produced embeddings
|
||||||
|
in `doc._.trf_data`. Next, the `forward` pass of a
|
||||||
|
[`Tok2VecListener`](https://github.com/explosion/spaCy/blob/master/spacy/pipeline/tok2vec.py)
|
||||||
|
or a
|
||||||
|
[`TransformerListener`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/listener.py)
|
||||||
|
accesses these fields on the `Doc` directly. Both listener implementations have a fallback mechanism for when these
|
||||||
|
properties were not set on the `Doc`: in that case an all-zero tensor is produced and returned.
|
||||||
|
We need this fallback mechanism to enable shape inference methods in Thinc, but the code
|
||||||
|
is slightly risky and at times might hide another bug - so it's a good spot to be aware of.
|
||||||
|
|
||||||
|
### 3B. During training
|
||||||
|
|
||||||
|
During training, the `update()` methods of the `Tok2Vec` & `Transformer` components don't necessarily set the
|
||||||
|
annotations on the `Doc` (though since 3.1 they can if they are part of the `annotating_components` list in the config).
|
||||||
|
Instead, we rely on a caching mechanism between the original embedding component and its listener.
|
||||||
|
Specifically, the produced embeddings are sent to the listeners by calling `listener.receive()` and uniquely
|
||||||
|
identifying the batch of documents with a `batch_id`. This `receive()` call also sends the appropriate `backprop`
|
||||||
|
call to ensure that gradients from the downstream component flow back to the trainable `Tok2Vec` or `Transformer`
|
||||||
|
network.
|
||||||
|
|
||||||
|
We rely on the `nlp` object properly batching the data and sending each batch through the pipeline in sequence,
|
||||||
|
which means that only one such batch needs to be kept in memory for each listener.
|
||||||
|
When the downstream component runs and the listener should produce embeddings, it accesses the batch in memory,
|
||||||
|
runs the backpropagation, and returns the results and the gradients.
|
||||||
|
|
||||||
|
There are two ways in which this mechanism can fail, both are detected by `verify_inputs()`:
|
||||||
|
|
||||||
|
- `E953` if a different batch is in memory than the requested one - signaling some kind of out-of-sync state of the
|
||||||
|
training pipeline.
|
||||||
|
- `E954` if no batch is in memory at all - signaling that the pipeline is probably not set up correctly.
|
||||||
|
|
||||||
|
#### Training with multiple listeners
|
||||||
|
|
||||||
|
One `Tok2Vec` or `Transformer` component may be listened to by several downstream components, e.g.
|
||||||
|
a tagger and a parser could be sharing the same embeddings. In this case, we need to be careful about how we do
|
||||||
|
the backpropagation. When the `Tok2Vec` or `Transformer` sends out data to the listener with `receive()`, they will
|
||||||
|
send an `accumulate_gradient` function call to all listeners, except the last one. This function will keep track
|
||||||
|
of the gradients received so far. Only the final listener in the pipeline will get an actual `backprop` call that
|
||||||
|
will initiate the backpropagation of the `tok2vec` or `transformer` model with the accumulated gradients.
|
||||||
|
|
||||||
|
### 3C. Frozen components
|
||||||
|
|
||||||
|
The listener pattern can get particularly tricky in combination with frozen components. To detect components
|
||||||
|
with listeners that are not frozen consistently, `init_nlp()` (which is called by `spacy train`) goes through
|
||||||
|
the listeners and their upstream components and warns in two scenarios.
|
||||||
|
|
||||||
|
#### The Tok2Vec or Transformer is frozen
|
||||||
|
|
||||||
|
If the `Tok2Vec` or `Transformer` was already trained,
|
||||||
|
e.g. by [pretraining](https://spacy.io/usage/embeddings-transformers#pretraining),
|
||||||
|
it could be a valid use-case to freeze the embedding architecture and only train downstream components such
|
||||||
|
as a tagger or a parser. This used to be impossible before 3.1, but has become supported since then by putting the
|
||||||
|
embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components)
|
||||||
|
list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes.
|
||||||
|
|
||||||
|
However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related
|
||||||
|
listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`.
|
||||||
|
|
||||||
|
#### The upstream component is frozen
|
||||||
|
|
||||||
|
If an upstream component is frozen but the underlying `Tok2Vec` or `Transformer` isn't, the performance of
|
||||||
|
the upstream component will be degraded after training. In this case, a `W087` warning is shown, explaining
|
||||||
|
how to use the `replace_listeners` functionality to prevent this problem.
|
||||||
|
|
||||||
|
## 4. Replacing listener with standalone
|
||||||
|
|
||||||
|
The [`replace_listeners`](https://spacy.io/api/language#replace_listeners) functionality changes the architecture
|
||||||
|
of a downstream component from using a listener pattern to a standalone `tok2vec` or `transformer` layer,
|
||||||
|
effectively making the downstream component independent of any other components in the pipeline.
|
||||||
|
It is implemented by `nlp.replace_listeners()` and typically executed by `nlp.from_config()`.
|
||||||
|
First, it fetches the original `Model` of the original component that creates the embeddings:
|
||||||
|
|
||||||
|
```
|
||||||
|
tok2vec = self.get_pipe(tok2vec_name)
|
||||||
|
tok2vec_model = tok2vec.model
|
||||||
|
```
|
||||||
|
|
||||||
|
Which is either a [`Tok2Vec` model](https://github.com/explosion/spaCy/blob/master/spacy/ml/models/tok2vec.py) or a
|
||||||
|
[`TransformerModel`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py).
|
||||||
|
|
||||||
|
In the case of the `tok2vec`, this model can be copied as-is into the configuration and architecture of the
|
||||||
|
downstream component. However, for the `transformer`, this doesn't work.
|
||||||
|
The reason is that the `TransformerListener` architecture chains the listener with
|
||||||
|
[`trfs2arrays`](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/trfs2arrays.py):
|
||||||
|
|
||||||
|
```
|
||||||
|
model = chain(
|
||||||
|
    TransformerListener(upstream_name=upstream),
|
||||||
|
trfs2arrays(pooling, grad_factor),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
but the standalone `Tok2VecTransformer` has an additional `split_trf_batch` chained in between the model
|
||||||
|
and `trfs2arrays`:
|
||||||
|
|
||||||
|
```
|
||||||
|
model = chain(
|
||||||
|
TransformerModel(name, get_spans, tokenizer_config),
|
||||||
|
split_trf_batch(),
|
||||||
|
trfs2arrays(pooling, grad_factor),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
So you can't just take the model from the listener, and drop that into the component internally. You need to
|
||||||
|
adjust the model and the config. To facilitate this, `nlp.replace_listeners()` will check whether additional
|
||||||
|
[functions](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/_util.py) are
|
||||||
|
[defined](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/layers/transformer_model.py)
|
||||||
|
in `model.attrs`, and if so, it will essentially call these to make the appropriate changes:
|
||||||
|
|
||||||
|
```
|
||||||
|
replace_func = tok2vec_model.attrs["replace_listener_cfg"]
|
||||||
|
new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"])
|
||||||
|
...
|
||||||
|
new_model = tok2vec_model.attrs["replace_listener"](new_model)
|
||||||
|
```
|
||||||
|
|
||||||
|
The new config and model are then properly stored on the `nlp` object.
|
||||||
|
Note that this functionality (running the replacement for a transformer listener) was broken prior to
|
||||||
|
`spacy-transformers` 1.0.5.
|
7
extra/DEVELOPER_DOCS/README.md
Normal file
7
extra/DEVELOPER_DOCS/README.md
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
|
||||||
|
|
||||||
|
# Developer Documentation
|
||||||
|
|
||||||
|
This directory includes additional documentation and explanations of spaCy's internals. It's mostly intended for the spaCy core development team and contributors interested in the more complex parts of the library. The documents generally focus on more abstract implementation details and how specific methods and algorithms work, and they assume knowledge of what's already available in the [usage documentation](https://spacy.io/usage) and [API reference](https://spacy.io/api).
|
||||||
|
|
||||||
|
If you're looking to contribute to spaCy, make sure to check out the documentation and [contributing guide](https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md) first.
|
216
extra/DEVELOPER_DOCS/StringStore-Vocab.md
Normal file
216
extra/DEVELOPER_DOCS/StringStore-Vocab.md
Normal file
|
@ -0,0 +1,216 @@
|
||||||
|
# StringStore & Vocab
|
||||||
|
|
||||||
|
> Reference: `spacy/strings.pyx`
|
||||||
|
> Reference: `spacy/vocab.pyx`
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
spaCy represents most strings internally using a `uint64` in Cython which
|
||||||
|
corresponds to a hash. The magic required to make this largely transparent is
|
||||||
|
handled by the `StringStore`, and is integrated into the pipelines using the
|
||||||
|
`Vocab`, which also connects it to some other information.
|
||||||
|
|
||||||
|
These are mostly internal details that average library users should never have
|
||||||
|
to think about. On the other hand, when developing a component it's normal to
|
||||||
|
interact with the Vocab for lexeme data or word vectors, and it's not unusual
|
||||||
|
to add labels to the `StringStore`.
|
||||||
|
|
||||||
|
## StringStore
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
The `StringStore` is a `cdef class` that looks a bit like a two-way dictionary,
|
||||||
|
though it is not a subclass of anything in particular.
|
||||||
|
|
||||||
|
The main functionality of the `StringStore` is that `__getitem__` converts
|
||||||
|
hashes into strings or strings into hashes.
|
||||||
|
|
||||||
|
The full details of the conversion are complicated. Normally you shouldn't have
|
||||||
|
to worry about them, but the first applicable case here is used to get the
|
||||||
|
return value:
|
||||||
|
|
||||||
|
1. 0 and the empty string are special cased to each other
|
||||||
|
2. internal symbols use a lookup table (`SYMBOLS_BY_STR`)
|
||||||
|
3. normal strings or bytes are hashed
|
||||||
|
4. internal symbol IDs in `SYMBOLS_BY_INT` are handled
|
||||||
|
5. anything not yet handled is used as a hash to lookup a string
|
||||||
|
|
||||||
|
For the symbol enums, see [`symbols.pxd`](https://github.com/explosion/spaCy/blob/master/spacy/symbols.pxd).
|
||||||
|
|
||||||
|
Almost all strings in spaCy are stored in the `StringStore`. This naturally
|
||||||
|
includes tokens, but also includes things like labels (not just NER/POS/dep,
|
||||||
|
but also categories etc.), lemmas, lowercase forms, word shapes, and so on. One
|
||||||
|
of the main results of this is that tokens can be represented by a compact C
|
||||||
|
struct ([`LexemeC`](https://spacy.io/api/cython-structs#lexemec)/[`TokenC`](https://github.com/explosion/spaCy/issues/4854)) that mostly consists of string hashes. This also means that converting
|
||||||
|
input for the models is straightforward, and there's not a token mapping step
|
||||||
|
like in many machine learning frameworks. Additionally, because the token IDs
|
||||||
|
in spaCy are based on hashes, they are consistent across environments or
|
||||||
|
models.
|
||||||
|
|
||||||
|
One pattern you'll see a lot in spaCy APIs is that `something.value` returns an
|
||||||
|
`int` and `something.value_` returns a string. That's implemented using the
|
||||||
|
`StringStore`. Typically the `int` is stored in a C struct and the string is
|
||||||
|
generated via a property that calls into the `StringStore` with the `int`.
|
||||||
|
|
||||||
|
Besides `__getitem__`, the `StringStore` has functions to return specifically a
|
||||||
|
string or specifically a hash, regardless of whether the input was a string or
|
||||||
|
hash to begin with, though these are only used occasionally.
|
||||||
|
|
||||||
|
### Implementation Details: Hashes and Allocations
|
||||||
|
|
||||||
|
Hashes are 64-bit and are computed using [murmurhash][] on UTF-8 bytes. There is no
|
||||||
|
mechanism for detecting and avoiding collisions. To date there has never been a
|
||||||
|
reproducible collision or user report about any related issues.
|
||||||
|
|
||||||
|
[murmurhash]: https://github.com/explosion/murmurhash
|
||||||
|
|
||||||
|
The empty string is not hashed, it's just converted to/from 0.
|
||||||
|
|
||||||
|
A small number of strings use indices into a lookup table (so low integers)
|
||||||
|
rather than hashes. This is mostly Universal Dependencies labels or other
|
||||||
|
strings considered "core" in spaCy. This was critical in v1, which hadn't
|
||||||
|
introduced hashing yet. Since v2 it's important for items in `spacy.attrs`,
|
||||||
|
especially lexeme flags, but is otherwise only maintained for backwards
|
||||||
|
compatibility.
|
||||||
|
|
||||||
|
You can call `strings["mystring"]` with a string the `StringStore` has never seen
|
||||||
|
before and it will return a hash. But in order to do the reverse operation, you
|
||||||
|
need to call `strings.add("mystring")` first. Without a call to `add` the
|
||||||
|
string will not be interned.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
|
||||||
|
ss = StringStore()
|
||||||
|
hashval = ss["spacy"] # 10639093010105930009
|
||||||
|
try:
|
||||||
|
# this won't work
|
||||||
|
ss[hashval]
|
||||||
|
except KeyError:
|
||||||
|
print(f"key {hashval} unknown in the StringStore.")
|
||||||
|
|
||||||
|
ss.add("spacy")
|
||||||
|
assert ss[hashval] == "spacy" # it works now
|
||||||
|
|
||||||
|
# There is no `.keys` property, but you can iterate over keys
|
||||||
|
# The empty string will never be in the list of keys
|
||||||
|
for key in ss:
|
||||||
|
print(key)
|
||||||
|
```
|
||||||
|
|
||||||
|
In normal use nothing is ever removed from the `StringStore`. In theory this
|
||||||
|
means that if you do something like iterate through all hex values of a certain
|
||||||
|
length you can have explosive memory usage. In practice this has never been an
|
||||||
|
issue. (Note that this is also different from using `sys.intern` to intern
|
||||||
|
Python strings, which does not guarantee they won't be garbage collected later.)
|
||||||
|
|
||||||
|
Strings are stored in the `StringStore` in a peculiar way: each string uses a
|
||||||
|
union that is either an eight-byte `char[]` or a `char*`. Short strings are
|
||||||
|
stored directly in the `char[]`, while longer strings are stored in allocated
|
||||||
|
memory and prefixed with their length. This is a strategy to reduce indirection
|
||||||
|
and memory fragmentation. See `decode_Utf8Str` and `_allocate` in
|
||||||
|
`strings.pyx` for the implementation.
|
||||||
|
|
||||||
|
### When to Use the StringStore?
|
||||||
|
|
||||||
|
While you can ignore the `StringStore` in many cases, there are situations where
|
||||||
|
you should make use of it to avoid errors.
|
||||||
|
|
||||||
|
Any time you introduce a string that may be set on a `Doc` field that has a hash,
|
||||||
|
you should add the string to the `StringStore`. This mainly happens when adding
|
||||||
|
labels in components, but there are some other cases:
|
||||||
|
|
||||||
|
- syntax iterators, mainly `get_noun_chunks`
|
||||||
|
- external data used in components, like the `KnowledgeBase` in the `entity_linker`
|
||||||
|
- labels used in tests
|
||||||
|
|
||||||
|
## Vocab
|
||||||
|
|
||||||
|
The `Vocab` is a core component of a `Language` pipeline. Its main function is
|
||||||
|
to manage `Lexeme`s, which are structs that contain information about a token
|
||||||
|
that depends only on its surface form, without context. `Lexeme`s store much of
|
||||||
|
the data associated with `Token`s. As a side effect of this the `Vocab` also
|
||||||
|
manages the `StringStore` for a pipeline and a grab-bag of other data.
|
||||||
|
|
||||||
|
These are things stored in the vocab:
|
||||||
|
|
||||||
|
- `Lexeme`s
|
||||||
|
- `StringStore`
|
||||||
|
- `Morphology`: manages info used in `MorphAnalysis` objects
|
||||||
|
- `vectors`: basically a dict for word vectors
|
||||||
|
- `lookups`: language specific data like lemmas
|
||||||
|
- `writing_system`: language specific metadata
|
||||||
|
- `get_noun_chunks`: a syntax iterator
|
||||||
|
- lex attribute getters: functions like `is_punct`, set in language defaults
|
||||||
|
- `cfg`: **not** the pipeline config, this is mostly unused
|
||||||
|
- `_unused_object`: Formerly an unused object, kept around until v4 for compatibility
|
||||||
|
|
||||||
|
Some of these, like the Morphology and Vectors, are complex enough that they
|
||||||
|
need their own explanations. Here we'll just look at Vocab-specific items.
|
||||||
|
|
||||||
|
### Lexemes
|
||||||
|
|
||||||
|
A `Lexeme` is a type that mainly wraps a `LexemeC`, a struct consisting of ints
|
||||||
|
that identify various context-free token attributes. Lexemes are the core data
|
||||||
|
of the `Vocab`, and can be accessed using `__getitem__` on the `Vocab`. The memory
|
||||||
|
for storing `LexemeC` objects is managed by a pool that belongs to the `Vocab`.
|
||||||
|
|
||||||
|
Note that `__getitem__` on the `Vocab` works much like the `StringStore`, in
|
||||||
|
that it accepts a hash or id, with one important difference: if you do a lookup
|
||||||
|
using a string, that value is added to the `StringStore` automatically.
|
||||||
|
|
||||||
|
The attributes stored in a `LexemeC` are:
|
||||||
|
|
||||||
|
- orth (the raw text)
|
||||||
|
- lower
|
||||||
|
- norm
|
||||||
|
- shape
|
||||||
|
- prefix
|
||||||
|
- suffix
|
||||||
|
|
||||||
|
Most of these are straightforward. All of them can be customized, and (except
|
||||||
|
`orth`) probably should be since the defaults are based on English, but in
|
||||||
|
practice this is rarely done at present.
|
||||||
|
|
||||||
|
### Lookups
|
||||||
|
|
||||||
|
This is basically a dict of dicts, implemented using a `Table` for each
|
||||||
|
sub-dict, that stores lemmas and other language-specific lookup data.
|
||||||
|
|
||||||
|
A `Table` is a subclass of `OrderedDict` used for string-to-string data. It uses
|
||||||
|
Bloom filters to speed up misses and has some extra serialization features.
|
||||||
|
Tables are not used outside of the lookups.
|
||||||
|
|
||||||
|
### Lex Attribute Getters
|
||||||
|
|
||||||
|
Lexical Attribute Getters like `is_punct` are defined on a per-language basis,
|
||||||
|
much like lookups, but take the form of functions rather than string-to-string
|
||||||
|
dicts, so they're stored separately.
|
||||||
|
|
||||||
|
### Writing System
|
||||||
|
|
||||||
|
This is a dict with three attributes:
|
||||||
|
|
||||||
|
- `direction`: ltr or rtl (default ltr)
|
||||||
|
- `has_case`: bool (default `True`)
|
||||||
|
- `has_letters`: bool (default `True`, `False` only for CJK for now)
|
||||||
|
|
||||||
|
Currently these are not used much - the main use is that `direction` is used in
|
||||||
|
visualizers, though `rtl` doesn't quite work (see
|
||||||
|
[#4854](https://github.com/explosion/spaCy/issues/4854)). In the future they
|
||||||
|
could be used when choosing hyperparameters for subwords, controlling word
|
||||||
|
shape generation, and similar tasks.
|
||||||
|
|
||||||
|
### Other Vocab Members
|
||||||
|
|
||||||
|
The Vocab is kind of the default place to store things from `Language.defaults`
|
||||||
|
that don't belong to the Tokenizer. The following properties are in the Vocab
|
||||||
|
just because they don't have anywhere else to go.
|
||||||
|
|
||||||
|
- `get_noun_chunks`
|
||||||
|
- `cfg`: This is a dict that just stores `oov_prob` (hardcoded to `-20`)
|
||||||
|
- `_unused_object`: Leftover C member, should be removed in next major version
|
||||||
|
|
||||||
|
|
|
@ -104,3 +104,26 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
SOFTWARE.
|
SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
importlib_metadata
|
||||||
|
------------------
|
||||||
|
|
||||||
|
* Files: util.py
|
||||||
|
|
||||||
|
The implementation of packages_distributions() is adapted from
|
||||||
|
importlib_metadata, which is distributed under the following license:
|
||||||
|
|
||||||
|
Copyright 2017-2019 Jason R. Coombs, Barry Warsaw
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
|
|
@ -5,7 +5,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.7,<8.1.0",
|
"thinc>=8.0.12,<8.1.0",
|
||||||
"blis>=0.4.0,<0.8.0",
|
"blis>=0.4.0,<0.8.0",
|
||||||
"pathy",
|
"pathy",
|
||||||
"numpy>=1.15.0",
|
"numpy>=1.15.0",
|
||||||
|
|
|
@ -1,15 +1,16 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.7,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.7,<8.1.0
|
thinc>=8.0.12,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
ml_datasets>=0.2.0,<0.3.0
|
ml_datasets>=0.2.0,<0.3.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.8.1,<1.1.0
|
wasabi>=0.8.1,<1.1.0
|
||||||
srsly>=2.4.1,<3.0.0
|
srsly>=2.4.1,<3.0.0
|
||||||
catalogue>=2.0.4,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.4.0
|
typer>=0.3.0,<0.5.0
|
||||||
pathy>=0.3.5
|
pathy>=0.3.5
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||||
jinja2
|
jinja2
|
||||||
|
langcodes>=3.2.0,<4.0.0
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
|
@ -29,3 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.8.0,<3.10.0
|
flake8>=3.8.0,<3.10.0
|
||||||
hypothesis>=3.27.0,<7.0.0
|
hypothesis>=3.27.0,<7.0.0
|
||||||
|
mypy==0.910
|
||||||
|
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||||
|
types-mock>=0.1.1
|
||||||
|
types-requests
|
||||||
|
|
55
setup.cfg
55
setup.cfg
|
@ -21,6 +21,7 @@ classifiers =
|
||||||
Programming Language :: Python :: 3.7
|
Programming Language :: Python :: 3.7
|
||||||
Programming Language :: Python :: 3.8
|
Programming Language :: Python :: 3.8
|
||||||
Programming Language :: Python :: 3.9
|
Programming Language :: Python :: 3.9
|
||||||
|
Programming Language :: Python :: 3.10
|
||||||
Topic :: Scientific/Engineering
|
Topic :: Scientific/Engineering
|
||||||
project_urls =
|
project_urls =
|
||||||
Release notes = https://github.com/explosion/spaCy/releases
|
Release notes = https://github.com/explosion/spaCy/releases
|
||||||
|
@ -37,19 +38,20 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.7,<8.1.0
|
thinc>=8.0.12,<8.1.0
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.7,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.7,<8.1.0
|
thinc>=8.0.12,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
wasabi>=0.8.1,<1.1.0
|
wasabi>=0.8.1,<1.1.0
|
||||||
srsly>=2.4.1,<3.0.0
|
srsly>=2.4.1,<3.0.0
|
||||||
catalogue>=2.0.4,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.4.0
|
typer>=0.3.0,<0.5.0
|
||||||
pathy>=0.3.5
|
pathy>=0.3.5
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
|
@ -61,6 +63,7 @@ install_requires =
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
||||||
|
langcodes>=3.2.0,<4.0.0
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
|
@ -68,37 +71,45 @@ console_scripts =
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=1.0.2,<1.1.0
|
spacy_lookups_data>=1.0.3,<1.1.0
|
||||||
transformers =
|
transformers =
|
||||||
spacy_transformers>=1.0.1,<1.1.0
|
spacy_transformers>=1.1.2,<1.2.0
|
||||||
ray =
|
ray =
|
||||||
spacy_ray>=0.1.0,<1.0.0
|
spacy_ray>=0.1.0,<1.0.0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4,<10.0.0
|
cupy>=5.0.0b4,<11.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
cupy-cuda80>=5.0.0b4,<10.0.0
|
cupy-cuda80>=5.0.0b4,<11.0.0
|
||||||
cuda90 =
|
cuda90 =
|
||||||
cupy-cuda90>=5.0.0b4,<10.0.0
|
cupy-cuda90>=5.0.0b4,<11.0.0
|
||||||
cuda91 =
|
cuda91 =
|
||||||
cupy-cuda91>=5.0.0b4,<10.0.0
|
cupy-cuda91>=5.0.0b4,<11.0.0
|
||||||
cuda92 =
|
cuda92 =
|
||||||
cupy-cuda92>=5.0.0b4,<10.0.0
|
cupy-cuda92>=5.0.0b4,<11.0.0
|
||||||
cuda100 =
|
cuda100 =
|
||||||
cupy-cuda100>=5.0.0b4,<10.0.0
|
cupy-cuda100>=5.0.0b4,<11.0.0
|
||||||
cuda101 =
|
cuda101 =
|
||||||
cupy-cuda101>=5.0.0b4,<10.0.0
|
cupy-cuda101>=5.0.0b4,<11.0.0
|
||||||
cuda102 =
|
cuda102 =
|
||||||
cupy-cuda102>=5.0.0b4,<10.0.0
|
cupy-cuda102>=5.0.0b4,<11.0.0
|
||||||
cuda110 =
|
cuda110 =
|
||||||
cupy-cuda110>=5.0.0b4,<10.0.0
|
cupy-cuda110>=5.0.0b4,<11.0.0
|
||||||
cuda111 =
|
cuda111 =
|
||||||
cupy-cuda111>=5.0.0b4,<10.0.0
|
cupy-cuda111>=5.0.0b4,<11.0.0
|
||||||
cuda112 =
|
cuda112 =
|
||||||
cupy-cuda112>=5.0.0b4,<10.0.0
|
cupy-cuda112>=5.0.0b4,<11.0.0
|
||||||
|
cuda113 =
|
||||||
|
cupy-cuda113>=5.0.0b4,<11.0.0
|
||||||
|
cuda114 =
|
||||||
|
cupy-cuda114>=5.0.0b4,<11.0.0
|
||||||
|
cuda115 =
|
||||||
|
cupy-cuda115>=5.0.0b4,<11.0.0
|
||||||
|
apple =
|
||||||
|
thinc-apple-ops>=0.0.4,<1.0.0
|
||||||
# Language tokenizers with external dependencies
|
# Language tokenizers with external dependencies
|
||||||
ja =
|
ja =
|
||||||
sudachipy>=0.4.9
|
sudachipy>=0.5.2,!=0.6.1
|
||||||
sudachidict_core>=20200330
|
sudachidict_core>=20211220
|
||||||
ko =
|
ko =
|
||||||
natto-py==0.9.0
|
natto-py==0.9.0
|
||||||
th =
|
th =
|
||||||
|
@ -122,9 +133,11 @@ exclude =
|
||||||
|
|
||||||
[tool:pytest]
|
[tool:pytest]
|
||||||
markers =
|
markers =
|
||||||
slow
|
slow: mark a test as slow
|
||||||
|
issue: reference specific issue
|
||||||
|
|
||||||
[mypy]
|
[mypy]
|
||||||
ignore_missing_imports = True
|
ignore_missing_imports = True
|
||||||
no_implicit_optional = True
|
no_implicit_optional = True
|
||||||
plugins = pydantic.mypy, thinc.mypy
|
plugins = pydantic.mypy, thinc.mypy
|
||||||
|
allow_redefinition = True
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -81,6 +81,7 @@ COPY_FILES = {
|
||||||
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||||
|
ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import sys
|
||||||
# set library-specific custom warning handling before doing anything else
|
# set library-specific custom warning handling before doing anything else
|
||||||
from .errors import setup_default_warnings
|
from .errors import setup_default_warnings
|
||||||
|
|
||||||
setup_default_warnings()
|
setup_default_warnings() # noqa: E402
|
||||||
|
|
||||||
# These are imported as part of the API
|
# These are imported as part of the API
|
||||||
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.1.0"
|
__version__ = "3.2.1"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
from .errors import Errors
|
||||||
|
|
||||||
|
IOB_STRINGS = ("", "I", "O", "B")
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
"": NULL_ATTR,
|
"": NULL_ATTR,
|
||||||
|
@ -64,7 +67,6 @@ IDS = {
|
||||||
"FLAG61": FLAG61,
|
"FLAG61": FLAG61,
|
||||||
"FLAG62": FLAG62,
|
"FLAG62": FLAG62,
|
||||||
"FLAG63": FLAG63,
|
"FLAG63": FLAG63,
|
||||||
|
|
||||||
"ID": ID,
|
"ID": ID,
|
||||||
"ORTH": ORTH,
|
"ORTH": ORTH,
|
||||||
"LOWER": LOWER,
|
"LOWER": LOWER,
|
||||||
|
@ -72,7 +74,6 @@ IDS = {
|
||||||
"SHAPE": SHAPE,
|
"SHAPE": SHAPE,
|
||||||
"PREFIX": PREFIX,
|
"PREFIX": PREFIX,
|
||||||
"SUFFIX": SUFFIX,
|
"SUFFIX": SUFFIX,
|
||||||
|
|
||||||
"LENGTH": LENGTH,
|
"LENGTH": LENGTH,
|
||||||
"LEMMA": LEMMA,
|
"LEMMA": LEMMA,
|
||||||
"POS": POS,
|
"POS": POS,
|
||||||
|
@ -87,7 +88,7 @@ IDS = {
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
"MORPH": MORPH,
|
"MORPH": MORPH,
|
||||||
"IDX": IDX
|
"IDX": IDX,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
"""
|
"""
|
||||||
inty_attrs = {}
|
inty_attrs = {}
|
||||||
if _do_deprecated:
|
if _do_deprecated:
|
||||||
if 'F' in stringy_attrs:
|
if "F" in stringy_attrs:
|
||||||
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
||||||
if 'L' in stringy_attrs:
|
if "L" in stringy_attrs:
|
||||||
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
||||||
if 'pos' in stringy_attrs:
|
if "pos" in stringy_attrs:
|
||||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
||||||
if 'morph' in stringy_attrs:
|
if "morph" in stringy_attrs:
|
||||||
morphs = stringy_attrs.pop('morph')
|
morphs = stringy_attrs.pop("morph")
|
||||||
if 'number' in stringy_attrs:
|
if "number" in stringy_attrs:
|
||||||
stringy_attrs.pop('number')
|
stringy_attrs.pop("number")
|
||||||
if 'tenspect' in stringy_attrs:
|
if "tenspect" in stringy_attrs:
|
||||||
stringy_attrs.pop('tenspect')
|
stringy_attrs.pop("tenspect")
|
||||||
morph_keys = [
|
morph_keys = [
|
||||||
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
|
"PunctType",
|
||||||
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
|
"PunctSide",
|
||||||
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
|
"Other",
|
||||||
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
|
"Degree",
|
||||||
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
|
"AdvType",
|
||||||
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
"Number",
|
||||||
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
"VerbForm",
|
||||||
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
"PronType",
|
||||||
'Polarity', 'PrepCase', 'Animacy' # U20
|
"Aspect",
|
||||||
|
"Tense",
|
||||||
|
"PartType",
|
||||||
|
"Poss",
|
||||||
|
"Hyph",
|
||||||
|
"ConjType",
|
||||||
|
"NumType",
|
||||||
|
"Foreign",
|
||||||
|
"VerbType",
|
||||||
|
"NounType",
|
||||||
|
"Gender",
|
||||||
|
"Mood",
|
||||||
|
"Negative",
|
||||||
|
"Tense",
|
||||||
|
"Voice",
|
||||||
|
"Abbr",
|
||||||
|
"Derivation",
|
||||||
|
"Echo",
|
||||||
|
"Foreign",
|
||||||
|
"NameType",
|
||||||
|
"NounType",
|
||||||
|
"NumForm",
|
||||||
|
"NumValue",
|
||||||
|
"PartType",
|
||||||
|
"Polite",
|
||||||
|
"StyleVariant",
|
||||||
|
"PronType",
|
||||||
|
"AdjType",
|
||||||
|
"Person",
|
||||||
|
"Variant",
|
||||||
|
"AdpType",
|
||||||
|
"Reflex",
|
||||||
|
"Negative",
|
||||||
|
"Mood",
|
||||||
|
"Aspect",
|
||||||
|
"Case",
|
||||||
|
"Polarity",
|
||||||
|
"PrepCase",
|
||||||
|
"Animacy", # U20
|
||||||
]
|
]
|
||||||
for key in morph_keys:
|
for key in morph_keys:
|
||||||
if key in stringy_attrs:
|
if key in stringy_attrs:
|
||||||
|
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
for name, value in stringy_attrs.items():
|
for name, value in stringy_attrs.items():
|
||||||
int_key = intify_attr(name)
|
int_key = intify_attr(name)
|
||||||
if int_key is not None:
|
if int_key is not None:
|
||||||
if strings_map is not None and isinstance(value, basestring):
|
if int_key == ENT_IOB:
|
||||||
if hasattr(strings_map, 'add'):
|
if value in IOB_STRINGS:
|
||||||
|
value = IOB_STRINGS.index(value)
|
||||||
|
elif isinstance(value, str):
|
||||||
|
raise ValueError(Errors.E1025.format(value=value))
|
||||||
|
if strings_map is not None and isinstance(value, str):
|
||||||
|
if hasattr(strings_map, "add"):
|
||||||
value = strings_map.add(value)
|
value = strings_map.add(value)
|
||||||
else:
|
else:
|
||||||
value = strings_map[value]
|
value = strings_map[value]
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
|
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
||||||
|
from typing import TYPE_CHECKING, overload
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -15,6 +16,7 @@ from thinc.util import has_cupy, gpu_is_available
|
||||||
from configparser import InterpolationError
|
from configparser import InterpolationError
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from ..compat import Literal
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||||
|
@ -260,15 +262,16 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
RETURNS (str): The checksum.
|
RETURNS (str): The checksum.
|
||||||
"""
|
"""
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
|
if not (path.is_file() or path.is_dir()):
|
||||||
|
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
||||||
if path.is_file():
|
if path.is_file():
|
||||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
||||||
if path.is_dir():
|
else:
|
||||||
# TODO: this is currently pretty slow
|
# TODO: this is currently pretty slow
|
||||||
dir_checksum = hashlib.md5()
|
dir_checksum = hashlib.md5()
|
||||||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||||
dir_checksum.update(sub_file.read_bytes())
|
dir_checksum.update(sub_file.read_bytes())
|
||||||
return dir_checksum.hexdigest()
|
return dir_checksum.hexdigest()
|
||||||
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
@ -397,7 +400,11 @@ def git_checkout(
|
||||||
run_command(cmd, capture=True)
|
run_command(cmd, capture=True)
|
||||||
# We need Path(name) to make sure we also support subdirectories
|
# We need Path(name) to make sure we also support subdirectories
|
||||||
try:
|
try:
|
||||||
shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
|
source_path = tmp_dir / Path(subpath)
|
||||||
|
if not is_subpath_of(tmp_dir, source_path):
|
||||||
|
err = f"'{subpath}' is a path outside of the cloned repository."
|
||||||
|
msg.fail(err, repo, exits=1)
|
||||||
|
shutil.copytree(str(source_path), str(dest))
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
|
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
|
||||||
msg.fail(err, repo, exits=1)
|
msg.fail(err, repo, exits=1)
|
||||||
|
@ -445,8 +452,14 @@ def git_sparse_checkout(repo, subpath, dest, branch):
|
||||||
# And finally, we can checkout our subpath
|
# And finally, we can checkout our subpath
|
||||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||||
run_command(cmd, capture=True)
|
run_command(cmd, capture=True)
|
||||||
# We need Path(name) to make sure we also support subdirectories
|
|
||||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
# Get a subdirectory of the cloned path, if appropriate
|
||||||
|
source_path = tmp_dir / Path(subpath)
|
||||||
|
if not is_subpath_of(tmp_dir, source_path):
|
||||||
|
err = f"'{subpath}' is a path outside of the cloned repository."
|
||||||
|
msg.fail(err, repo, exits=1)
|
||||||
|
|
||||||
|
shutil.move(str(source_path), str(dest))
|
||||||
|
|
||||||
|
|
||||||
def get_git_version(
|
def get_git_version(
|
||||||
|
@ -458,12 +471,15 @@ def get_git_version(
|
||||||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
||||||
(0, 0) if the version couldn't be determined.
|
(0, 0) if the version couldn't be determined.
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
ret = run_command("git --version", capture=True)
|
ret = run_command("git --version", capture=True)
|
||||||
|
except:
|
||||||
|
raise RuntimeError(error)
|
||||||
stdout = ret.stdout.strip()
|
stdout = ret.stdout.strip()
|
||||||
if not stdout or not stdout.startswith("git version"):
|
if not stdout or not stdout.startswith("git version"):
|
||||||
return (0, 0)
|
return 0, 0
|
||||||
version = stdout[11:].strip().split(".")
|
version = stdout[11:].strip().split(".")
|
||||||
return (int(version[0]), int(version[1]))
|
return int(version[0]), int(version[1])
|
||||||
|
|
||||||
|
|
||||||
def _http_to_git(repo: str) -> str:
|
def _http_to_git(repo: str) -> str:
|
||||||
|
@ -477,6 +493,29 @@ def _http_to_git(repo: str) -> str:
|
||||||
return repo
|
return repo
|
||||||
|
|
||||||
|
|
||||||
|
def is_subpath_of(parent, child):
|
||||||
|
"""
|
||||||
|
Check whether `child` is a path contained within `parent`.
|
||||||
|
"""
|
||||||
|
# Based on https://stackoverflow.com/a/37095733 .
|
||||||
|
|
||||||
|
# In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
|
||||||
|
# we can stop using crusty old os.path functions.
|
||||||
|
parent_realpath = os.path.realpath(parent)
|
||||||
|
child_realpath = os.path.realpath(child)
|
||||||
|
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def string_to_list(value: str, intify: Literal[True]) -> List[int]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
|
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
|
||||||
"""Parse a comma-separated string to a list and account for various
|
"""Parse a comma-separated string to a list and account for various
|
||||||
formatting options. Mostly used to handle CLI arguments that take a list of
|
formatting options. Mostly used to handle CLI arguments that take a list of
|
||||||
|
@ -487,7 +526,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
||||||
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
|
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
|
||||||
"""
|
"""
|
||||||
if not value:
|
if not value:
|
||||||
return []
|
return [] # type: ignore[return-value]
|
||||||
if value.startswith("[") and value.endswith("]"):
|
if value.startswith("[") and value.endswith("]"):
|
||||||
value = value[1:-1]
|
value = value[1:-1]
|
||||||
result = []
|
result = []
|
||||||
|
@ -499,7 +538,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
||||||
p = p[1:-1]
|
p = p[1:-1]
|
||||||
p = p.strip()
|
p = p.strip()
|
||||||
if intify:
|
if intify:
|
||||||
p = int(p)
|
p = int(p) # type: ignore[assignment]
|
||||||
result.append(p)
|
result.append(p)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Any, List, Union
|
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
@ -9,7 +9,7 @@ import itertools
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
from ..tokens import DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
from ..training.converters import conllu_to_docs
|
from ..training.converters import conllu_to_docs
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ from ..training.converters import conllu_to_docs
|
||||||
# entry to this dict with the file extension mapped to the converter function
|
# entry to this dict with the file extension mapped to the converter function
|
||||||
# imported from /converters.
|
# imported from /converters.
|
||||||
|
|
||||||
CONVERTERS = {
|
CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
||||||
"conllubio": conllu_to_docs,
|
"conllubio": conllu_to_docs,
|
||||||
"conllu": conllu_to_docs,
|
"conllu": conllu_to_docs,
|
||||||
"conll": conll_ner_to_docs,
|
"conll": conll_ner_to_docs,
|
||||||
|
@ -66,19 +66,16 @@ def convert_cli(
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#convert
|
DOCS: https://spacy.io/api/cli#convert
|
||||||
"""
|
"""
|
||||||
if isinstance(file_type, FileTypes):
|
|
||||||
# We get an instance of the FileTypes from the CLI so we need its string value
|
|
||||||
file_type = file_type.value
|
|
||||||
input_path = Path(input_path)
|
input_path = Path(input_path)
|
||||||
output_dir = "-" if output_dir == Path("-") else output_dir
|
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||||
silent = output_dir == "-"
|
silent = output_dir == "-"
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
|
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||||
converter = _get_converter(msg, converter, input_path)
|
converter = _get_converter(msg, converter, input_path)
|
||||||
convert(
|
convert(
|
||||||
input_path,
|
input_path,
|
||||||
output_dir,
|
output_dir,
|
||||||
file_type=file_type,
|
file_type=file_type.value,
|
||||||
n_sents=n_sents,
|
n_sents=n_sents,
|
||||||
seg_sents=seg_sents,
|
seg_sents=seg_sents,
|
||||||
model=model,
|
model=model,
|
||||||
|
@ -94,7 +91,7 @@ def convert_cli(
|
||||||
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
input_path: Union[str, Path],
|
input_path: Path,
|
||||||
output_dir: Union[str, Path],
|
output_dir: Union[str, Path],
|
||||||
*,
|
*,
|
||||||
file_type: str = "json",
|
file_type: str = "json",
|
||||||
|
@ -108,13 +105,14 @@ def convert(
|
||||||
lang: Optional[str] = None,
|
lang: Optional[str] = None,
|
||||||
concatenate: bool = False,
|
concatenate: bool = False,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
msg: Optional[Printer],
|
msg: Optional[Printer] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
input_path = Path(input_path)
|
||||||
if not msg:
|
if not msg:
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
doc_files = []
|
doc_files = []
|
||||||
for input_loc in walk_directory(Path(input_path), converter):
|
for input_loc in walk_directory(input_path, converter):
|
||||||
with input_loc.open("r", encoding="utf-8") as infile:
|
with input_loc.open("r", encoding="utf-8") as infile:
|
||||||
input_data = infile.read()
|
input_data = infile.read()
|
||||||
# Use converter function to convert data
|
# Use converter function to convert data
|
||||||
|
@ -141,7 +139,7 @@ def convert(
|
||||||
else:
|
else:
|
||||||
db = DocBin(docs=docs, store_user_data=True)
|
db = DocBin(docs=docs, store_user_data=True)
|
||||||
len_docs = len(db)
|
len_docs = len(db)
|
||||||
data = db.to_bytes()
|
data = db.to_bytes() # type: ignore[assignment]
|
||||||
if output_dir == "-":
|
if output_dir == "-":
|
||||||
_print_docs_to_stdout(data, file_type)
|
_print_docs_to_stdout(data, file_type)
|
||||||
else:
|
else:
|
||||||
|
@ -220,13 +218,12 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||||
|
|
||||||
def verify_cli_args(
|
def verify_cli_args(
|
||||||
msg: Printer,
|
msg: Printer,
|
||||||
input_path: Union[str, Path],
|
input_path: Path,
|
||||||
output_dir: Union[str, Path],
|
output_dir: Union[str, Path],
|
||||||
file_type: FileTypes,
|
file_type: str,
|
||||||
converter: str,
|
converter: str,
|
||||||
ner_map: Optional[Path],
|
ner_map: Optional[Path],
|
||||||
):
|
):
|
||||||
input_path = Path(input_path)
|
|
||||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Can't write .{file_type} data to stdout. Please specify an output directory.",
|
f"Can't write .{file_type} data to stdout. Please specify an output directory.",
|
||||||
|
@ -244,13 +241,13 @@ def verify_cli_args(
|
||||||
msg.fail("No input files in directory", input_path, exits=1)
|
msg.fail("No input files in directory", input_path, exits=1)
|
||||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||||
if converter == "auto" and len(file_types) >= 2:
|
if converter == "auto" and len(file_types) >= 2:
|
||||||
file_types = ",".join(file_types)
|
file_types_str = ",".join(file_types)
|
||||||
msg.fail("All input files must be same type", file_types, exits=1)
|
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||||
if converter != "auto" and converter not in CONVERTERS:
|
if converter != "auto" and converter not in CONVERTERS:
|
||||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||||
|
|
||||||
|
|
||||||
def _get_converter(msg, converter, input_path):
|
def _get_converter(msg, converter, input_path: Path):
|
||||||
if input_path.is_dir():
|
if input_path.is_dir():
|
||||||
input_path = walk_directory(input_path, converter)[0]
|
input_path = walk_directory(input_path, converter)[0]
|
||||||
if converter == "auto":
|
if converter == "auto":
|
||||||
|
|
|
@ -25,7 +25,7 @@ def debug_config_cli(
|
||||||
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Debug a config.cfg file and show validation errors. The command will
|
"""Debug a config file and show validation errors. The command will
|
||||||
create all objects in the tree and validate them. Note that some config
|
create all objects in the tree and validate them. Note that some config
|
||||||
validation errors are blocking and will prevent the rest of the config from
|
validation errors are blocking and will prevent the rest of the config from
|
||||||
being resolved. This means that you may not see all validation errors at
|
being resolved. This means that you may not see all validation errors at
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import List, Sequence, Dict, Any, Tuple, Optional, Set
|
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
||||||
|
from typing import cast, overload
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import sys
|
import sys
|
||||||
|
@ -13,10 +14,11 @@ from ..training.initialize import get_sourced_components
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||||
from ..pipeline import Morphologizer
|
from ..pipeline import Morphologizer, SpanCategorizer
|
||||||
from ..morphology import Morphology
|
from ..morphology import Morphology
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
|
from ..compat import Literal
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -101,13 +103,14 @@ def debug_data(
|
||||||
# Create the gold corpus to be able to better analyze data
|
# Create the gold corpus to be able to better analyze data
|
||||||
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
||||||
|
|
||||||
|
nlp.initialize(lambda: train_corpus(nlp))
|
||||||
|
msg.good("Pipeline can be initialized with data")
|
||||||
|
|
||||||
train_dataset = list(train_corpus(nlp))
|
train_dataset = list(train_corpus(nlp))
|
||||||
dev_dataset = list(dev_corpus(nlp))
|
dev_dataset = list(dev_corpus(nlp))
|
||||||
msg.good("Corpus is loadable")
|
msg.good("Corpus is loadable")
|
||||||
|
|
||||||
nlp.initialize(lambda: train_dataset)
|
|
||||||
msg.good("Pipeline can be initialized with data")
|
|
||||||
|
|
||||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||||
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
|
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
|
||||||
gold_train_unpreprocessed_data = _compile_gold(
|
gold_train_unpreprocessed_data = _compile_gold(
|
||||||
|
@ -200,7 +203,7 @@ def debug_data(
|
||||||
has_low_data_warning = False
|
has_low_data_warning = False
|
||||||
has_no_neg_warning = False
|
has_no_neg_warning = False
|
||||||
has_ws_ents_error = False
|
has_ws_ents_error = False
|
||||||
has_punct_ents_warning = False
|
has_boundary_cross_ents_warning = False
|
||||||
|
|
||||||
msg.divider("Named Entity Recognition")
|
msg.divider("Named Entity Recognition")
|
||||||
msg.info(f"{len(model_labels)} label(s)")
|
msg.info(f"{len(model_labels)} label(s)")
|
||||||
|
@ -227,10 +230,6 @@ def debug_data(
|
||||||
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
|
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
|
||||||
has_ws_ents_error = True
|
has_ws_ents_error = True
|
||||||
|
|
||||||
if gold_train_data["punct_ents"]:
|
|
||||||
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
|
|
||||||
has_punct_ents_warning = True
|
|
||||||
|
|
||||||
for label in labels:
|
for label in labels:
|
||||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
|
@ -244,14 +243,20 @@ def debug_data(
|
||||||
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
||||||
has_no_neg_warning = True
|
has_no_neg_warning = True
|
||||||
|
|
||||||
|
if gold_train_data["boundary_cross_ents"]:
|
||||||
|
msg.warn(
|
||||||
|
f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
|
||||||
|
)
|
||||||
|
has_boundary_cross_ents_warning = True
|
||||||
|
|
||||||
if not has_low_data_warning:
|
if not has_low_data_warning:
|
||||||
msg.good("Good amount of examples for all labels")
|
msg.good("Good amount of examples for all labels")
|
||||||
if not has_no_neg_warning:
|
if not has_no_neg_warning:
|
||||||
msg.good("Examples without occurrences available for all labels")
|
msg.good("Examples without occurrences available for all labels")
|
||||||
if not has_ws_ents_error:
|
if not has_ws_ents_error:
|
||||||
msg.good("No entities consisting of or starting/ending with whitespace")
|
msg.good("No entities consisting of or starting/ending with whitespace")
|
||||||
if not has_punct_ents_warning:
|
if not has_boundary_cross_ents_warning:
|
||||||
msg.good("No entities consisting of or starting/ending with punctuation")
|
msg.good("No entities crossing sentence boundaries")
|
||||||
|
|
||||||
if has_low_data_warning:
|
if has_low_data_warning:
|
||||||
msg.text(
|
msg.text(
|
||||||
|
@ -267,15 +272,9 @@ def debug_data(
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
if has_ws_ents_error:
|
if has_ws_ents_error:
|
||||||
msg.text(
|
|
||||||
"As of spaCy v2.1.0, entity spans consisting of or starting/ending "
|
|
||||||
"with whitespace characters are considered invalid."
|
|
||||||
)
|
|
||||||
|
|
||||||
if has_punct_ents_warning:
|
|
||||||
msg.text(
|
msg.text(
|
||||||
"Entity spans consisting of or starting/ending "
|
"Entity spans consisting of or starting/ending "
|
||||||
"with punctuation can not be trained with a noise level > 0."
|
"with whitespace characters are considered invalid."
|
||||||
)
|
)
|
||||||
|
|
||||||
if "textcat" in factory_names:
|
if "textcat" in factory_names:
|
||||||
|
@ -377,10 +376,11 @@ def debug_data(
|
||||||
|
|
||||||
if "tagger" in factory_names:
|
if "tagger" in factory_names:
|
||||||
msg.divider("Part-of-speech Tagging")
|
msg.divider("Part-of-speech Tagging")
|
||||||
labels = [label for label in gold_train_data["tags"]]
|
label_list = [label for label in gold_train_data["tags"]]
|
||||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||||
msg.info(f"{len(labels)} label(s) in train data")
|
msg.info(f"{len(label_list)} label(s) in train data")
|
||||||
missing_labels = model_labels - set(labels)
|
labels = set(label_list)
|
||||||
|
missing_labels = model_labels - labels
|
||||||
if missing_labels:
|
if missing_labels:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Some model labels are not present in the train data. The "
|
"Some model labels are not present in the train data. The "
|
||||||
|
@ -394,10 +394,11 @@ def debug_data(
|
||||||
|
|
||||||
if "morphologizer" in factory_names:
|
if "morphologizer" in factory_names:
|
||||||
msg.divider("Morphologizer (POS+Morph)")
|
msg.divider("Morphologizer (POS+Morph)")
|
||||||
labels = [label for label in gold_train_data["morphs"]]
|
label_list = [label for label in gold_train_data["morphs"]]
|
||||||
model_labels = _get_labels_from_model(nlp, "morphologizer")
|
model_labels = _get_labels_from_model(nlp, "morphologizer")
|
||||||
msg.info(f"{len(labels)} label(s) in train data")
|
msg.info(f"{len(label_list)} label(s) in train data")
|
||||||
missing_labels = model_labels - set(labels)
|
labels = set(label_list)
|
||||||
|
missing_labels = model_labels - labels
|
||||||
if missing_labels:
|
if missing_labels:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Some model labels are not present in the train data. The "
|
"Some model labels are not present in the train data. The "
|
||||||
|
@ -564,7 +565,7 @@ def _compile_gold(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
make_proj: bool,
|
make_proj: bool,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
data = {
|
data: Dict[str, Any] = {
|
||||||
"ner": Counter(),
|
"ner": Counter(),
|
||||||
"cats": Counter(),
|
"cats": Counter(),
|
||||||
"tags": Counter(),
|
"tags": Counter(),
|
||||||
|
@ -573,7 +574,7 @@ def _compile_gold(
|
||||||
"words": Counter(),
|
"words": Counter(),
|
||||||
"roots": Counter(),
|
"roots": Counter(),
|
||||||
"ws_ents": 0,
|
"ws_ents": 0,
|
||||||
"punct_ents": 0,
|
"boundary_cross_ents": 0,
|
||||||
"n_words": 0,
|
"n_words": 0,
|
||||||
"n_misaligned_words": 0,
|
"n_misaligned_words": 0,
|
||||||
"words_missing_vectors": Counter(),
|
"words_missing_vectors": Counter(),
|
||||||
|
@ -608,19 +609,11 @@ def _compile_gold(
|
||||||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||||
# "Illegal" whitespace entity
|
# "Illegal" whitespace entity
|
||||||
data["ws_ents"] += 1
|
data["ws_ents"] += 1
|
||||||
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
|
|
||||||
".",
|
|
||||||
"'",
|
|
||||||
"!",
|
|
||||||
"?",
|
|
||||||
",",
|
|
||||||
]:
|
|
||||||
# punctuation entity: could be replaced by whitespace when training with noise,
|
|
||||||
# so add a warning to alert the user to this unexpected side effect.
|
|
||||||
data["punct_ents"] += 1
|
|
||||||
if label.startswith(("B-", "U-")):
|
if label.startswith(("B-", "U-")):
|
||||||
combined_label = label.split("-")[1]
|
combined_label = label.split("-")[1]
|
||||||
data["ner"][combined_label] += 1
|
data["ner"][combined_label] += 1
|
||||||
|
if gold[i].is_sent_start and label.startswith(("I-", "L-")):
|
||||||
|
data["boundary_cross_ents"] += 1
|
||||||
elif label == "-":
|
elif label == "-":
|
||||||
data["ner"]["-"] += 1
|
data["ner"]["-"] += 1
|
||||||
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
|
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
|
||||||
|
@ -669,10 +662,28 @@ def _compile_gold(
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
|
@overload
|
||||||
|
def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def _format_labels(
|
||||||
|
labels: Iterable[Tuple[str, int]],
|
||||||
|
counts: Literal[True],
|
||||||
|
) -> str:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def _format_labels(
|
||||||
|
labels: Union[Iterable[str], Iterable[Tuple[str, int]]],
|
||||||
|
counts: bool = False,
|
||||||
|
) -> str:
|
||||||
if counts:
|
if counts:
|
||||||
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
return ", ".join(
|
||||||
return ", ".join([f"'{l}'" for l in labels])
|
[f"'{l}' ({c})" for l, c in cast(Iterable[Tuple[str, int]], labels)]
|
||||||
|
)
|
||||||
|
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
|
||||||
|
|
||||||
|
|
||||||
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
||||||
|
@ -688,8 +699,30 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
||||||
return count
|
return count
|
||||||
|
|
||||||
|
|
||||||
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
|
def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
|
||||||
if pipe_name not in nlp.pipe_names:
|
pipe_names = [
|
||||||
return set()
|
pipe_name
|
||||||
|
for pipe_name in nlp.pipe_names
|
||||||
|
if nlp.get_pipe_meta(pipe_name).factory == factory_name
|
||||||
|
]
|
||||||
|
labels: Set[str] = set()
|
||||||
|
for pipe_name in pipe_names:
|
||||||
pipe = nlp.get_pipe(pipe_name)
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
return set(pipe.labels)
|
labels.update(pipe.labels)
|
||||||
|
return labels
|
||||||
|
|
||||||
|
|
||||||
|
def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|
||||||
|
pipe_names = [
|
||||||
|
pipe_name
|
||||||
|
for pipe_name in nlp.pipe_names
|
||||||
|
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
|
||||||
|
]
|
||||||
|
labels: Dict[str, Set[str]] = {}
|
||||||
|
for pipe_name in pipe_names:
|
||||||
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
|
assert isinstance(pipe, SpanCategorizer)
|
||||||
|
if pipe.key not in labels:
|
||||||
|
labels[pipe.key] = set()
|
||||||
|
labels[pipe.key].update(pipe.labels)
|
||||||
|
return labels
|
||||||
|
|
|
@ -136,7 +136,7 @@ def evaluate(
|
||||||
|
|
||||||
|
|
||||||
def handle_scores_per_type(
|
def handle_scores_per_type(
|
||||||
scores: Union[Scorer, Dict[str, Any]],
|
scores: Dict[str, Any],
|
||||||
data: Dict[str, Any] = {},
|
data: Dict[str, Any] = {},
|
||||||
*,
|
*,
|
||||||
spans_key: str = "sc",
|
spans_key: str = "sc",
|
||||||
|
|
|
@ -15,7 +15,7 @@ def info_cli(
|
||||||
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
||||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||||
exclude: Optional[str] = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -61,7 +61,7 @@ def info(
|
||||||
return raw_data
|
return raw_data
|
||||||
|
|
||||||
|
|
||||||
def info_spacy() -> Dict[str, any]:
|
def info_spacy() -> Dict[str, Any]:
|
||||||
"""Generate info about the current spaCy intallation.
|
"""Generate info about the current spaCy intallation.
|
||||||
|
|
||||||
RETURNS (dict): The spaCy info.
|
RETURNS (dict): The spaCy info.
|
||||||
|
|
|
@ -27,9 +27,9 @@ class Optimizations(str, Enum):
|
||||||
@init_cli.command("config")
|
@init_cli.command("config")
|
||||||
def init_config_cli(
|
def init_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||||
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||||
|
@ -37,15 +37,13 @@ def init_config_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Generate a starter config.cfg for training. Based on your requirements
|
Generate a starter config file for training. Based on your requirements
|
||||||
specified via the CLI arguments, this command generates a config with the
|
specified via the CLI arguments, this command generates a config with the
|
||||||
optimal settings for your use case. This includes the choice of architecture,
|
optimal settings for your use case. This includes the choice of architecture,
|
||||||
pretrained weights and related hyperparameters.
|
pretrained weights and related hyperparameters.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#init-config
|
DOCS: https://spacy.io/api/cli#init-config
|
||||||
"""
|
"""
|
||||||
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
|
||||||
optimize = optimize.value
|
|
||||||
pipeline = string_to_list(pipeline)
|
pipeline = string_to_list(pipeline)
|
||||||
is_stdout = str(output_file) == "-"
|
is_stdout = str(output_file) == "-"
|
||||||
if not is_stdout and output_file.exists() and not force_overwrite:
|
if not is_stdout and output_file.exists() and not force_overwrite:
|
||||||
|
@ -57,7 +55,7 @@ def init_config_cli(
|
||||||
config = init_config(
|
config = init_config(
|
||||||
lang=lang,
|
lang=lang,
|
||||||
pipeline=pipeline,
|
pipeline=pipeline,
|
||||||
optimize=optimize,
|
optimize=optimize.value,
|
||||||
gpu=gpu,
|
gpu=gpu,
|
||||||
pretraining=pretraining,
|
pretraining=pretraining,
|
||||||
silent=is_stdout,
|
silent=is_stdout,
|
||||||
|
@ -68,15 +66,15 @@ def init_config_cli(
|
||||||
@init_cli.command("fill-config")
|
@init_cli.command("fill-config")
|
||||||
def init_fill_config_cli(
|
def init_fill_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
|
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
|
||||||
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
|
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
|
||||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||||
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
|
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Fill partial config.cfg with default values. Will add all missing settings
|
Fill partial config file with default values. Will add all missing settings
|
||||||
from the default config and will create all objects, check the registered
|
from the default config and will create all objects, check the registered
|
||||||
functions for their default values and update the base config. This command
|
functions for their default values and update the base config. This command
|
||||||
can be used with a config generated via the training quickstart widget:
|
can be used with a config generated via the training quickstart widget:
|
||||||
|
@ -175,8 +173,8 @@ def init_config(
|
||||||
"Pipeline": ", ".join(pipeline),
|
"Pipeline": ", ".join(pipeline),
|
||||||
"Optimize for": optimize,
|
"Optimize for": optimize,
|
||||||
"Hardware": variables["hardware"].upper(),
|
"Hardware": variables["hardware"].upper(),
|
||||||
"Transformer": template_vars.transformer.get("name")
|
"Transformer": template_vars.transformer.get("name") # type: ignore[attr-defined]
|
||||||
if template_vars.use_transformer
|
if template_vars.use_transformer # type: ignore[attr-defined]
|
||||||
else None,
|
else None,
|
||||||
}
|
}
|
||||||
msg.info("Generated config template specific for your use case")
|
msg.info("Generated config template specific for your use case")
|
||||||
|
|
|
@ -20,6 +20,7 @@ def init_vectors_cli(
|
||||||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
|
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||||
|
@ -34,7 +35,14 @@ def init_vectors_cli(
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
if jsonl_loc is not None:
|
if jsonl_loc is not None:
|
||||||
update_lexemes(nlp, jsonl_loc)
|
update_lexemes(nlp, jsonl_loc)
|
||||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
convert_vectors(
|
||||||
|
nlp,
|
||||||
|
vectors_loc,
|
||||||
|
truncate=truncate,
|
||||||
|
prune=prune,
|
||||||
|
name=name,
|
||||||
|
mode=mode,
|
||||||
|
)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
msg.good(
|
msg.good(
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
from typing import Optional, Union, Any, Dict, List, Tuple
|
from typing import Optional, Union, Any, Dict, List, Tuple, cast
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
||||||
|
from thinc.api import Config
|
||||||
|
from collections import defaultdict
|
||||||
|
from catalogue import RegistryError
|
||||||
import srsly
|
import srsly
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
@ -99,6 +102,12 @@ def package(
|
||||||
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
|
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
|
||||||
meta = srsly.read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
meta = get_meta(input_dir, meta)
|
meta = get_meta(input_dir, meta)
|
||||||
|
if meta["requirements"]:
|
||||||
|
msg.good(
|
||||||
|
f"Including {len(meta['requirements'])} package requirement(s) from "
|
||||||
|
f"meta and config",
|
||||||
|
", ".join(meta["requirements"]),
|
||||||
|
)
|
||||||
if name is not None:
|
if name is not None:
|
||||||
meta["name"] = name
|
meta["name"] = name
|
||||||
if version is not None:
|
if version is not None:
|
||||||
|
@ -139,6 +148,9 @@ def package(
|
||||||
readme = generate_readme(meta)
|
readme = generate_readme(meta)
|
||||||
create_file(readme_path, readme)
|
create_file(readme_path, readme)
|
||||||
create_file(package_path / model_name_v / "README.md", readme)
|
create_file(package_path / model_name_v / "README.md", readme)
|
||||||
|
msg.good("Generated README.md from meta.json")
|
||||||
|
else:
|
||||||
|
msg.info("Using existing README.md from pipeline directory")
|
||||||
imports = []
|
imports = []
|
||||||
for code_path in code_paths:
|
for code_path in code_paths:
|
||||||
imports.append(code_path.stem)
|
imports.append(code_path.stem)
|
||||||
|
@ -172,6 +184,64 @@ def has_wheel() -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_third_party_dependencies(
|
||||||
|
config: Config, exclude: List[str] = util.SimpleFrozenList()
|
||||||
|
) -> List[str]:
|
||||||
|
"""If the config includes references to registered functions that are
|
||||||
|
provided by third-party packages (spacy-transformers, other libraries), we
|
||||||
|
want to include them in meta["requirements"] so that the package specifies
|
||||||
|
them as dependencies and the user won't have to do it manually.
|
||||||
|
|
||||||
|
We do this by:
|
||||||
|
- traversing the config to check for registered function (@ keys)
|
||||||
|
- looking up the functions and getting their module
|
||||||
|
- looking up the module version and generating an appropriate version range
|
||||||
|
|
||||||
|
config (Config): The pipeline config.
|
||||||
|
exclude (list): List of packages to exclude (e.g. that already exist in meta).
|
||||||
|
RETURNS (list): The versioned requirements.
|
||||||
|
"""
|
||||||
|
own_packages = ("spacy", "spacy-legacy", "spacy-nightly", "thinc", "srsly")
|
||||||
|
distributions = util.packages_distributions()
|
||||||
|
funcs = defaultdict(set)
|
||||||
|
# We only want to look at runtime-relevant sections, not [training] or [initialize]
|
||||||
|
for section in ("nlp", "components"):
|
||||||
|
for path, value in util.walk_dict(config[section]):
|
||||||
|
if path[-1].startswith("@"): # collect all function references by registry
|
||||||
|
funcs[path[-1][1:]].add(value)
|
||||||
|
for component in config.get("components", {}).values():
|
||||||
|
if "factory" in component:
|
||||||
|
funcs["factories"].add(component["factory"])
|
||||||
|
modules = set()
|
||||||
|
lang = config["nlp"]["lang"]
|
||||||
|
for reg_name, func_names in funcs.items():
|
||||||
|
for func_name in func_names:
|
||||||
|
# Try the lang-specific version and fall back
|
||||||
|
try:
|
||||||
|
func_info = util.registry.find(reg_name, lang + "." + func_name)
|
||||||
|
except RegistryError:
|
||||||
|
try:
|
||||||
|
func_info = util.registry.find(reg_name, func_name)
|
||||||
|
except RegistryError as regerr:
|
||||||
|
# lang-specific version being absent is not actually an issue
|
||||||
|
raise regerr from None
|
||||||
|
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||||
|
if module_name: # the code is part of a module, not a --code file
|
||||||
|
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||||
|
dependencies = []
|
||||||
|
for module_name in modules:
|
||||||
|
if module_name in distributions:
|
||||||
|
dist = distributions.get(module_name)
|
||||||
|
if dist:
|
||||||
|
pkg = dist[0]
|
||||||
|
if pkg in own_packages or pkg in exclude:
|
||||||
|
continue
|
||||||
|
version = util.get_package_version(pkg)
|
||||||
|
version_range = util.get_minor_version_range(version) # type: ignore[arg-type]
|
||||||
|
dependencies.append(f"{pkg}{version_range}")
|
||||||
|
return dependencies
|
||||||
|
|
||||||
|
|
||||||
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
|
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
|
||||||
supported = ["sdist", "wheel", "none"]
|
supported = ["sdist", "wheel", "none"]
|
||||||
for form in formats:
|
for form in formats:
|
||||||
|
@ -192,7 +262,7 @@ def create_file(file_path: Path, contents: str) -> None:
|
||||||
def get_meta(
|
def get_meta(
|
||||||
model_path: Union[str, Path], existing_meta: Dict[str, Any]
|
model_path: Union[str, Path], existing_meta: Dict[str, Any]
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
meta = {
|
meta: Dict[str, Any] = {
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"name": "pipeline",
|
"name": "pipeline",
|
||||||
"version": "0.0.0",
|
"version": "0.0.0",
|
||||||
|
@ -202,9 +272,10 @@ def get_meta(
|
||||||
"url": "",
|
"url": "",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
}
|
}
|
||||||
meta.update(existing_meta)
|
|
||||||
nlp = util.load_model_from_path(Path(model_path))
|
nlp = util.load_model_from_path(Path(model_path))
|
||||||
meta["spacy_version"] = util.get_model_version_range(about.__version__)
|
meta.update(nlp.meta)
|
||||||
|
meta.update(existing_meta)
|
||||||
|
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
|
||||||
meta["vectors"] = {
|
meta["vectors"] = {
|
||||||
"width": nlp.vocab.vectors_length,
|
"width": nlp.vocab.vectors_length,
|
||||||
"vectors": len(nlp.vocab.vectors),
|
"vectors": len(nlp.vocab.vectors),
|
||||||
|
@ -213,6 +284,11 @@ def get_meta(
|
||||||
}
|
}
|
||||||
if about.__title__ != "spacy":
|
if about.__title__ != "spacy":
|
||||||
meta["parent_package"] = about.__title__
|
meta["parent_package"] = about.__title__
|
||||||
|
meta.setdefault("requirements", [])
|
||||||
|
# Update the requirements with all third-party packages in the config
|
||||||
|
existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
|
||||||
|
reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
|
||||||
|
meta["requirements"].extend(reqs)
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
@ -258,8 +334,8 @@ def generate_readme(meta: Dict[str, Any]) -> str:
|
||||||
license_name = meta.get("license")
|
license_name = meta.get("license")
|
||||||
sources = _format_sources(meta.get("sources"))
|
sources = _format_sources(meta.get("sources"))
|
||||||
description = meta.get("description")
|
description = meta.get("description")
|
||||||
label_scheme = _format_label_scheme(meta.get("labels"))
|
label_scheme = _format_label_scheme(cast(Dict[str, Any], meta.get("labels")))
|
||||||
accuracy = _format_accuracy(meta.get("performance"))
|
accuracy = _format_accuracy(cast(Dict[str, Any], meta.get("performance")))
|
||||||
table_data = [
|
table_data = [
|
||||||
(md.bold("Name"), md.code(name)),
|
(md.bold("Name"), md.code(name)),
|
||||||
(md.bold("Version"), md.code(version)),
|
(md.bold("Version"), md.code(version)),
|
||||||
|
@ -331,7 +407,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
|
||||||
continue
|
continue
|
||||||
col1 = md.bold(md.code(pipe))
|
col1 = md.bold(md.code(pipe))
|
||||||
col2 = ", ".join(
|
col2 = ", ".join(
|
||||||
[md.code(label.replace("|", "\\|")) for label in labels]
|
[md.code(str(label).replace("|", "\\|")) for label in labels]
|
||||||
) # noqa: W605
|
) # noqa: W605
|
||||||
label_data.append((col1, col2))
|
label_data.append((col1, col2))
|
||||||
n_labels += len(labels)
|
n_labels += len(labels)
|
||||||
|
|
|
@ -32,7 +32,7 @@ def profile_cli(
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#debug-profile
|
DOCS: https://spacy.io/api/cli#debug-profile
|
||||||
"""
|
"""
|
||||||
if ctx.parent.command.name == NAME: # called as top-level command
|
if ctx.parent.command.name == NAME: # type: ignore[union-attr] # called as top-level command
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"The profile command is now available via the 'debug profile' "
|
"The profile command is now available via the 'debug profile' "
|
||||||
"subcommand. You can run python -m spacy debug --help for an "
|
"subcommand. You can run python -m spacy debug --help for an "
|
||||||
|
@ -42,9 +42,9 @@ def profile_cli(
|
||||||
|
|
||||||
|
|
||||||
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
|
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
|
||||||
|
|
||||||
if inputs is not None:
|
if inputs is not None:
|
||||||
inputs = _read_inputs(inputs, msg)
|
texts = _read_inputs(inputs, msg)
|
||||||
|
texts = list(itertools.islice(texts, n_texts))
|
||||||
if inputs is None:
|
if inputs is None:
|
||||||
try:
|
try:
|
||||||
import ml_datasets
|
import ml_datasets
|
||||||
|
@ -56,16 +56,13 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
n_inputs = 25000
|
with msg.loading("Loading IMDB dataset via ml_datasets..."):
|
||||||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
|
||||||
imdb_train, _ = ml_datasets.imdb()
|
texts, _ = zip(*imdb_train)
|
||||||
inputs, _ = zip(*imdb_train)
|
msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
|
||||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
|
||||||
inputs = inputs[:n_inputs]
|
|
||||||
with msg.loading(f"Loading pipeline '{model}'..."):
|
with msg.loading(f"Loading pipeline '{model}'..."):
|
||||||
nlp = load_model(model)
|
nlp = load_model(model)
|
||||||
msg.good(f"Loaded pipeline '{model}'")
|
msg.good(f"Loaded pipeline '{model}'")
|
||||||
texts = list(itertools.islice(inputs, n_texts))
|
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
msg.divider("Profile stats")
|
msg.divider("Profile stats")
|
||||||
|
@ -87,7 +84,7 @@ def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
|
||||||
if not input_path.exists() or not input_path.is_file():
|
if not input_path.exists() or not input_path.is_file():
|
||||||
msg.fail("Not a valid input data file", loc, exits=1)
|
msg.fail("Not a valid input data file", loc, exits=1)
|
||||||
msg.info(f"Using data from {input_path.parts[-1]}")
|
msg.info(f"Using data from {input_path.parts[-1]}")
|
||||||
file_ = input_path.open()
|
file_ = input_path.open() # type: ignore[assignment]
|
||||||
for line in file_:
|
for line in file_:
|
||||||
data = srsly.json_loads(line)
|
data = srsly.json_loads(line)
|
||||||
text = data["text"]
|
text = data["text"]
|
||||||
|
|
|
@ -1,18 +1,25 @@
|
||||||
from typing import Optional
|
from typing import Any, Dict, Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import requests
|
import requests
|
||||||
|
import typer
|
||||||
|
|
||||||
from ...util import ensure_path, working_dir
|
from ...util import ensure_path, working_dir
|
||||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
||||||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
||||||
|
from .._util import SimpleFrozenDict, parse_config_overrides
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
@project_cli.command(
|
||||||
|
"assets",
|
||||||
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
)
|
||||||
def project_assets_cli(
|
def project_assets_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
|
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
@ -24,16 +31,22 @@ def project_assets_cli(
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#project-assets
|
DOCS: https://spacy.io/api/cli#project-assets
|
||||||
"""
|
"""
|
||||||
project_assets(project_dir, sparse_checkout=sparse_checkout)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
|
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
|
||||||
|
|
||||||
|
|
||||||
def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
|
def project_assets(
|
||||||
|
project_dir: Path,
|
||||||
|
*,
|
||||||
|
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
|
sparse_checkout: bool = False,
|
||||||
|
) -> None:
|
||||||
"""Fetch assets for a project using DVC if possible.
|
"""Fetch assets for a project using DVC if possible.
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
project_dir (Path): Path to project directory.
|
||||||
"""
|
"""
|
||||||
project_path = ensure_path(project_dir)
|
project_path = ensure_path(project_dir)
|
||||||
config = load_project_config(project_path)
|
config = load_project_config(project_path, overrides=overrides)
|
||||||
assets = config.get("assets", {})
|
assets = config.get("assets", {})
|
||||||
if not assets:
|
if not assets:
|
||||||
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||||
|
@ -59,6 +72,15 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
|
||||||
shutil.rmtree(dest)
|
shutil.rmtree(dest)
|
||||||
else:
|
else:
|
||||||
dest.unlink()
|
dest.unlink()
|
||||||
|
if "repo" not in asset["git"] or asset["git"]["repo"] is None:
|
||||||
|
msg.fail(
|
||||||
|
"A git asset must include 'repo', the repository address.", exits=1
|
||||||
|
)
|
||||||
|
if "path" not in asset["git"] or asset["git"]["path"] is None:
|
||||||
|
msg.fail(
|
||||||
|
"A git asset must include 'path' - use \"\" to get the entire repository.",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
git_checkout(
|
git_checkout(
|
||||||
asset["git"]["repo"],
|
asset["git"]["repo"],
|
||||||
asset["git"]["path"],
|
asset["git"]["path"],
|
||||||
|
@ -108,11 +130,17 @@ def fetch_asset(
|
||||||
the asset failed.
|
the asset failed.
|
||||||
"""
|
"""
|
||||||
dest_path = (project_path / dest).resolve()
|
dest_path = (project_path / dest).resolve()
|
||||||
if dest_path.exists() and checksum:
|
if dest_path.exists():
|
||||||
# If there's already a file, check for checksum
|
# If there's already a file, check for checksum
|
||||||
|
if checksum:
|
||||||
if checksum == get_checksum(dest_path):
|
if checksum == get_checksum(dest_path):
|
||||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
return dest_path
|
return
|
||||||
|
else:
|
||||||
|
# If there's not a checksum, make sure the file is a possibly valid size
|
||||||
|
if os.path.getsize(dest_path) == 0:
|
||||||
|
msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
|
||||||
|
os.remove(dest_path)
|
||||||
# We might as well support the user here and create parent directories in
|
# We might as well support the user here and create parent directories in
|
||||||
# case the asset dir isn't listed as a dir to create in the project.yml
|
# case the asset dir isn't listed as a dir to create in the project.yml
|
||||||
if not dest_path.parent.exists():
|
if not dest_path.parent.exists():
|
||||||
|
@ -129,7 +157,6 @@ def fetch_asset(
|
||||||
msg.good(f"Copied local asset {dest}")
|
msg.good(f"Copied local asset {dest}")
|
||||||
else:
|
else:
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
return
|
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||||
|
|
||||||
|
|
|
@ -80,9 +80,9 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||||
repo (str): URL of the repo to clone from.
|
repo (str): URL of the repo to clone from.
|
||||||
"""
|
"""
|
||||||
git_err = (
|
git_err = (
|
||||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||||
f"To clone a project without Git, copy the files from the '{name}' "
|
f"To clone a project without Git, copy the files from the '{name}' "
|
||||||
f"directory in the {repo} to {dest} manually.",
|
f"directory in the {repo} to {dest} manually."
|
||||||
)
|
)
|
||||||
get_git_version(error=git_err)
|
get_git_version(error=git_err)
|
||||||
if not dest:
|
if not dest:
|
||||||
|
|
|
@ -143,8 +143,8 @@ def run_dvc_commands(
|
||||||
easier to pass flags like --quiet that depend on a variable or
|
easier to pass flags like --quiet that depend on a variable or
|
||||||
command-line setting while avoiding lots of nested conditionals.
|
command-line setting while avoiding lots of nested conditionals.
|
||||||
"""
|
"""
|
||||||
for command in commands:
|
for c in commands:
|
||||||
command = split_command(command)
|
command = split_command(c)
|
||||||
dvc_command = ["dvc", *command]
|
dvc_command = ["dvc", *command]
|
||||||
# Add the flags if they are set to True
|
# Add the flags if they are set to True
|
||||||
for flag, is_active in flags.items():
|
for flag, is_active in flags.items():
|
||||||
|
|
|
@ -2,7 +2,7 @@ from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from .remote_storage import RemoteStorage
|
from .remote_storage import RemoteStorage
|
||||||
from .remote_storage import get_command_hash
|
from .remote_storage import get_command_hash
|
||||||
from .._util import project_cli, Arg
|
from .._util import project_cli, Arg, logger
|
||||||
from .._util import load_project_config
|
from .._util import load_project_config
|
||||||
from .run import update_lockfile
|
from .run import update_lockfile
|
||||||
|
|
||||||
|
@ -39,11 +39,15 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||||
# in the list.
|
# in the list.
|
||||||
while commands:
|
while commands:
|
||||||
for i, cmd in enumerate(list(commands)):
|
for i, cmd in enumerate(list(commands)):
|
||||||
|
logger.debug(f"CMD: {cmd['name']}.")
|
||||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||||
if all(dep.exists() for dep in deps):
|
if all(dep.exists() for dep in deps):
|
||||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||||
for output_path in cmd.get("outputs", []):
|
for output_path in cmd.get("outputs", []):
|
||||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||||
|
logger.debug(
|
||||||
|
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
||||||
|
)
|
||||||
yield url, output_path
|
yield url, output_path
|
||||||
|
|
||||||
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
||||||
|
@ -53,6 +57,8 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||||
# we iterate over the loop again.
|
# we iterate over the loop again.
|
||||||
commands.pop(i)
|
commands.pop(i)
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
||||||
else:
|
else:
|
||||||
# If we didn't break the for loop, break the while loop.
|
# If we didn't break the for loop, break the while loop.
|
||||||
break
|
break
|
||||||
|
|
|
@ -3,7 +3,7 @@ from wasabi import msg
|
||||||
from .remote_storage import RemoteStorage
|
from .remote_storage import RemoteStorage
|
||||||
from .remote_storage import get_content_hash, get_command_hash
|
from .remote_storage import get_content_hash, get_command_hash
|
||||||
from .._util import load_project_config
|
from .._util import load_project_config
|
||||||
from .._util import project_cli, Arg
|
from .._util import project_cli, Arg, logger
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("push")
|
@project_cli.command("push")
|
||||||
|
@ -37,12 +37,15 @@ def project_push(project_dir: Path, remote: str):
|
||||||
remote = config["remotes"][remote]
|
remote = config["remotes"][remote]
|
||||||
storage = RemoteStorage(project_dir, remote)
|
storage = RemoteStorage(project_dir, remote)
|
||||||
for cmd in config.get("commands", []):
|
for cmd in config.get("commands", []):
|
||||||
|
logger.debug(f"CMD: cmd['name']")
|
||||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||||
if any(not dep.exists() for dep in deps):
|
if any(not dep.exists() for dep in deps):
|
||||||
|
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
||||||
continue
|
continue
|
||||||
cmd_hash = get_command_hash(
|
cmd_hash = get_command_hash(
|
||||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||||
)
|
)
|
||||||
|
logger.debug(f"CMD_HASH: {cmd_hash}")
|
||||||
for output_path in cmd.get("outputs", []):
|
for output_path in cmd.get("outputs", []):
|
||||||
output_loc = project_dir / output_path
|
output_loc = project_dir / output_path
|
||||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||||
|
@ -51,6 +54,9 @@ def project_push(project_dir: Path, remote: str):
|
||||||
command_hash=cmd_hash,
|
command_hash=cmd_hash,
|
||||||
content_hash=get_content_hash(output_loc),
|
content_hash=get_content_hash(output_loc),
|
||||||
)
|
)
|
||||||
|
logger.debug(
|
||||||
|
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
||||||
|
)
|
||||||
yield output_path, url
|
yield output_path, url
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -41,7 +41,7 @@ class RemoteStorage:
|
||||||
raise IOError(f"Cannot push {loc}: does not exist.")
|
raise IOError(f"Cannot push {loc}: does not exist.")
|
||||||
url = self.make_url(path, command_hash, content_hash)
|
url = self.make_url(path, command_hash, content_hash)
|
||||||
if url.exists():
|
if url.exists():
|
||||||
return None
|
return url
|
||||||
tmp: Path
|
tmp: Path
|
||||||
with make_tempdir() as tmp:
|
with make_tempdir() as tmp:
|
||||||
tar_loc = tmp / self.encode_name(str(path))
|
tar_loc = tmp / self.encode_name(str(path))
|
||||||
|
@ -131,8 +131,10 @@ def get_command_hash(
|
||||||
currently installed packages, whatever environment variables have been marked
|
currently installed packages, whatever environment variables have been marked
|
||||||
as relevant, and the command.
|
as relevant, and the command.
|
||||||
"""
|
"""
|
||||||
check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
|
||||||
spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
|
spacy_v = GIT_VERSION
|
||||||
|
else:
|
||||||
|
spacy_v = str(get_minor_version(about.__version__) or "")
|
||||||
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
||||||
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
||||||
hashes.extend(cmd)
|
hashes.extend(cmd)
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from typing import Optional, List, Dict, Sequence, Any, Iterable
|
from typing import Optional, List, Dict, Sequence, Any, Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
from wasabi.util import locale_escape
|
||||||
import sys
|
import sys
|
||||||
import srsly
|
import srsly
|
||||||
import typer
|
import typer
|
||||||
|
@ -57,6 +58,7 @@ def project_run(
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
project_dir (Path): Path to project directory.
|
||||||
subcommand (str): Name of command to run.
|
subcommand (str): Name of command to run.
|
||||||
|
overrides (Dict[str, Any]): Optional config overrides.
|
||||||
force (bool): Force re-running, even if nothing changed.
|
force (bool): Force re-running, even if nothing changed.
|
||||||
dry (bool): Perform a dry run and don't execute commands.
|
dry (bool): Perform a dry run and don't execute commands.
|
||||||
capture (bool): Whether to capture the output and errors of individual commands.
|
capture (bool): Whether to capture the output and errors of individual commands.
|
||||||
|
@ -68,11 +70,18 @@ def project_run(
|
||||||
config = load_project_config(project_dir, overrides=overrides)
|
config = load_project_config(project_dir, overrides=overrides)
|
||||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
workflows = config.get("workflows", {})
|
workflows = config.get("workflows", {})
|
||||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||||
if subcommand in workflows:
|
if subcommand in workflows:
|
||||||
msg.info(f"Running workflow '{subcommand}'")
|
msg.info(f"Running workflow '{subcommand}'")
|
||||||
for cmd in workflows[subcommand]:
|
for cmd in workflows[subcommand]:
|
||||||
project_run(project_dir, cmd, force=force, dry=dry, capture=capture)
|
project_run(
|
||||||
|
project_dir,
|
||||||
|
cmd,
|
||||||
|
overrides=overrides,
|
||||||
|
force=force,
|
||||||
|
dry=dry,
|
||||||
|
capture=capture,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
cmd = commands[subcommand]
|
cmd = commands[subcommand]
|
||||||
for dep in cmd.get("deps", []):
|
for dep in cmd.get("deps", []):
|
||||||
|
@ -107,7 +116,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
workflows = config.get("workflows", {})
|
workflows = config.get("workflows", {})
|
||||||
project_loc = "" if is_cwd(project_dir) else project_dir
|
project_loc = "" if is_cwd(project_dir) else project_dir
|
||||||
if subcommand:
|
if subcommand:
|
||||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||||
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
||||||
if subcommand in commands:
|
if subcommand in commands:
|
||||||
help_text = commands[subcommand].get("help")
|
help_text = commands[subcommand].get("help")
|
||||||
|
@ -127,7 +136,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
print("")
|
print("")
|
||||||
title = config.get("title")
|
title = config.get("title")
|
||||||
if title:
|
if title:
|
||||||
print(f"{title}\n")
|
print(f"{locale_escape(title)}\n")
|
||||||
if config_commands:
|
if config_commands:
|
||||||
print(f"Available commands in {PROJECT_FILE}")
|
print(f"Available commands in {PROJECT_FILE}")
|
||||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
|
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
|
||||||
|
@ -155,8 +164,8 @@ def run_commands(
|
||||||
when you want to turn over execution to the command, and capture=True
|
when you want to turn over execution to the command, and capture=True
|
||||||
when you want to run the command more like a function.
|
when you want to run the command more like a function.
|
||||||
"""
|
"""
|
||||||
for command in commands:
|
for c in commands:
|
||||||
command = split_command(command)
|
command = split_command(c)
|
||||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
# Not sure if this is needed or a good idea. Motivation: users may often
|
||||||
# use commands in their config that reference "python" and we want to
|
# use commands in their config that reference "python" and we want to
|
||||||
# make sure that it's always executing the same Python that spaCy is
|
# make sure that it's always executing the same Python that spaCy is
|
||||||
|
@ -212,6 +221,9 @@ def check_rerun(
|
||||||
strict_version (bool):
|
strict_version (bool):
|
||||||
RETURNS (bool): Whether to re-run the command.
|
RETURNS (bool): Whether to re-run the command.
|
||||||
"""
|
"""
|
||||||
|
# Always rerun if no-skip is set
|
||||||
|
if command.get("no_skip", False):
|
||||||
|
return True
|
||||||
lock_path = project_dir / PROJECT_LOCK
|
lock_path = project_dir / PROJECT_LOCK
|
||||||
if not lock_path.exists(): # We don't have a lockfile, run command
|
if not lock_path.exists(): # We don't have a lockfile, run command
|
||||||
return True
|
return True
|
||||||
|
@ -282,7 +294,7 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
|
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
|
||||||
"""Generate the file information for a list of paths (dependencies, outputs).
|
"""Generate the file information for a list of paths (dependencies, outputs).
|
||||||
Includes the file path and the file's checksum.
|
Includes the file path and the file's checksum.
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,10 @@ gpu_allocator = null
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "{{ lang }}"
|
lang = "{{ lang }}"
|
||||||
{%- if "tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or (("textcat" in components or "textcat_multilabel" in components) and optimize == "accuracy") -%}
|
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||||
|
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||||
|
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||||
|
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
||||||
{%- else -%}
|
{%- else -%}
|
||||||
{%- set full_pipeline = components %}
|
{%- set full_pipeline = components %}
|
||||||
|
@ -32,7 +35,7 @@ batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
||||||
factory = "transformer"
|
factory = "transformer"
|
||||||
|
|
||||||
[components.transformer.model]
|
[components.transformer.model]
|
||||||
@architectures = "spacy-transformers.TransformerModel.v1"
|
@architectures = "spacy-transformers.TransformerModel.v3"
|
||||||
name = "{{ transformer["name"] }}"
|
name = "{{ transformer["name"] }}"
|
||||||
tokenizer_config = {"use_fast": true}
|
tokenizer_config = {"use_fast": true}
|
||||||
|
|
||||||
|
@ -198,7 +201,7 @@ no_output_layer = false
|
||||||
|
|
||||||
{# NON-TRANSFORMER PIPELINE #}
|
{# NON-TRANSFORMER PIPELINE #}
|
||||||
{% else -%}
|
{% else -%}
|
||||||
|
{% if "tok2vec" in full_pipeline -%}
|
||||||
[components.tok2vec]
|
[components.tok2vec]
|
||||||
factory = "tok2vec"
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
@ -223,6 +226,7 @@ width = {{ 96 if optimize == "efficiency" else 256 }}
|
||||||
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
||||||
window_size = 1
|
window_size = 1
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
{% if "morphologizer" in components %}
|
{% if "morphologizer" in components %}
|
||||||
[components.morphologizer]
|
[components.morphologizer]
|
||||||
|
|
|
@ -41,10 +41,10 @@ da:
|
||||||
word_vectors: da_core_news_lg
|
word_vectors: da_core_news_lg
|
||||||
transformer:
|
transformer:
|
||||||
efficiency:
|
efficiency:
|
||||||
name: DJSammy/bert-base-danish-uncased_BotXO,ai
|
name: Maltehb/danish-bert-botxo
|
||||||
size_factor: 3
|
size_factor: 3
|
||||||
accuracy:
|
accuracy:
|
||||||
name: DJSammy/bert-base-danish-uncased_BotXO,ai
|
name: Maltehb/danish-bert-botxo
|
||||||
size_factor: 3
|
size_factor: 3
|
||||||
de:
|
de:
|
||||||
word_vectors: de_core_news_lg
|
word_vectors: de_core_news_lg
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Dict, Any, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import typer
|
import typer
|
||||||
|
@ -7,7 +7,7 @@ import sys
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code, setup_gpu
|
from ._util import import_code, setup_gpu
|
||||||
from ..training.loop import train
|
from ..training.loop import train as train_nlp
|
||||||
from ..training.initialize import init_nlp
|
from ..training.initialize import init_nlp
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -40,14 +40,30 @@ def train_cli(
|
||||||
DOCS: https://spacy.io/api/cli#train
|
DOCS: https://spacy.io/api/cli#train
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
|
overrides = parse_config_overrides(ctx.args)
|
||||||
|
import_code(code_path)
|
||||||
|
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
|
||||||
|
|
||||||
|
|
||||||
|
def train(
|
||||||
|
config_path: Union[str, Path],
|
||||||
|
output_path: Optional[Union[str, Path]] = None,
|
||||||
|
*,
|
||||||
|
use_gpu: int = -1,
|
||||||
|
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
|
||||||
|
):
|
||||||
|
config_path = util.ensure_path(config_path)
|
||||||
|
output_path = util.ensure_path(output_path)
|
||||||
# Make sure all files and paths exists if they are needed
|
# Make sure all files and paths exists if they are needed
|
||||||
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if output_path is not None and not output_path.exists():
|
if not output_path:
|
||||||
|
msg.info("No output directory provided")
|
||||||
|
else:
|
||||||
|
if not output_path.exists():
|
||||||
output_path.mkdir(parents=True)
|
output_path.mkdir(parents=True)
|
||||||
msg.good(f"Created output directory: {output_path}")
|
msg.good(f"Created output directory: {output_path}")
|
||||||
overrides = parse_config_overrides(ctx.args)
|
msg.info(f"Saving to output directory: {output_path}")
|
||||||
import_code(code_path)
|
|
||||||
setup_gpu(use_gpu)
|
setup_gpu(use_gpu)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(config_path, overrides=overrides, interpolate=False)
|
config = util.load_config(config_path, overrides=overrides, interpolate=False)
|
||||||
|
@ -56,4 +72,4 @@ def train_cli(
|
||||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
msg.good("Initialized pipeline")
|
msg.good("Initialized pipeline")
|
||||||
msg.divider("Training pipeline")
|
msg.divider("Training pipeline")
|
||||||
train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
|
train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
|
||||||
|
|
|
@ -99,7 +99,7 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
|
||||||
warnings.filterwarnings("ignore", message="\\[W09[45]")
|
warnings.filterwarnings("ignore", message="\\[W09[45]")
|
||||||
model_meta = get_model_meta(model_path)
|
model_meta = get_model_meta(model_path)
|
||||||
spacy_version = model_meta.get("spacy_version", "n/a")
|
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||||
is_compat = is_compatible_version(about.__version__, spacy_version)
|
is_compat = is_compatible_version(about.__version__, spacy_version) # type: ignore[assignment]
|
||||||
pkgs[pkg_name] = {
|
pkgs[pkg_name] = {
|
||||||
"name": package,
|
"name": package,
|
||||||
"version": version,
|
"version": version,
|
||||||
|
|
|
@ -5,12 +5,12 @@ from thinc.util import copy_array
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pickle
|
import pickle # type: ignore[no-redef]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import copy_reg
|
import copy_reg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import copyreg as copy_reg
|
import copyreg as copy_reg # type: ignore[no-redef]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from cupy.cuda.stream import Stream as CudaStream
|
from cupy.cuda.stream import Stream as CudaStream
|
||||||
|
@ -22,10 +22,18 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
|
if sys.version_info[:2] >= (3, 8): # Python 3.8+
|
||||||
|
from typing import Literal, Protocol, runtime_checkable
|
||||||
|
else:
|
||||||
|
from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
|
||||||
|
|
||||||
|
# Important note: The importlib_metadata "backport" includes functionality
|
||||||
|
# that's not part of the built-in importlib.metadata. We should treat this
|
||||||
|
# import like the built-in and only use what's available there.
|
||||||
try: # Python 3.8+
|
try: # Python 3.8+
|
||||||
from typing import Literal
|
import importlib.metadata as importlib_metadata
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from typing_extensions import Literal # noqa: F401
|
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
|
||||||
|
|
||||||
from thinc.api import Optimizer # noqa: F401
|
from thinc.api import Optimizer # noqa: F401
|
||||||
|
|
||||||
|
|
|
@ -68,12 +68,14 @@ seed = ${system.seed}
|
||||||
gpu_allocator = ${system.gpu_allocator}
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
dropout = 0.1
|
dropout = 0.1
|
||||||
accumulate_gradient = 1
|
accumulate_gradient = 1
|
||||||
# Controls early-stopping. 0 disables early stopping.
|
# Controls early-stopping, i.e., the number of steps to continue without
|
||||||
|
# improvement before stopping. 0 disables early stopping.
|
||||||
patience = 1600
|
patience = 1600
|
||||||
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
|
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
|
||||||
# memory and shuffled within the training loop. -1 means stream train corpus
|
# memory and shuffled within the training loop. -1 means stream train corpus
|
||||||
# rather than loading in memory with no shuffling within the training loop.
|
# rather than loading in memory with no shuffling within the training loop.
|
||||||
max_epochs = 0
|
max_epochs = 0
|
||||||
|
# Maximum number of update steps to train for. 0 means an unlimited number of steps.
|
||||||
max_steps = 20000
|
max_steps = 20000
|
||||||
eval_frequency = 200
|
eval_frequency = 200
|
||||||
# Control how scores are printed and checkpoints are evaluated.
|
# Control how scores are printed and checkpoints are evaluated.
|
||||||
|
|
|
@ -5,6 +5,7 @@ raw_text = null
|
||||||
max_epochs = 1000
|
max_epochs = 1000
|
||||||
dropout = 0.2
|
dropout = 0.2
|
||||||
n_save_every = null
|
n_save_every = null
|
||||||
|
n_save_epoch = null
|
||||||
component = "tok2vec"
|
component = "tok2vec"
|
||||||
layer = ""
|
layer = ""
|
||||||
corpus = "corpora.pretrain"
|
corpus = "corpora.pretrain"
|
||||||
|
|
|
@ -18,7 +18,7 @@ RENDER_WRAPPER = None
|
||||||
|
|
||||||
|
|
||||||
def render(
|
def render(
|
||||||
docs: Union[Iterable[Union[Doc, Span]], Doc, Span],
|
docs: Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict],
|
||||||
style: str = "dep",
|
style: str = "dep",
|
||||||
page: bool = False,
|
page: bool = False,
|
||||||
minify: bool = False,
|
minify: bool = False,
|
||||||
|
@ -28,7 +28,8 @@ def render(
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (Union[Iterable[Doc], Doc]): Document(s) to visualise.
|
docs (Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]]): Document(s) to visualise.
|
||||||
|
a 'dict' is only allowed here when 'manual' is set to True
|
||||||
style (str): Visualisation style, 'dep' or 'ent'.
|
style (str): Visualisation style, 'dep' or 'ent'.
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
|
@ -53,8 +54,8 @@ def render(
|
||||||
raise ValueError(Errors.E096)
|
raise ValueError(Errors.E096)
|
||||||
renderer_func, converter = factories[style]
|
renderer_func, converter = factories[style]
|
||||||
renderer = renderer_func(options=options)
|
renderer = renderer_func(options=options)
|
||||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore
|
||||||
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore
|
||||||
html = _html["parsed"]
|
html = _html["parsed"]
|
||||||
if RENDER_WRAPPER is not None:
|
if RENDER_WRAPPER is not None:
|
||||||
html = RENDER_WRAPPER(html)
|
html = RENDER_WRAPPER(html)
|
||||||
|
@ -133,7 +134,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"lemma": np.root.lemma_,
|
"lemma": np.root.lemma_,
|
||||||
"ent_type": np.root.ent_type_,
|
"ent_type": np.root.ent_type_,
|
||||||
}
|
}
|
||||||
retokenizer.merge(np, attrs=attrs)
|
retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type]
|
||||||
if options.get("collapse_punct", True):
|
if options.get("collapse_punct", True):
|
||||||
spans = []
|
spans = []
|
||||||
for word in doc[:-1]:
|
for word in doc[:-1]:
|
||||||
|
@ -148,7 +149,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for span, tag, lemma, ent_type in spans:
|
for span, tag, lemma, ent_type in spans:
|
||||||
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
||||||
retokenizer.merge(span, attrs=attrs)
|
retokenizer.merge(span, attrs=attrs) # type: ignore[arg-type]
|
||||||
fine_grained = options.get("fine_grained")
|
fine_grained = options.get("fine_grained")
|
||||||
add_lemma = options.get("add_lemma")
|
add_lemma = options.get("add_lemma")
|
||||||
words = [
|
words = [
|
||||||
|
@ -180,11 +181,19 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"""Generate named entities in [{start: i, end: i, label: 'label'}] format.
|
"""Generate named entities in [{start: i, end: i, label: 'label'}] format.
|
||||||
|
|
||||||
doc (Doc): Document do parse.
|
doc (Doc): Document to parse.
|
||||||
|
options (Dict[str, Any]): NER-specific visualisation options.
|
||||||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||||
"""
|
"""
|
||||||
|
kb_url_template = options.get("kb_url_template", None)
|
||||||
ents = [
|
ents = [
|
||||||
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
|
{
|
||||||
|
"start": ent.start_char,
|
||||||
|
"end": ent.end_char,
|
||||||
|
"label": ent.label_,
|
||||||
|
"kb_id": ent.kb_id_ if ent.kb_id_ else "",
|
||||||
|
"kb_url": kb_url_template.format(ent.kb_id_) if kb_url_template else "#",
|
||||||
|
}
|
||||||
for ent in doc.ents
|
for ent in doc.ents
|
||||||
]
|
]
|
||||||
if not ents:
|
if not ents:
|
||||||
|
|
|
@ -3,7 +3,7 @@ import uuid
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
|
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
|
||||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from .templates import TPL_ENTS
|
from .templates import TPL_ENTS, TPL_KB_LINK
|
||||||
from ..util import minify_html, escape_html, registry
|
from ..util import minify_html, escape_html, registry
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
|
||||||
"LOC": "#ff9561",
|
"LOC": "#ff9561",
|
||||||
"PERSON": "#aa9cfc",
|
"PERSON": "#aa9cfc",
|
||||||
"NORP": "#c887fb",
|
"NORP": "#c887fb",
|
||||||
"FACILITY": "#9cc9cc",
|
"FAC": "#9cc9cc",
|
||||||
"EVENT": "#ffeb80",
|
"EVENT": "#ffeb80",
|
||||||
"LAW": "#ff8197",
|
"LAW": "#ff8197",
|
||||||
"LANGUAGE": "#ff8197",
|
"LANGUAGE": "#ff8197",
|
||||||
|
@ -305,7 +305,7 @@ class EntityRenderer:
|
||||||
"""Render entities in text.
|
"""Render entities in text.
|
||||||
|
|
||||||
text (str): Original text.
|
text (str): Original text.
|
||||||
spans (list): Individual entity spans and their start, end and label.
|
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
|
||||||
title (str / None): Document title set in Doc.user_data['title'].
|
title (str / None): Document title set in Doc.user_data['title'].
|
||||||
"""
|
"""
|
||||||
markup = ""
|
markup = ""
|
||||||
|
@ -314,6 +314,9 @@ class EntityRenderer:
|
||||||
label = span["label"]
|
label = span["label"]
|
||||||
start = span["start"]
|
start = span["start"]
|
||||||
end = span["end"]
|
end = span["end"]
|
||||||
|
kb_id = span.get("kb_id", "")
|
||||||
|
kb_url = span.get("kb_url", "#")
|
||||||
|
kb_link = TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
|
||||||
additional_params = span.get("params", {})
|
additional_params = span.get("params", {})
|
||||||
entity = escape_html(text[start:end])
|
entity = escape_html(text[start:end])
|
||||||
fragments = text[offset:start].split("\n")
|
fragments = text[offset:start].split("\n")
|
||||||
|
@ -323,7 +326,12 @@ class EntityRenderer:
|
||||||
markup += "</br>"
|
markup += "</br>"
|
||||||
if self.ents is None or label.upper() in self.ents:
|
if self.ents is None or label.upper() in self.ents:
|
||||||
color = self.colors.get(label.upper(), self.default_color)
|
color = self.colors.get(label.upper(), self.default_color)
|
||||||
ent_settings = {"label": label, "text": entity, "bg": color}
|
ent_settings = {
|
||||||
|
"label": label,
|
||||||
|
"text": entity,
|
||||||
|
"bg": color,
|
||||||
|
"kb_link": kb_link,
|
||||||
|
}
|
||||||
ent_settings.update(additional_params)
|
ent_settings.update(additional_params)
|
||||||
markup += self.ent_template.format(**ent_settings)
|
markup += self.ent_template.format(**ent_settings)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -51,17 +51,22 @@ TPL_ENTS = """
|
||||||
TPL_ENT = """
|
TPL_ENT = """
|
||||||
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
||||||
{text}
|
{text}
|
||||||
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}</span>
|
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}{kb_link}</span>
|
||||||
</mark>
|
</mark>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
TPL_ENT_RTL = """
|
TPL_ENT_RTL = """
|
||||||
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em">
|
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em">
|
||||||
{text}
|
{text}
|
||||||
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-right: 0.5rem">{label}</span>
|
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-right: 0.5rem">{label}{kb_link}</span>
|
||||||
</mark>
|
</mark>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Important: this needs to start with a space!
|
||||||
|
TPL_KB_LINK = """
|
||||||
|
<a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
TPL_PAGE = """
|
TPL_PAGE = """
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
|
|
|
@ -1,19 +1,14 @@
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
|
||||||
def add_codes(err_cls):
|
class ErrorsWithCodes(type):
|
||||||
"""Add error codes to string messages via class attribute names."""
|
|
||||||
|
|
||||||
class ErrorsWithCodes(err_cls):
|
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = super(ErrorsWithCodes, self).__getattribute__(code)
|
msg = super().__getattribute__(code)
|
||||||
if code.startswith("__"): # python system attributes like __class__
|
if code.startswith("__"): # python system attributes like __class__
|
||||||
return msg
|
return msg
|
||||||
else:
|
else:
|
||||||
return "[{code}] {msg}".format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
|
||||||
return ErrorsWithCodes()
|
|
||||||
|
|
||||||
|
|
||||||
def setup_default_warnings():
|
def setup_default_warnings():
|
||||||
# ignore certain numpy warnings
|
# ignore certain numpy warnings
|
||||||
|
@ -25,7 +20,10 @@ def setup_default_warnings():
|
||||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||||
|
|
||||||
# warn once about lemmatizer without required POS
|
# warn once about lemmatizer without required POS
|
||||||
filter_warning("once", error_msg="[W108]")
|
filter_warning("once", error_msg=Warnings.W108)
|
||||||
|
|
||||||
|
# floret vector table cannot be modified
|
||||||
|
filter_warning("once", error_msg="[W114]")
|
||||||
|
|
||||||
|
|
||||||
def filter_warning(action: str, error_msg: str):
|
def filter_warning(action: str, error_msg: str):
|
||||||
|
@ -44,8 +42,7 @@ def _escape_warning_msg(msg):
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
|
||||||
@add_codes
|
class Warnings(metaclass=ErrorsWithCodes):
|
||||||
class Warnings:
|
|
||||||
W005 = ("Doc object not parsed. This means displaCy won't be able to "
|
W005 = ("Doc object not parsed. This means displaCy won't be able to "
|
||||||
"generate a dependency visualization for it. Make sure the Doc "
|
"generate a dependency visualization for it. Make sure the Doc "
|
||||||
"was processed with a model that supports dependency parsing, and "
|
"was processed with a model that supports dependency parsing, and "
|
||||||
|
@ -116,13 +113,11 @@ class Warnings:
|
||||||
|
|
||||||
# New warnings added in v3.x
|
# New warnings added in v3.x
|
||||||
W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
|
W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
|
||||||
"'{name}' which is frozen. You can either freeze both, or neither "
|
"'{name}' which is frozen. If you want to prevent retraining '{name}' "
|
||||||
"of the two. If you're sourcing the component from "
|
"but want to train '{listener}' on top of it, you should add '{name}' to the "
|
||||||
"an existing pipeline, you can use the `replace_listeners` setting in "
|
"list of 'annotating_components' in the 'training' block in the config. "
|
||||||
"the config block to replace its token-to-vector listener with a copy "
|
"See the documentation for details: "
|
||||||
"and make it independent. For example, `replace_listeners = "
|
"https://spacy.io/usage/training#annotating-components")
|
||||||
"[\"model.tok2vec\"]` See the documentation for details: "
|
|
||||||
"https://spacy.io/usage/training#config-components-listeners")
|
|
||||||
W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' "
|
W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' "
|
||||||
"depends on it via a listener and is frozen. This means that the "
|
"depends on it via a listener and is frozen. This means that the "
|
||||||
"performance of '{listener}' will be degraded. You can either freeze "
|
"performance of '{listener}' will be degraded. You can either freeze "
|
||||||
|
@ -172,8 +167,8 @@ class Warnings:
|
||||||
"call the {matcher} on each Doc object.")
|
"call the {matcher} on each Doc object.")
|
||||||
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
||||||
"`Doc.has_annotation(\"{attr}\")` instead.")
|
"`Doc.has_annotation(\"{attr}\")` instead.")
|
||||||
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
|
W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
|
||||||
"token '{text}'. Check that your pipeline includes components that "
|
"more tokens. Check that your pipeline includes components that "
|
||||||
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
||||||
"'morphologizer'.")
|
"'morphologizer'.")
|
||||||
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
|
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
|
||||||
|
@ -192,10 +187,14 @@ class Warnings:
|
||||||
"vectors. This is almost certainly a mistake.")
|
"vectors. This is almost certainly a mistake.")
|
||||||
W113 = ("Sourced component '{name}' may not work as expected: source "
|
W113 = ("Sourced component '{name}' may not work as expected: source "
|
||||||
"vectors are not identical to current pipeline vectors.")
|
"vectors are not identical to current pipeline vectors.")
|
||||||
|
W114 = ("Using multiprocessing with GPU models is not recommended and may "
|
||||||
|
"lead to errors.")
|
||||||
|
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
|
||||||
|
"Vectors are calculated from character ngrams.")
|
||||||
|
W116 = ("Unable to clean attribute '{attr}'.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
class Errors(metaclass=ErrorsWithCodes):
|
||||||
class Errors:
|
|
||||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||||
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
||||||
"This usually happens when spaCy calls `nlp.{method}` with a custom "
|
"This usually happens when spaCy calls `nlp.{method}` with a custom "
|
||||||
|
@ -284,7 +283,7 @@ class Errors:
|
||||||
"you forget to call the `set_extension` method?")
|
"you forget to call the `set_extension` method?")
|
||||||
E047 = ("Can't assign a value to unregistered extension attribute "
|
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||||
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
|
||||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||||
"package or a valid path to a data directory.")
|
"package or a valid path to a data directory.")
|
||||||
E052 = ("Can't find model directory: {path}")
|
E052 = ("Can't find model directory: {path}")
|
||||||
|
@ -356,8 +355,8 @@ class Errors:
|
||||||
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
|
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
|
||||||
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
|
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
|
||||||
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
|
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
|
||||||
E100 = ("Nodes other than the anchor node should all contain LEFT_ID, "
|
E100 = ("Nodes other than the anchor node should all contain {required}, "
|
||||||
"REL_OP and RIGHT_ID.")
|
"but these are missing: {missing}")
|
||||||
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
|
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
|
||||||
"have been declared in previous edges.")
|
"have been declared in previous edges.")
|
||||||
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
||||||
|
@ -518,9 +517,24 @@ class Errors:
|
||||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E858 = ("The {mode} vector table does not support this operation. "
|
||||||
|
"{alternative}")
|
||||||
|
E859 = ("The floret vector table cannot be modified.")
|
||||||
|
E860 = ("Can't truncate fasttext-bloom vectors.")
|
||||||
|
E861 = ("No 'keys' should be provided when initializing floret vectors "
|
||||||
|
"with 'minn' and 'maxn'.")
|
||||||
|
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
|
||||||
|
E863 = ("'maxn' must be greater than or equal to 'minn'.")
|
||||||
|
E864 = ("The complete vector table 'data' is required to initialize floret "
|
||||||
|
"vectors.")
|
||||||
|
E865 = ("A SpanGroup is not functional after the corresponding Doc has "
|
||||||
|
"been garbage collected. To keep using the spans, make sure that "
|
||||||
|
"the corresponding Doc object is still available in the scope of "
|
||||||
|
"your function.")
|
||||||
|
E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
|
||||||
E867 = ("The 'textcat' component requires at least two labels because it "
|
E867 = ("The 'textcat' component requires at least two labels because it "
|
||||||
"uses mutually exclusive classes where exactly one label is True "
|
"uses mutually exclusive classes where exactly one label is True "
|
||||||
"for each doc. For binary classification tasks, you can use two "
|
"for each doc. For binary classification tasks, you can use two "
|
||||||
|
@ -628,7 +642,7 @@ class Errors:
|
||||||
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
|
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
|
||||||
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
|
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
|
||||||
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||||
"config.cfg or override it on the CLI?")
|
".cfg file or override it on the CLI?")
|
||||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
"return the nlp object but got: {value}. Maybe you forgot to return "
|
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||||
"the modified object in your function?")
|
"the modified object in your function?")
|
||||||
|
@ -655,7 +669,9 @@ class Errors:
|
||||||
"{nO} - cannot add any more labels.")
|
"{nO} - cannot add any more labels.")
|
||||||
E923 = ("It looks like there is no proper sample data to initialize the "
|
E923 = ("It looks like there is no proper sample data to initialize the "
|
||||||
"Model of component '{name}'. To check your input data paths and "
|
"Model of component '{name}'. To check your input data paths and "
|
||||||
"annotation, run: python -m spacy debug data config.cfg")
|
"annotation, run: python -m spacy debug data config.cfg "
|
||||||
|
"and include the same config override values you would specify "
|
||||||
|
"for the 'spacy train' command.")
|
||||||
E924 = ("The '{name}' component does not seem to be initialized properly. "
|
E924 = ("The '{name}' component does not seem to be initialized properly. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue: "
|
"This is likely a bug in spaCy, so feel free to open an issue: "
|
||||||
"https://github.com/explosion/spaCy/issues")
|
"https://github.com/explosion/spaCy/issues")
|
||||||
|
@ -790,7 +806,7 @@ class Errors:
|
||||||
"to token boundaries.")
|
"to token boundaries.")
|
||||||
E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
|
E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
|
||||||
"into {values}, but found {value}.")
|
"into {values}, but found {value}.")
|
||||||
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
|
E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
|
||||||
"{keys}")
|
"{keys}")
|
||||||
E984 = ("Invalid component config for '{name}': component block needs either "
|
E984 = ("Invalid component config for '{name}': component block needs either "
|
||||||
"a key `factory` specifying the registered function used to "
|
"a key `factory` specifying the registered function used to "
|
||||||
|
@ -864,6 +880,20 @@ class Errors:
|
||||||
E1018 = ("Knowledge base for component '{name}' is not set. "
|
E1018 = ("Knowledge base for component '{name}' is not set. "
|
||||||
"Make sure either `nel.initialize` or `nel.set_kb` "
|
"Make sure either `nel.initialize` or `nel.set_kb` "
|
||||||
"is called with a `kb_loader` function.")
|
"is called with a `kb_loader` function.")
|
||||||
|
E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
|
||||||
|
"statistical model to be installed and loaded. For more info, see "
|
||||||
|
"the documentation:\nhttps://spacy.io/usage/models")
|
||||||
|
E1020 = ("No `epoch_resume` value specified and could not infer one from "
|
||||||
|
"filename. Specify an epoch to resume from.")
|
||||||
|
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
|
||||||
|
"Non-UD tags should use the `tag` property.")
|
||||||
|
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
|
||||||
|
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
|
||||||
|
"exist.")
|
||||||
|
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
|
||||||
|
"patterns.")
|
||||||
|
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||||
|
"supported values are: 'I', 'O', 'B' and ''")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -95,6 +95,7 @@ GLOSSARY = {
|
||||||
"XX": "unknown",
|
"XX": "unknown",
|
||||||
"BES": 'auxiliary "be"',
|
"BES": 'auxiliary "be"',
|
||||||
"HVS": 'forms of "have"',
|
"HVS": 'forms of "have"',
|
||||||
|
"_SP": "whitespace",
|
||||||
# POS Tags (German)
|
# POS Tags (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
|
|
33
spacy/kb.pyx
33
spacy/kb.pyx
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from typing import Iterator, Iterable
|
from typing import Iterator, Iterable, Callable, Dict, Any
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
@ -96,6 +96,8 @@ cdef class KnowledgeBase:
|
||||||
def initialize_entities(self, int64_t nr_entities):
|
def initialize_entities(self, int64_t nr_entities):
|
||||||
self._entry_index = PreshMap(nr_entities + 1)
|
self._entry_index = PreshMap(nr_entities + 1)
|
||||||
self._entries = entry_vec(nr_entities + 1)
|
self._entries = entry_vec(nr_entities + 1)
|
||||||
|
|
||||||
|
def initialize_vectors(self, int64_t nr_entities):
|
||||||
self._vectors_table = float_matrix(nr_entities + 1)
|
self._vectors_table = float_matrix(nr_entities + 1)
|
||||||
|
|
||||||
def initialize_aliases(self, int64_t nr_aliases):
|
def initialize_aliases(self, int64_t nr_aliases):
|
||||||
|
@ -122,7 +124,7 @@ cdef class KnowledgeBase:
|
||||||
def get_alias_strings(self):
|
def get_alias_strings(self):
|
||||||
return [self.vocab.strings[x] for x in self._alias_index]
|
return [self.vocab.strings[x] for x in self._alias_index]
|
||||||
|
|
||||||
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
|
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||||
"""
|
"""
|
||||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
||||||
Return the hash of the entity ID/name at the end.
|
Return the hash of the entity ID/name at the end.
|
||||||
|
@ -154,6 +156,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
nr_entities = len(set(entity_list))
|
nr_entities = len(set(entity_list))
|
||||||
self.initialize_entities(nr_entities)
|
self.initialize_entities(nr_entities)
|
||||||
|
self.initialize_vectors(nr_entities)
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
cdef KBEntryC entry
|
cdef KBEntryC entry
|
||||||
|
@ -172,8 +175,8 @@ cdef class KnowledgeBase:
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.freq = freq_list[i]
|
entry.freq = freq_list[i]
|
||||||
|
|
||||||
vector_index = self.c_add_vector(entity_vector=vector_list[i])
|
self._vectors_table[i] = entity_vector
|
||||||
entry.vector_index = vector_index
|
entry.vector_index = i
|
||||||
|
|
||||||
entry.feats_row = -1 # Features table currently not implemented
|
entry.feats_row = -1 # Features table currently not implemented
|
||||||
|
|
||||||
|
@ -182,15 +185,15 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
def contains_entity(self, unicode entity):
|
def contains_entity(self, str entity):
|
||||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||||
return entity_hash in self._entry_index
|
return entity_hash in self._entry_index
|
||||||
|
|
||||||
def contains_alias(self, unicode alias):
|
def contains_alias(self, str alias):
|
||||||
cdef hash_t alias_hash = self.vocab.strings.add(alias)
|
cdef hash_t alias_hash = self.vocab.strings.add(alias)
|
||||||
return alias_hash in self._alias_index
|
return alias_hash in self._alias_index
|
||||||
|
|
||||||
def add_alias(self, unicode alias, entities, probabilities):
|
def add_alias(self, str alias, entities, probabilities):
|
||||||
"""
|
"""
|
||||||
For a given alias, add its potential entities and prior probabilies to the KB.
|
For a given alias, add its potential entities and prior probabilies to the KB.
|
||||||
Return the alias_hash at the end
|
Return the alias_hash at the end
|
||||||
|
@ -236,7 +239,7 @@ cdef class KnowledgeBase:
|
||||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||||
return alias_hash
|
return alias_hash
|
||||||
|
|
||||||
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
|
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
||||||
"""
|
"""
|
||||||
For an alias already existing in the KB, extend its potential entities with one more.
|
For an alias already existing in the KB, extend its potential entities with one more.
|
||||||
Throw a warning if either the alias or the entity is unknown,
|
Throw a warning if either the alias or the entity is unknown,
|
||||||
|
@ -283,7 +286,7 @@ cdef class KnowledgeBase:
|
||||||
alias_entry.probs = probs
|
alias_entry.probs = probs
|
||||||
self._aliases_table[alias_index] = alias_entry
|
self._aliases_table[alias_index] = alias_entry
|
||||||
|
|
||||||
def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
|
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||||
and the prior probability of that alias resolving to that entity.
|
and the prior probability of that alias resolving to that entity.
|
||||||
|
@ -304,7 +307,7 @@ cdef class KnowledgeBase:
|
||||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||||
if entry_index != 0]
|
if entry_index != 0]
|
||||||
|
|
||||||
def get_vector(self, unicode entity):
|
def get_vector(self, str entity):
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
|
|
||||||
# Return an empty list if this entity is unknown in this KB
|
# Return an empty list if this entity is unknown in this KB
|
||||||
|
@ -314,7 +317,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
return self._vectors_table[self._entries[entry_index].vector_index]
|
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||||
|
|
||||||
def get_prior_prob(self, unicode entity, unicode alias):
|
def get_prior_prob(self, str entity, str alias):
|
||||||
""" Return the prior probability of a given alias being linked to a given entity,
|
""" Return the prior probability of a given alias being linked to a given entity,
|
||||||
or return 0.0 when this combination is not known in the knowledge base"""
|
or return 0.0 when this combination is not known in the knowledge base"""
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
|
@ -386,6 +389,7 @@ cdef class KnowledgeBase:
|
||||||
nr_aliases = header[1]
|
nr_aliases = header[1]
|
||||||
entity_vector_length = header[2]
|
entity_vector_length = header[2]
|
||||||
self.initialize_entities(nr_entities)
|
self.initialize_entities(nr_entities)
|
||||||
|
self.initialize_vectors(nr_entities)
|
||||||
self.initialize_aliases(nr_aliases)
|
self.initialize_aliases(nr_aliases)
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
|
|
||||||
|
@ -446,7 +450,7 @@ cdef class KnowledgeBase:
|
||||||
raise ValueError(Errors.E929.format(loc=path))
|
raise ValueError(Errors.E929.format(loc=path))
|
||||||
if not path.is_dir():
|
if not path.is_dir():
|
||||||
raise ValueError(Errors.E928.format(loc=path))
|
raise ValueError(Errors.E928.format(loc=path))
|
||||||
deserialize = {}
|
deserialize: Dict[str, Callable[[Any], Any]] = {}
|
||||||
deserialize["contents"] = lambda p: self.read_contents(p)
|
deserialize["contents"] = lambda p: self.read_contents(p)
|
||||||
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
|
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
@ -509,6 +513,7 @@ cdef class KnowledgeBase:
|
||||||
reader.read_header(&nr_entities, &entity_vector_length)
|
reader.read_header(&nr_entities, &entity_vector_length)
|
||||||
|
|
||||||
self.initialize_entities(nr_entities)
|
self.initialize_entities(nr_entities)
|
||||||
|
self.initialize_vectors(nr_entities)
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
|
|
||||||
# STEP 1: load entity vectors
|
# STEP 1: load entity vectors
|
||||||
|
@ -582,7 +587,7 @@ cdef class Writer:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
assert isinstance(path, Path)
|
assert isinstance(path, Path)
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
raise IOError(Errors.E146.format(path=path))
|
raise IOError(Errors.E146.format(path=path))
|
||||||
|
@ -624,7 +629,7 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
PyErr_SetFromErrno(IOError)
|
PyErr_SetFromErrno(IOError)
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class AfrikaansDefaults(Language.Defaults):
|
class AfrikaansDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class AmharicDefaults(Language.Defaults):
|
class AmharicDefaults(BaseDefaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: "am"
|
lex_attr_getters[LANG] = lambda text: "am"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
|
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
_list_punct
|
_list_punct
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class ArabicDefaults(Language.Defaults):
|
class ArabicDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class AzerbaijaniDefaults(Language.Defaults):
|
class AzerbaijaniDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -3,12 +3,12 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class BulgarianDefaults(Language.Defaults):
|
class BulgarianDefaults(BaseDefaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: "bg"
|
lex_attr_getters[LANG] = lambda text: "bg"
|
||||||
|
|
||||||
|
|
|
@ -1,265 +1,79 @@
|
||||||
# Source: https://github.com/Alir3z4/stop-words
|
"""
|
||||||
|
References:
|
||||||
|
https://github.com/Alir3z4/stop-words - Original list, serves as a base.
|
||||||
|
https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
|
||||||
|
"""
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
а
|
а автентичен аз ако ала
|
||||||
автентичен
|
|
||||||
аз
|
бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
|
||||||
ако
|
бъде бъда бяха
|
||||||
ала
|
|
||||||
бе
|
в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
|
||||||
без
|
всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
|
||||||
беше
|
вътре веднъж
|
||||||
би
|
|
||||||
бивш
|
г ги главен главна главно глас го годно година години годишен
|
||||||
бивша
|
|
||||||
бившо
|
д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
|
||||||
бил
|
добро добър достатъчно докато докога дори досега доста друг друга другаде други
|
||||||
била
|
|
||||||
били
|
е евтин едва един една еднаква еднакви еднакъв едно екип ето
|
||||||
било
|
|
||||||
благодаря
|
живот жив
|
||||||
близо
|
|
||||||
бъдат
|
за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
|
||||||
бъде
|
затова запазва започвам защо защото завинаги
|
||||||
бяха
|
|
||||||
в
|
и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
|
||||||
вас
|
извън имайки
|
||||||
ваш
|
|
||||||
ваша
|
й йо
|
||||||
вероятно
|
|
||||||
вече
|
каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
|
||||||
взема
|
когато когото което които кой който колко която къде където към край кратък
|
||||||
ви
|
кръгъл
|
||||||
вие
|
|
||||||
винаги
|
лесен лесно ли летя летиш летим лош
|
||||||
внимава
|
|
||||||
време
|
м май малко макар малцина междувременно минус ме между мек мен месец ми мис
|
||||||
все
|
мисля много мнозина мога могат може мой можем мокър моля момента му
|
||||||
всеки
|
|
||||||
всички
|
н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
|
||||||
всичко
|
например наопаки наполовина напоследък нека независимо нас насам наскоро
|
||||||
всяка
|
настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
|
||||||
във
|
нови новина някои някой някога някъде няколко няма
|
||||||
въпреки
|
|
||||||
върху
|
о обаче около описан опитах опитва опитвайки опитвам определен определено освен
|
||||||
г
|
обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
|
||||||
ги
|
отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
|
||||||
главен
|
относно още
|
||||||
главна
|
|
||||||
главно
|
п пак по повече повечето под поне просто пряко поради после последен последно
|
||||||
глас
|
посочен почти прави прав прави правя пред преди през при пък първата първи първо
|
||||||
го
|
път пъти плюс
|
||||||
година
|
|
||||||
години
|
равен равна различен различни разумен разумно
|
||||||
годишен
|
|
||||||
д
|
с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
|
||||||
да
|
следващ следващия следва следното следователно случва сме смях собствен
|
||||||
дали
|
сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
|
||||||
два
|
съответен съответно сте съм със също
|
||||||
двама
|
|
||||||
двамата
|
т така техен техни такива такъв твърде там трета твой те тези ти то това
|
||||||
две
|
тогава този той търси толкова точно три трябва тук тъй тя тях
|
||||||
двете
|
|
||||||
ден
|
у утре ужасно употреба успоредно уточнен уточняване
|
||||||
днес
|
|
||||||
дни
|
харесва харесали хиляди
|
||||||
до
|
|
||||||
добра
|
ч часа ценя цяло цялостен че често чрез чудя
|
||||||
добре
|
|
||||||
добро
|
ще щеше щом щяха
|
||||||
добър
|
|
||||||
докато
|
|
||||||
докога
|
|
||||||
дори
|
|
||||||
досега
|
|
||||||
доста
|
|
||||||
друг
|
|
||||||
друга
|
|
||||||
други
|
|
||||||
е
|
|
||||||
евтин
|
|
||||||
едва
|
|
||||||
един
|
|
||||||
една
|
|
||||||
еднаква
|
|
||||||
еднакви
|
|
||||||
еднакъв
|
|
||||||
едно
|
|
||||||
екип
|
|
||||||
ето
|
|
||||||
живот
|
|
||||||
за
|
|
||||||
забавям
|
|
||||||
зад
|
|
||||||
заедно
|
|
||||||
заради
|
|
||||||
засега
|
|
||||||
заспал
|
|
||||||
затова
|
|
||||||
защо
|
|
||||||
защото
|
|
||||||
и
|
|
||||||
из
|
|
||||||
или
|
|
||||||
им
|
|
||||||
има
|
|
||||||
имат
|
|
||||||
иска
|
|
||||||
й
|
|
||||||
каза
|
|
||||||
как
|
|
||||||
каква
|
|
||||||
какво
|
|
||||||
както
|
|
||||||
какъв
|
|
||||||
като
|
|
||||||
кога
|
|
||||||
когато
|
|
||||||
което
|
|
||||||
които
|
|
||||||
кой
|
|
||||||
който
|
|
||||||
колко
|
|
||||||
която
|
|
||||||
къде
|
|
||||||
където
|
|
||||||
към
|
|
||||||
лесен
|
|
||||||
лесно
|
|
||||||
ли
|
|
||||||
лош
|
|
||||||
м
|
|
||||||
май
|
|
||||||
малко
|
|
||||||
ме
|
|
||||||
между
|
|
||||||
мек
|
|
||||||
мен
|
|
||||||
месец
|
|
||||||
ми
|
|
||||||
много
|
|
||||||
мнозина
|
|
||||||
мога
|
|
||||||
могат
|
|
||||||
може
|
|
||||||
мокър
|
|
||||||
моля
|
|
||||||
момента
|
|
||||||
му
|
|
||||||
н
|
|
||||||
на
|
|
||||||
над
|
|
||||||
назад
|
|
||||||
най
|
|
||||||
направи
|
|
||||||
напред
|
|
||||||
например
|
|
||||||
нас
|
|
||||||
не
|
|
||||||
него
|
|
||||||
нещо
|
|
||||||
нея
|
|
||||||
ни
|
|
||||||
ние
|
|
||||||
никой
|
|
||||||
нито
|
|
||||||
нищо
|
|
||||||
но
|
|
||||||
нов
|
|
||||||
нова
|
|
||||||
нови
|
|
||||||
новина
|
|
||||||
някои
|
|
||||||
някой
|
|
||||||
няколко
|
|
||||||
няма
|
|
||||||
обаче
|
|
||||||
около
|
|
||||||
освен
|
|
||||||
особено
|
|
||||||
от
|
|
||||||
отгоре
|
|
||||||
отново
|
|
||||||
още
|
|
||||||
пак
|
|
||||||
по
|
|
||||||
повече
|
|
||||||
повечето
|
|
||||||
под
|
|
||||||
поне
|
|
||||||
поради
|
|
||||||
после
|
|
||||||
почти
|
|
||||||
прави
|
|
||||||
пред
|
|
||||||
преди
|
|
||||||
през
|
|
||||||
при
|
|
||||||
пък
|
|
||||||
първата
|
|
||||||
първи
|
|
||||||
първо
|
|
||||||
пъти
|
|
||||||
равен
|
|
||||||
равна
|
|
||||||
с
|
|
||||||
са
|
|
||||||
сам
|
|
||||||
само
|
|
||||||
се
|
|
||||||
сега
|
|
||||||
си
|
|
||||||
син
|
|
||||||
скоро
|
|
||||||
след
|
|
||||||
следващ
|
|
||||||
сме
|
|
||||||
смях
|
|
||||||
според
|
|
||||||
сред
|
|
||||||
срещу
|
|
||||||
сте
|
|
||||||
съм
|
|
||||||
със
|
|
||||||
също
|
|
||||||
т
|
|
||||||
тази
|
|
||||||
така
|
|
||||||
такива
|
|
||||||
такъв
|
|
||||||
там
|
|
||||||
твой
|
|
||||||
те
|
|
||||||
тези
|
|
||||||
ти
|
|
||||||
т.н.
|
|
||||||
то
|
|
||||||
това
|
|
||||||
тогава
|
|
||||||
този
|
|
||||||
той
|
|
||||||
толкова
|
|
||||||
точно
|
|
||||||
три
|
|
||||||
трябва
|
|
||||||
тук
|
|
||||||
тъй
|
|
||||||
тя
|
|
||||||
тях
|
|
||||||
у
|
|
||||||
утре
|
|
||||||
харесва
|
|
||||||
хиляди
|
|
||||||
ч
|
|
||||||
часа
|
|
||||||
че
|
|
||||||
често
|
|
||||||
чрез
|
|
||||||
ще
|
|
||||||
щом
|
|
||||||
юмрук
|
юмрук
|
||||||
я
|
|
||||||
як
|
я як
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,10 +1,16 @@
|
||||||
|
"""
|
||||||
|
References:
|
||||||
|
https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
|
||||||
|
(countries, occupations, fields of studies and more).
|
||||||
|
"""
|
||||||
|
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
# measurements
|
||||||
_abbr_exc = [
|
for abbr in [
|
||||||
{ORTH: "м", NORM: "метър"},
|
{ORTH: "м", NORM: "метър"},
|
||||||
{ORTH: "мм", NORM: "милиметър"},
|
{ORTH: "мм", NORM: "милиметър"},
|
||||||
{ORTH: "см", NORM: "сантиметър"},
|
{ORTH: "см", NORM: "сантиметър"},
|
||||||
|
@ -17,51 +23,191 @@ _abbr_exc = [
|
||||||
{ORTH: "хл", NORM: "хектолиър"},
|
{ORTH: "хл", NORM: "хектолиър"},
|
||||||
{ORTH: "дкл", NORM: "декалитър"},
|
{ORTH: "дкл", NORM: "декалитър"},
|
||||||
{ORTH: "л", NORM: "литър"},
|
{ORTH: "л", NORM: "литър"},
|
||||||
]
|
]:
|
||||||
for abbr in _abbr_exc:
|
|
||||||
_exc[abbr[ORTH]] = [abbr]
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
_abbr_line_exc = [
|
# line abbreviations
|
||||||
|
for abbr in [
|
||||||
{ORTH: "г-жа", NORM: "госпожа"},
|
{ORTH: "г-жа", NORM: "госпожа"},
|
||||||
{ORTH: "г-н", NORM: "господин"},
|
{ORTH: "г-н", NORM: "господин"},
|
||||||
{ORTH: "г-ца", NORM: "госпожица"},
|
{ORTH: "г-ца", NORM: "госпожица"},
|
||||||
{ORTH: "д-р", NORM: "доктор"},
|
{ORTH: "д-р", NORM: "доктор"},
|
||||||
{ORTH: "о-в", NORM: "остров"},
|
{ORTH: "о-в", NORM: "остров"},
|
||||||
{ORTH: "п-в", NORM: "полуостров"},
|
{ORTH: "п-в", NORM: "полуостров"},
|
||||||
]
|
{ORTH: "с-у", NORM: "срещу"},
|
||||||
|
{ORTH: "в-у", NORM: "върху"},
|
||||||
for abbr in _abbr_line_exc:
|
{ORTH: "м-у", NORM: "между"},
|
||||||
|
]:
|
||||||
_exc[abbr[ORTH]] = [abbr]
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
_abbr_dot_exc = [
|
# foreign language related abbreviations
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "англ.", NORM: "английски"},
|
||||||
|
{ORTH: "ан.", NORM: "английски термин"},
|
||||||
|
{ORTH: "араб.", NORM: "арабски"},
|
||||||
|
{ORTH: "афр.", NORM: "африкански"},
|
||||||
|
{ORTH: "гр.", NORM: "гръцки"},
|
||||||
|
{ORTH: "лат.", NORM: "латински"},
|
||||||
|
{ORTH: "рим.", NORM: "римски"},
|
||||||
|
{ORTH: "старогр.", NORM: "старогръцки"},
|
||||||
|
{ORTH: "староевр.", NORM: "староеврейски"},
|
||||||
|
{ORTH: "фр.", NORM: "френски"},
|
||||||
|
{ORTH: "хол.", NORM: "холандски"},
|
||||||
|
{ORTH: "швед.", NORM: "шведски"},
|
||||||
|
{ORTH: "шотл.", NORM: "шотландски"},
|
||||||
|
{ORTH: "яп.", NORM: "японски"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# profession and academic titles abbreviations
|
||||||
|
for abbr in [
|
||||||
{ORTH: "акад.", NORM: "академик"},
|
{ORTH: "акад.", NORM: "академик"},
|
||||||
{ORTH: "ал.", NORM: "алинея"},
|
|
||||||
{ORTH: "арх.", NORM: "архитект"},
|
{ORTH: "арх.", NORM: "архитект"},
|
||||||
|
{ORTH: "инж.", NORM: "инженер"},
|
||||||
|
{ORTH: "канц.", NORM: "канцлер"},
|
||||||
|
{ORTH: "проф.", NORM: "професор"},
|
||||||
|
{ORTH: "св.", NORM: "свети"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# fields of studies
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "агр.", NORM: "агрономия"},
|
||||||
|
{ORTH: "ав.", NORM: "авиация"},
|
||||||
|
{ORTH: "агр.", NORM: "агрономия"},
|
||||||
|
{ORTH: "археол.", NORM: "археология"},
|
||||||
|
{ORTH: "астр.", NORM: "астрономия"},
|
||||||
|
{ORTH: "геод.", NORM: "геодезия"},
|
||||||
|
{ORTH: "геол.", NORM: "геология"},
|
||||||
|
{ORTH: "геом.", NORM: "геометрия"},
|
||||||
|
{ORTH: "гимн.", NORM: "гимнастика"},
|
||||||
|
{ORTH: "грам.", NORM: "граматика"},
|
||||||
|
{ORTH: "жур.", NORM: "журналистика"},
|
||||||
|
{ORTH: "журн.", NORM: "журналистика"},
|
||||||
|
{ORTH: "зем.", NORM: "земеделие"},
|
||||||
|
{ORTH: "икон.", NORM: "икономика"},
|
||||||
|
{ORTH: "лит.", NORM: "литература"},
|
||||||
|
{ORTH: "мат.", NORM: "математика"},
|
||||||
|
{ORTH: "мед.", NORM: "медицина"},
|
||||||
|
{ORTH: "муз.", NORM: "музика"},
|
||||||
|
{ORTH: "печ.", NORM: "печатарство"},
|
||||||
|
{ORTH: "пол.", NORM: "политика"},
|
||||||
|
{ORTH: "псих.", NORM: "психология"},
|
||||||
|
{ORTH: "соц.", NORM: "социология"},
|
||||||
|
{ORTH: "стат.", NORM: "статистика"},
|
||||||
|
{ORTH: "стил.", NORM: "стилистика"},
|
||||||
|
{ORTH: "топогр.", NORM: "топография"},
|
||||||
|
{ORTH: "търг.", NORM: "търговия"},
|
||||||
|
{ORTH: "фарм.", NORM: "фармацевтика"},
|
||||||
|
{ORTH: "фехт.", NORM: "фехтовка"},
|
||||||
|
{ORTH: "физиол.", NORM: "физиология"},
|
||||||
|
{ORTH: "физ.", NORM: "физика"},
|
||||||
|
{ORTH: "фил.", NORM: "философия"},
|
||||||
|
{ORTH: "фин.", NORM: "финанси"},
|
||||||
|
{ORTH: "фолкл.", NORM: "фолклор"},
|
||||||
|
{ORTH: "фон.", NORM: "фонетика"},
|
||||||
|
{ORTH: "фот.", NORM: "фотография"},
|
||||||
|
{ORTH: "футб.", NORM: "футбол"},
|
||||||
|
{ORTH: "хим.", NORM: "химия"},
|
||||||
|
{ORTH: "хир.", NORM: "хирургия"},
|
||||||
|
{ORTH: "ел.", NORM: "електротехника"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "ал.", NORM: "алинея"},
|
||||||
|
{ORTH: "авт.", NORM: "автоматично"},
|
||||||
|
{ORTH: "адм.", NORM: "администрация"},
|
||||||
|
{ORTH: "арт.", NORM: "артилерия"},
|
||||||
{ORTH: "бл.", NORM: "блок"},
|
{ORTH: "бл.", NORM: "блок"},
|
||||||
{ORTH: "бр.", NORM: "брой"},
|
{ORTH: "бр.", NORM: "брой"},
|
||||||
{ORTH: "бул.", NORM: "булевард"},
|
{ORTH: "бул.", NORM: "булевард"},
|
||||||
|
{ORTH: "букв.", NORM: "буквално"},
|
||||||
{ORTH: "в.", NORM: "век"},
|
{ORTH: "в.", NORM: "век"},
|
||||||
|
{ORTH: "вр.", NORM: "време"},
|
||||||
|
{ORTH: "вм.", NORM: "вместо"},
|
||||||
|
{ORTH: "воен.", NORM: "военен термин"},
|
||||||
{ORTH: "г.", NORM: "година"},
|
{ORTH: "г.", NORM: "година"},
|
||||||
{ORTH: "гр.", NORM: "град"},
|
{ORTH: "гр.", NORM: "град"},
|
||||||
|
{ORTH: "гл.", NORM: "глагол"},
|
||||||
|
{ORTH: "др.", NORM: "други"},
|
||||||
|
{ORTH: "ез.", NORM: "езеро"},
|
||||||
{ORTH: "ж.р.", NORM: "женски род"},
|
{ORTH: "ж.р.", NORM: "женски род"},
|
||||||
{ORTH: "инж.", NORM: "инженер"},
|
{ORTH: "жп.", NORM: "железопът"},
|
||||||
|
{ORTH: "застр.", NORM: "застрахователно дело"},
|
||||||
|
{ORTH: "знач.", NORM: "значение"},
|
||||||
|
{ORTH: "и др.", NORM: "и други"},
|
||||||
|
{ORTH: "и под.", NORM: "и подобни"},
|
||||||
|
{ORTH: "и пр.", NORM: "и прочие"},
|
||||||
|
{ORTH: "изр.", NORM: "изречение"},
|
||||||
|
{ORTH: "изт.", NORM: "източен"},
|
||||||
|
{ORTH: "конкр.", NORM: "конкретно"},
|
||||||
{ORTH: "лв.", NORM: "лев"},
|
{ORTH: "лв.", NORM: "лев"},
|
||||||
|
{ORTH: "л.", NORM: "лице"},
|
||||||
{ORTH: "м.р.", NORM: "мъжки род"},
|
{ORTH: "м.р.", NORM: "мъжки род"},
|
||||||
{ORTH: "мат.", NORM: "математика"},
|
{ORTH: "мин.вр.", NORM: "минало време"},
|
||||||
{ORTH: "мед.", NORM: "медицина"},
|
{ORTH: "мн.ч.", NORM: "множествено число"},
|
||||||
|
{ORTH: "напр.", NORM: "например"},
|
||||||
|
{ORTH: "нар.", NORM: "наречие"},
|
||||||
|
{ORTH: "науч.", NORM: "научен термин"},
|
||||||
|
{ORTH: "непр.", NORM: "неправилно"},
|
||||||
|
{ORTH: "обик.", NORM: "обикновено"},
|
||||||
|
{ORTH: "опред.", NORM: "определение"},
|
||||||
|
{ORTH: "особ.", NORM: "особено"},
|
||||||
|
{ORTH: "ост.", NORM: "остаряло"},
|
||||||
|
{ORTH: "относ.", NORM: "относително"},
|
||||||
|
{ORTH: "отр.", NORM: "отрицателно"},
|
||||||
{ORTH: "пл.", NORM: "площад"},
|
{ORTH: "пл.", NORM: "площад"},
|
||||||
{ORTH: "проф.", NORM: "професор"},
|
{ORTH: "пад.", NORM: "падеж"},
|
||||||
|
{ORTH: "парл.", NORM: "парламентарен"},
|
||||||
|
{ORTH: "погов.", NORM: "поговорка"},
|
||||||
|
{ORTH: "пон.", NORM: "понякога"},
|
||||||
|
{ORTH: "правосл.", NORM: "православен"},
|
||||||
|
{ORTH: "прибл.", NORM: "приблизително"},
|
||||||
|
{ORTH: "прил.", NORM: "прилагателно име"},
|
||||||
|
{ORTH: "пр.", NORM: "прочие"},
|
||||||
{ORTH: "с.", NORM: "село"},
|
{ORTH: "с.", NORM: "село"},
|
||||||
{ORTH: "с.р.", NORM: "среден род"},
|
{ORTH: "с.р.", NORM: "среден род"},
|
||||||
{ORTH: "св.", NORM: "свети"},
|
|
||||||
{ORTH: "сп.", NORM: "списание"},
|
{ORTH: "сп.", NORM: "списание"},
|
||||||
{ORTH: "стр.", NORM: "страница"},
|
{ORTH: "стр.", NORM: "страница"},
|
||||||
|
{ORTH: "сз.", NORM: "съюз"},
|
||||||
|
{ORTH: "сег.", NORM: "сегашно"},
|
||||||
|
{ORTH: "сп.", NORM: "спорт"},
|
||||||
|
{ORTH: "срв.", NORM: "сравни"},
|
||||||
|
{ORTH: "с.ст.", NORM: "селскостопанска техника"},
|
||||||
|
{ORTH: "счет.", NORM: "счетоводство"},
|
||||||
|
{ORTH: "съкр.", NORM: "съкратено"},
|
||||||
|
{ORTH: "съобщ.", NORM: "съобщение"},
|
||||||
|
{ORTH: "същ.", NORM: "съществително"},
|
||||||
|
{ORTH: "текст.", NORM: "текстилен"},
|
||||||
|
{ORTH: "телев.", NORM: "телевизия"},
|
||||||
|
{ORTH: "тел.", NORM: "телефон"},
|
||||||
|
{ORTH: "т.е.", NORM: "тоест"},
|
||||||
|
{ORTH: "т.н.", NORM: "така нататък"},
|
||||||
|
{ORTH: "т.нар.", NORM: "така наречен"},
|
||||||
|
{ORTH: "търж.", NORM: "тържествено"},
|
||||||
{ORTH: "ул.", NORM: "улица"},
|
{ORTH: "ул.", NORM: "улица"},
|
||||||
|
{ORTH: "уч.", NORM: "училище"},
|
||||||
|
{ORTH: "унив.", NORM: "университет"},
|
||||||
|
{ORTH: "харт.", NORM: "хартия"},
|
||||||
|
{ORTH: "хидр.", NORM: "хидравлика"},
|
||||||
|
{ORTH: "хран.", NORM: "хранителна"},
|
||||||
|
{ORTH: "църк.", NORM: "църковен термин"},
|
||||||
|
{ORTH: "числ.", NORM: "числително"},
|
||||||
{ORTH: "чл.", NORM: "член"},
|
{ORTH: "чл.", NORM: "член"},
|
||||||
]
|
{ORTH: "ч.", NORM: "число"},
|
||||||
|
{ORTH: "числ.", NORM: "числително"},
|
||||||
for abbr in _abbr_dot_exc:
|
{ORTH: "шахм.", NORM: "шахмат"},
|
||||||
|
{ORTH: "шах.", NORM: "шахмат"},
|
||||||
|
{ORTH: "юр.", NORM: "юридически"},
|
||||||
|
]:
|
||||||
_exc[abbr[ORTH]] = [abbr]
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# slash abbreviations
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "м/у", NORM: "между"},
|
||||||
|
{ORTH: "с/у", NORM: "срещу"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
class BengaliDefaults(Language.Defaults):
|
class BengaliDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
@ -23,13 +23,25 @@ class Bengali(Language):
|
||||||
@Bengali.factory(
|
@Bengali.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return Lemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Bengali"]
|
__all__ = ["Bengali"]
|
||||||
|
|
27
spacy/lang/ca/__init__.py
Normal file → Executable file
27
spacy/lang/ca/__init__.py
Normal file → Executable file
|
@ -1,20 +1,21 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from .lemmatizer import CatalanLemmatizer
|
from .lemmatizer import CatalanLemmatizer
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(Language.Defaults):
|
class CatalanDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
@ -28,13 +29,25 @@ class Catalan(Language):
|
||||||
@Catalan.factory(
|
@Catalan.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return CatalanLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Catalan"]
|
__all__ = ["Catalan"]
|
||||||
|
|
|
@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
|
||||||
forms.append(self.lookup_lemmatize(token)[0])
|
forms.append(self.lookup_lemmatize(token)[0])
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(string)
|
forms.append(string)
|
||||||
forms = list(set(forms))
|
forms = list(dict.fromkeys(forms))
|
||||||
self.cache[cache_key] = forms
|
self.cache[cache_key] = forms
|
||||||
return forms
|
return forms
|
||||||
|
|
11
spacy/lang/ca/punctuation.py
Normal file → Executable file
11
spacy/lang/ca/punctuation.py
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||||
|
from ..char_classes import LIST_CURRENCY
|
||||||
from ..char_classes import CURRENCY
|
from ..char_classes import CURRENCY
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||||
from ..char_classes import merge_chars, _units
|
from ..char_classes import merge_chars, _units
|
||||||
|
@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
_prefixes = (
|
||||||
|
["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
|
||||||
|
+ LIST_PUNCT
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ LIST_QUOTES
|
||||||
|
+ LIST_CURRENCY
|
||||||
|
+ LIST_ICONS
|
||||||
|
)
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
LIST_ELLIPSES
|
LIST_ELLIPSES
|
||||||
|
@ -18,6 +27,7 @@ _infixes = (
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
|
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
|
||||||
|
r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -44,3 +54,4 @@ _suffixes = (
|
||||||
|
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
from typing import Union, Iterator, Tuple
|
||||||
|
from ...tokens import Doc, Span
|
||||||
from ...symbols import NOUN, PROPN
|
from ...symbols import NOUN, PROPN
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike):
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||||
|
|
21
spacy/lang/ca/tokenizer_exceptions.py
Normal file → Executable file
21
spacy/lang/ca/tokenizer_exceptions.py
Normal file → Executable file
|
@ -18,12 +18,21 @@ for exc_data in [
|
||||||
{ORTH: "nov.", NORM: "novembre"},
|
{ORTH: "nov.", NORM: "novembre"},
|
||||||
{ORTH: "dec.", NORM: "desembre"},
|
{ORTH: "dec.", NORM: "desembre"},
|
||||||
{ORTH: "Dr.", NORM: "doctor"},
|
{ORTH: "Dr.", NORM: "doctor"},
|
||||||
|
{ORTH: "Dra.", NORM: "doctora"},
|
||||||
{ORTH: "Sr.", NORM: "senyor"},
|
{ORTH: "Sr.", NORM: "senyor"},
|
||||||
{ORTH: "Sra.", NORM: "senyora"},
|
{ORTH: "Sra.", NORM: "senyora"},
|
||||||
{ORTH: "Srta.", NORM: "senyoreta"},
|
{ORTH: "Srta.", NORM: "senyoreta"},
|
||||||
{ORTH: "núm", NORM: "número"},
|
{ORTH: "núm", NORM: "número"},
|
||||||
{ORTH: "St.", NORM: "sant"},
|
{ORTH: "St.", NORM: "sant"},
|
||||||
{ORTH: "Sta.", NORM: "santa"},
|
{ORTH: "Sta.", NORM: "santa"},
|
||||||
|
{ORTH: "pl.", NORM: "plaça"},
|
||||||
|
{ORTH: "à."},
|
||||||
|
{ORTH: "è."},
|
||||||
|
{ORTH: "é."},
|
||||||
|
{ORTH: "í."},
|
||||||
|
{ORTH: "ò."},
|
||||||
|
{ORTH: "ó."},
|
||||||
|
{ORTH: "ú."},
|
||||||
{ORTH: "'l"},
|
{ORTH: "'l"},
|
||||||
{ORTH: "'ls"},
|
{ORTH: "'ls"},
|
||||||
{ORTH: "'m"},
|
{ORTH: "'m"},
|
||||||
|
@ -34,6 +43,18 @@ for exc_data in [
|
||||||
]:
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
|
||||||
|
_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
|
||||||
|
|
||||||
|
_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
|
||||||
|
_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
|
||||||
|
|
||||||
|
_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
|
||||||
|
_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
|
||||||
|
|
||||||
|
_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
|
||||||
|
|
||||||
|
|
||||||
# Times
|
# Times
|
||||||
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
|
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(Language.Defaults):
|
class CzechDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class DanishDefaults(Language.Defaults):
|
class DanishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
from typing import Union, Iterator, Tuple
|
||||||
|
from ...tokens import Doc, Span
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike):
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
def is_verb_token(tok):
|
def is_verb_token(tok):
|
||||||
return tok.pos in [VERB, AUX]
|
return tok.pos in [VERB, AUX]
|
||||||
|
|
||||||
|
@ -32,7 +34,7 @@ def noun_chunks(doclike):
|
||||||
def get_bounds(doc, root):
|
def get_bounds(doc, root):
|
||||||
return get_left_bound(doc, root), get_right_bound(doc, root)
|
return get_left_bound(doc, root), get_right_bound(doc, root)
|
||||||
|
|
||||||
doc = doclike.doc
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.has_annotation("DEP"):
|
if not doc.has_annotation("DEP"):
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class GermanDefaults(Language.Defaults):
|
class GermanDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# this iterator extracts spans headed by NOUNs starting from the left-most
|
# this iterator extracts spans headed by NOUNs starting from the left-most
|
||||||
# syntactic dependent until the NOUN itself for close apposition and
|
# syntactic dependent until the NOUN itself for close apposition and
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .lemmatizer import GreekLemmatizer
|
from .lemmatizer import GreekLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class GreekDefaults(Language.Defaults):
|
class GreekDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
@ -28,13 +28,25 @@ class Greek(Language):
|
||||||
@Greek.factory(
|
@Greek.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return GreekLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Greek"]
|
__all__ = ["Greek"]
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# It follows the logic of the noun chunks finder of English language,
|
# It follows the logic of the noun chunks finder of English language,
|
||||||
# adjusted to some Greek language special characteristics.
|
# adjusted to some Greek language special characteristics.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .lemmatizer import EnglishLemmatizer
|
from .lemmatizer import EnglishLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
@ -26,13 +26,25 @@ class English(Language):
|
||||||
@English.factory(
|
@English.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return EnglishLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["English"]
|
__all__ = ["English"]
|
||||||
|
|
|
@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
|
||||||
Check whether we're dealing with an uninflected paradigm, so we can
|
Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
avoid lemmatization entirely.
|
avoid lemmatization entirely.
|
||||||
|
|
||||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||||
morphology (dict): The token's morphological features following the
|
morphology (dict): The token's morphological features following the
|
||||||
Universal Dependencies scheme.
|
Universal Dependencies scheme.
|
||||||
"""
|
"""
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user