Mirror of https://github.com/explosion/spaCy.git (synced 2025-04-28 04:43:42 +03:00)

Merge branch 'develop' into pr/6253

This commit is contained in: commit d94e241fce
106  .github/contributors/Stannislav.md  vendored  Normal file

@@ -0,0 +1,106 @@

# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Stanislav Schmidt |
| Company name (if applicable) | Blue Brain Project |
| Title or role (if applicable) | ML Engineer |
| Date | 2020-10-02 |
| GitHub username | Stannislav |
| Website (optional) | |
106  .github/contributors/delzac.md  vendored  Normal file

@@ -0,0 +1,106 @@

# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Matthew Chin |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2020-09-22 |
| GitHub username | delzac |
| Website (optional) | |
106  .github/contributors/florijanstamenkovic.md  vendored  Normal file

@@ -0,0 +1,106 @@

# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Florijan Stamenkovic |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2020-10-05 |
| GitHub username | florijanstamenkovic |
| Website (optional) | |
106  .github/contributors/rasyidf.md  vendored  Normal file

@@ -0,0 +1,106 @@

# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | ------------------------ |
| Name | Muhammad Fahmi Rasyid |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2020-09-23 |
| GitHub username | rasyidf |
| Website (optional) | http://rasyidf.github.io |
106  .github/contributors/zaibacu.md  vendored  Normal file

@@ -0,0 +1,106 @@

# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Šarūnas Navickas |
| Company name (if applicable) | TokenMill |
| Title or role (if applicable) | Data Engineer |
| Date | 2020-09-24 |
| GitHub username | zaibacu |
| Website (optional) | |
@@ -224,7 +224,7 @@ for that particular code. Here's an example:

 ```python
 # fmt: off
 text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
-heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
+heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
 deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
         "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
         "poss", "nsubj", "ccomp", "punct"]

@@ -421,7 +421,7 @@ Tests that require the model to be loaded should be marked with

 `@pytest.mark.models`. Loading the models is expensive and not necessary if
 you're not actually testing the model performance. If all you need is a `Doc`
 object with annotations like heads, POS tags or the dependency parse, you can
-use the `get_doc()` utility function to construct it manually.
+use the `Doc` constructor to construct it manually.

 📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
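The two hunks above switch the example `heads` from relative offsets to absolute token indices and point test authors at the `Doc` constructor instead of a `get_doc()` helper. As a rough illustration (not part of the diff; the sentence, labels and variable names below are invented, and the keyword arguments assume spaCy v3's `Doc` constructor), a test `Doc` could be built directly like this:

```python
# Illustrative sketch only: building a Doc for a test without loading a model,
# using absolute head indices as in the updated example above.
import spacy
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab          # empty English vocab, no model needed
words = ["I", "like", "green", "eggs"]   # invented example tokens
heads = [1, 1, 3, 1]                     # absolute index of each token's head
deps = ["nsubj", "ROOT", "amod", "dobj"]

doc = Doc(vocab, words=words, heads=heads, deps=deps)
for token in doc:
    # Each token's head is resolved from the absolute index given above.
    print(token.text, token.dep_, token.head.i)
```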
2  Makefile

@@ -1,7 +1,7 @@

 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
 endif

 ifndef PYVER
32  README.md

@@ -8,12 +8,12 @@ be used in real products.

 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and vectors, and
-currently supports tokenization for **59+ languages**. It features
+currently supports tokenization for **60+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.

-💫 **Version 2.3 out now!**
+💫 **Version 3.0 (nightly) out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

@@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license.

 ## 📖 Documentation

 | Documentation | |
-| --------------- | -------------------------------------------------------------- |
+| ------------------- | -------------------------------------------------------------- |
 | [spaCy 101] | New to spaCy? Here's everything you need to know! |
 | [Usage Guides] | How to use spaCy and its features. |
 | [New in v3.0] | New features, backwards incompatibilities and migration guide. |
+| [Project Templates] | End-to-end workflows you can clone, modify and run. |
 | [API Reference] | The detailed reference for spaCy's API. |
 | [Models] | Download statistical language models for spaCy. |
 | [Universe] | Libraries, extensions, demos, books and courses. |
 | [Changelog] | Changes and version history. |
 | [Contribute] | How to contribute to the spaCy project and code base. |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.

 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

@@ -69,7 +71,7 @@ it.

 ## Features

-- Support for **59+ languages**
+- Support for **60+ languages**
 - **Trained pipelines**
 - Multi-task learning with pretrained **transformers** like BERT
 - Pretrained **word vectors**

@@ -102,9 +104,11 @@ For detailed installation instructions, see the

 ### pip

 Using pip, spaCy releases are available as source packages and binary wheels (as
-of `v2.0.13`).
+of `v2.0.13`). Before you install spaCy and its dependencies, make sure that
+your `pip`, `setuptools` and `wheel` are up to date.

 ```bash
+pip install -U pip setuptools wheel
 pip install spacy
 ```
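Not part of the README diff: once the install above has finished, a blank pipeline can be used to sanity-check it without downloading any trained models. This sketch assumes `spacy.blank` as in current spaCy, and the example sentence is arbitrary:

```python
# Minimal post-install smoke test (illustrative only).
# A blank pipeline only tokenizes, so no model download is required.
import spacy

nlp = spacy.blank("en")               # blank English pipeline: tokenizer only
doc = nlp("spaCy installed fine.")
print([token.text for token in doc])  # ['spaCy', 'installed', 'fine', '.']
```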
@@ -1,133 +0,0 @@

[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null

[system]
seed = 0
use_pytorch_for_gpu_memory = false

[training]
seed = ${system:seed}
dropout = 0.1
init_tok2vec = ${paths:init_tok2vec}
vectors = null
accumulate_gradient = 1
max_steps = 0
max_epochs = 0
patience = 10000
eval_frequency = 200
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
frozen_components = []

[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = true
max_length = 0
limit = 0

[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = ${training.read_train:gold_preproc}
max_length = 0
limit = 0

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
learn_rate = 0.001

[nlp]
lang = "en"
load_vocab_data = false
pipeline = ["tok2vec", "ner", "tagger", "parser"]

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[components]

[components.tok2vec]
factory = "tok2vec"

[components.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1

[components.tagger]
factory = "tagger"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30

[components.tagger.model]
@architectures = "spacy.Tagger.v1"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 2
use_upper = true

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 2
use_upper = true

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
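The deleted file above is a full training config of the kind spaCy v3's config system consumes. As a rough, illustrative sketch (not part of this commit; it assumes Thinc's `Config` class, which spaCy's training builds on), a trimmed-down fragment of those sections parses into nested dictionaries like this:

```python
# Illustrative sketch: parsing a config fragment with Thinc's Config class.
# The fragment is trimmed from the deleted file above; values are unchanged.
from thinc.api import Config

config_str = """
[training]
dropout = 0.1
patience = 10000

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
"""

config = Config().from_str(config_str)
# Dotted section names become nested dicts; "@optimizers" is a registry
# reference that is only resolved later, when the objects are actually built.
print(config["training"]["dropout"])                    # 0.1
print(config["training"]["optimizer"]["@optimizers"])   # Adam.v1
```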
@@ -1,152 +0,0 @@

# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"
use_gpu = -1
raw_text = null
tag_map = null

[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"

[pretraining.objective]
type = "characters"
n_characters = 4

[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[nlp]
lang = "en"
vectors = null
base_model = null

[nlp.pipeline]

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.senter]
factory = "senter"

[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0

[nlp.pipeline.tagger]
factory = "tagger"

[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0

[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
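Names such as `spacy.HashEmbedCNN.v1` in the `@architectures` lines above are registry strings, not imports. As an illustrative sketch (assuming spaCy v3's `registry` API, which this diff does not show), such a string can be resolved to the registered factory function like this:

```python
# Illustrative: look up a registered architecture by the string name used
# in the config files above. Assumes spaCy v3's catalogue-based registry.
import spacy

create_tok2vec = spacy.registry.architectures.get("spacy.HashEmbedCNN.v1")
print(create_tok2vec)  # the factory the config system would call with the
                       # width/depth/etc. settings from the [..model] block
```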
@@ -1,73 +0,0 @@

# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 3000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 100000
max_epochs = 0
max_steps = 0
eval_frequency = 1000
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1.0}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"

[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[nlp]
lang = "en"
vectors = null

[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 2
use_upper = true

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = ${training:dropout}
@@ -1,73 +0,0 @@

[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
vectors = ${training:vectors}

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.tagger]
factory = "tagger"

[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0

[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3

[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedBiLSTM.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null
@@ -1,110 +0,0 @@

[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null

[system]
seed = 0
use_pytorch_for_gpu_memory = false

[training]
seed = ${system:seed}
dropout = 0.2
init_tok2vec = ${paths:init_tok2vec}
vectors = null
accumulate_gradient = 1
max_steps = 0
max_epochs = 0
patience = 10000
eval_frequency = 200
score_weights = {"dep_las": 0.8, "tag_acc": 0.2}

[training.read_train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = true
max_length = 0
limit = 0

[training.read_dev]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = ${training.read_train:gold_preproc}
max_length = 0
limit = 0

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger", "parser"]
load_vocab_data = false

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tagger]
factory = "tagger"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1

[components.tagger.model]
@architectures = "spacy.Tagger.v1"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
@@ -1,69 +0,0 @@

[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
vectors = null

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[nlp.pipeline.tok2vec.model.extract]
@architectures = "spacy.CharacterEmbed.v1"
width = 96
nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null

[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"
columns = ${nlp.pipeline.tok2vec.model.extract:columns}

[nlp.pipeline.tok2vec.model.embed]
@architectures = "spacy.LayerNormalizedMaxout.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
maxout_pieces = 4

[nlp.pipeline.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
window_size = 1
maxout_pieces = 2
depth = 2

[nlp.pipeline.ner]
factory = "ner"

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
@@ -1,51 +0,0 @@
-[training]
-use_gpu = -1
-limit = 0
-dropout = 0.2
-patience = 10000
-eval_frequency = 200
-scores = ["ents_p", "ents_r", "ents_f"]
-score_weights = {"ents_f": 1}
-orth_variant_level = 0.0
-gold_preproc = true
-max_length = 0
-seed = 0
-accumulate_gradient = 2
-discard_oversize = false
-
-[training.batch_size]
-@schedules = "compounding.v1"
-start = 3000
-stop = 3000
-compound = 1.001
-
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-learn_rate = 0.001
-beta1 = 0.9
-beta2 = 0.999
-
-[nlp]
-lang = "en"
-vectors = null
-
-[nlp.pipeline.ner]
-factory = "ner"
-
-[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-width = 128
-depth = 4
-embed_size = 7000
-maxout_pieces = 3
-window_size = 1
-subword_features = true
-pretrained_vectors = null
-dropout = null
@@ -6,8 +6,8 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
-"thinc>=8.0.0a31,<8.0.0a40",
+"thinc>=8.0.0rc0,<8.1.0",
-"blis>=0.4.0,<0.5.0",
+"blis>=0.4.0,<0.8.0",
"pytokenizations",
"pathy"
]
@@ -1,12 +1,12 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a31,<8.0.0a40
+thinc>=8.0.0rc0,<8.1.0
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
-ml_datasets>=0.1.1
+ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy
@@ -14,12 +14,13 @@ pathy
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
pytokenizations
# Official Python utilities
setuptools
-packaging
+packaging>=20.0
importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5
setup.cfg

@@ -34,16 +34,16 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
-thinc>=8.0.0a31,<8.0.0a40
+thinc>=8.0.0rc0,<8.1.0
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a31,<8.0.0a40
+thinc>=8.0.0rc0,<8.1.0
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy
@@ -51,12 +51,13 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
pytokenizations
# Official Python utilities
setuptools
-packaging
+packaging>=20.0
importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"

[options.entry_points]
console_scripts =
@@ -64,7 +65,11 @@ console_scripts =

[options.extras_require]
lookups =
-spacy_lookups_data==0.4.0.dev0
+spacy_lookups_data>=1.0.0rc0,<1.1.0
+transformers =
+spacy_transformers>=1.0.0rc0,<1.1.0
+ray =
+spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<9.0.0
cuda80 =
@@ -83,12 +88,14 @@ cuda102 =
cupy-cuda102>=5.0.0b4,<9.0.0
# Language tokenizers with external dependencies
ja =
-sudachipy>=0.4.5
+sudachipy>=0.4.9
sudachidict_core>=20200330
ko =
natto-py==0.9.0
th =
pythainlp>=2.0
+zh =
+spacy-pkuseg==0.0.26

[bdist_wheel]
universal = false
@@ -97,7 +104,7 @@ universal = false
formats = gztar

[flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =
setup.py

@@ -37,6 +37,7 @@ MOD_NAMES = [
"spacy.pipeline.multitask",
"spacy.pipeline.ner",
"spacy.pipeline.pipe",
+"spacy.pipeline.trainable_pipe",
"spacy.pipeline.sentencizer",
"spacy.pipeline.senter",
"spacy.pipeline.tagger",
@@ -1,7 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a18"
+__version__ = "3.0.0a41"
-__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
@@ -15,7 +15,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
-from .init_model import init_model # noqa: F401
+from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
import sys
import shutil
from pathlib import Path
@@ -6,15 +6,18 @@ from wasabi import msg
import srsly
import hashlib
import typer
-import subprocess
from click import NoSuchOption
+from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
-from thinc.config import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
from configparser import InterpolationError
+import os

from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, ENV_VARS
+from .. import about

if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@@ -62,24 +65,41 @@ def setup_cli() -> None:
command(prog_name=COMMAND)


-def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
+def parse_config_overrides(
+args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
+) -> Dict[str, Any]:
"""Generate a dictionary of config overrides based on the extra arguments
provided on the CLI, e.g. --training.batch_size to override
"training.batch_size". Arguments without a "." are considered invalid,
since the config only allows top-level sections to exist.

-args (List[str]): The extra arguments from the command line.
+env_vars (Optional[str]): Optional environment variable to read from.
RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
"""
+env_string = os.environ.get(env_var, "") if env_var else ""
+env_overrides = _parse_overrides(split_arg_string(env_string))
+cli_overrides = _parse_overrides(args, is_cli=True)
+if cli_overrides:
+keys = [k for k in cli_overrides if k not in env_overrides]
+logger.debug(f"Config overrides from CLI: {keys}")
+if env_overrides:
+logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+return {**cli_overrides, **env_overrides}


+def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
result = {}
while args:
opt = args.pop(0)
-err = f"Invalid CLI argument '{opt}'"
+err = f"Invalid config override '{opt}'"
if opt.startswith("--"): # new argument
orig_opt = opt
opt = opt.replace("--", "")
if "." not in opt:
-raise NoSuchOption(orig_opt)
+if is_cli:
+raise NoSuchOption(orig_opt)
+else:
+msg.fail(f"{err}: can't override top-level sections", exits=1)
if "=" in opt: # we have --opt=value
opt, value = opt.split("=", 1)
opt = opt.replace("-", "_")
@@ -98,7 +118,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
except ValueError:
result[opt] = str(value)
else:
-msg.fail(f"{err}: override option should start with --", exits=1)
+msg.fail(f"{err}: name should start with --", exits=1)
return result

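A rough sketch of how the override parsing above behaves, shown for reference; spacy.cli._util is an internal module, and the environment variable name is an assumption about what ENV_VARS.CONFIG_OVERRIDES resolves to:

# Illustrative only: internal helper, assumed env-var name SPACY_CONFIG_OVERRIDES.
import os
from spacy.cli._util import parse_config_overrides

os.environ["SPACY_CONFIG_OVERRIDES"] = "--training.seed 1"
overrides = parse_config_overrides(["--training.batch_size", "128", "--nlp.lang=de"])
# -> {"training.batch_size": 128, "nlp.lang": "de", "training.seed": 1}
# On clashing keys, the env-var overrides win over the CLI overrides.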
@@ -123,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
msg.fail(invalid_err)
print("\n".join(errors))
sys.exit(1)
+validate_project_version(config)
validate_project_commands(config)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
@@ -148,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
return dict(interpolated["project"])


+def validate_project_version(config: Dict[str, Any]) -> None:
+"""If the project defines a compatible spaCy version range, check that it's
+compatible with the current version of spaCy.
+
+config (Dict[str, Any]): The loaded config.
+"""
+spacy_version = config.get("spacy_version", None)
+if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+err = (
+f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+f"that's not compatible with the version of spaCy you're running "
+f"({about.__version__}). You can edit the version requirement in the "
+f"{PROJECT_FILE} to load it, but the project may not run as expected."
+)
+msg.fail(err, exits=1)


def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
@@ -174,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
)


-def get_hash(data) -> str:
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
"""Get the hash for a JSON-serializable object.

data: The data to hash.
+exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
RETURNS (str): The hash.
"""
+if isinstance(data, dict):
+data = {k: v for k, v in data.items() if k not in exclude}
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest()

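A small sketch of the exclude behaviour added to get_hash above (md5 over the JSON-serialized data, minus the excluded top-level keys); spacy.cli._util is internal, so this is illustrative only:

from spacy.cli._util import get_hash

# Excluding "checksum" makes the two hashes identical.
assert get_hash({"name": "demo", "checksum": "abc"}, exclude=["checksum"]) == get_hash({"name": "demo"})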
@@ -207,32 +248,40 @@ def get_checksum(path: Union[Path, str]) -> str:
def show_validation_error(
file_path: Optional[Union[str, Path]] = None,
*,
-title: str = "Config validation error",
+title: Optional[str] = None,
+desc: str = "",
+show_config: Optional[bool] = None,
hint_fill: bool = True,
):
"""Helper to show custom config validation errors on the CLI.

file_path (str / Path): Optional file path of config file, used in hints.
-title (str): Title of the custom formatted error.
+title (str): Override title of custom formatted error.
+desc (str): Override description of custom formatted error.
+show_config (bool): Whether to output the config the error refers to.
hint_fill (bool): Show hint about filling config.
"""
try:
yield
-except (ConfigValidationError, InterpolationError) as e:
+except ConfigValidationError as e:
-msg.fail(title, spaced=True)
+title = title if title is not None else e.title
-# TODO: This is kinda hacky and we should probably provide a better
+if e.desc:
-# helper for this in Thinc
+desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
-err_text = str(e).replace("Config validation error", "").strip()
+# Re-generate a new error object with overrides
-print(err_text)
+err = e.from_error(e, title="", desc=desc, show_config=show_config)
-if hint_fill and "field required" in err_text:
+msg.fail(title)
+print(err.text.strip())
+if hint_fill and "value_error.missing" in err.error_types:
config_path = file_path if file_path is not None else "config.cfg"
msg.text(
"If your config contains missing values, you can run the 'init "
"fill-config' command to fill in all the defaults, if possible:",
spaced=True,
)
-print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
+print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
sys.exit(1)
+except InterpolationError as e:
+msg.fail("Config validation error", e, exits=1)


def import_code(code_path: Optional[Union[Path, str]]) -> None:
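A sketch of how the reworked show_validation_error context manager above is typically used around config loading; the path is a placeholder:

from spacy import util
from spacy.cli._util import show_validation_error

with show_validation_error("config.cfg", hint_fill=True):
    # Validation or interpolation errors are reported nicely and exit with code 1.
    config = util.load_config("config.cfg")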
@@ -248,18 +297,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-"""RETURNS (List[str]): All sourced components in the original config,
-e.g. {"source": "en_core_web_sm"}. If the config contains a key
-"factory", we assume it refers to a component factory.
-"""
-return [
-name
-for name, cfg in config.get("components", {}).items()
-if "factory" not in cfg and "source" in cfg
-]
-
-
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""Upload a file.

@@ -287,7 +324,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
-with smart_open.open(src, mode="rb") as input_file:
+with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())

@@ -307,7 +344,31 @@ def git_checkout(
if dest.exists():
msg.fail("Destination of checkout must not exist", exits=1)
if not dest.parent.exists():
-raise IOError("Parent of destination of checkout must exist")
+msg.fail("Parent of destination of checkout must exist", exits=1)
+if sparse and git_version >= (2, 22):
+return git_sparse_checkout(repo, subpath, dest, branch)
+elif sparse:
+# Only show warnings if the user explicitly wants sparse checkout but
+# the Git version doesn't support it
+err_old = (
+f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+f"that doesn't fully support sparse checkout yet."
+)
+err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+msg.warn(
+f"{err_unk if git_version == (0, 0) else err_old} "
+f"This means that more files than necessary may be downloaded "
+f"temporarily. To only download the files needed, make sure "
+f"you're using Git v2.22 or above."
+)
+with make_tempdir() as tmp_dir:
+cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
+run_command(cmd, capture=True)
+# We need Path(name) to make sure we also support subdirectories
+shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))


+def git_sparse_checkout(repo, subpath, dest, branch):
# We're using Git, partial clone and sparse checkout to
# only clone the files we need
# This ends up being RIDICULOUS. omg.
@@ -324,47 +385,31 @@ def git_checkout(
# *that* we can do by path.
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
-supports_sparse = git_version >= (2, 22)
-use_sparse = supports_sparse and sparse
# This is the "clone, but don't download anything" part.
-cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
+cmd = (
-if use_sparse:
+f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-cmd += f"--filter=blob:none" # <-- The key bit
+f"-b {branch} --filter=blob:none"
-# Only show warnings if the user explicitly wants sparse checkout but
+)
-# the Git version doesn't support it
+run_command(cmd)
-elif sparse:
-err_old = (
-f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-f"that doesn't fully support sparse checkout yet."
-)
-err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-msg.warn(
-f"{err_unk if git_version == (0, 0) else err_old} "
-f"This means that more files than necessary may be downloaded "
-f"temporarily. To only download the files needed, make sure "
-f"you're using Git v2.22 or above."
-)
-try_run_command(cmd)
# Now we need to find the missing filenames for the subpath we want.
# Looking for this 'rev-list' command in the git --help? Hah.
-cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
+cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-ret = try_run_command(cmd)
+ret = run_command(cmd, capture=True)
-git_repo = _from_http_to_git(repo)
+git_repo = _http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-if use_sparse and not missings:
+if not missings:
err = (
f"Could not find any relevant files for '{subpath}'. "
f"Did you specify a correct and complete path within repo '{repo}' "
f"and branch {branch}?"
)
msg.fail(err, exits=1)
-if use_sparse:
+cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
+run_command(cmd, capture=True)
-try_run_command(cmd)
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-try_run_command(cmd)
+run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))

@@ -378,7 +423,7 @@ def get_git_version(
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined.
"""
-ret = try_run_command(["git", "--version"], error=error)
+ret = run_command("git --version", capture=True)
stdout = ret.stdout.strip()
if not stdout or not stdout.startswith("git version"):
return (0, 0)
@@ -386,24 +431,7 @@
return (int(version[0]), int(version[1]))


-def try_run_command(
+def _http_to_git(repo: str) -> str:
-cmd: Union[str, List[str]], error: str = "Could not run command"
-) -> subprocess.CompletedProcess:
-"""Try running a command and raise an error if it fails.
-
-cmd (Union[str, List[str]]): The command to run.
-error (str): The error message.
-RETURNS (CompletedProcess): The completed process if the command ran.
-"""
-try:
-return run_command(cmd, capture=True)
-except subprocess.CalledProcessError as e:
-msg.fail(error)
-print(cmd)
-sys.exit(1)
-
-
-def _from_http_to_git(repo: str) -> str:
if repo.startswith("http://"):
repo = repo.replace(r"http://", r"https://")
if repo.startswith(r"https://"):
@@ -439,3 +467,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
p = int(p)
result.append(p)
return result


+def setup_gpu(use_gpu: int) -> None:
+"""Configure the GPU and log info."""
+if use_gpu >= 0:
+msg.info(f"Using GPU: {use_gpu}")
+require_gpu(use_gpu)
+else:
+msg.info("Using CPU")
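A sketch of the new setup_gpu helper above: it logs the selected device and activates the GPU via Thinc's require_gpu for non-negative IDs; again, spacy.cli._util is internal, so this is illustrative only:

from spacy.cli._util import setup_gpu

setup_gpu(-1)  # logs "Using CPU"
setup_gpu(0)   # logs "Using GPU: 0" and calls require_gpu(0); needs CuPy and a GPU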
@@ -9,7 +9,8 @@ import sys
from ._util import app, Arg, Opt
from ..training import docs_to_json
from ..tokens import DocBin
-from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
+from ..training.converters import conllu_to_docs


# Converters are matched by file extension except for ner/iob, which are
@@ -18,12 +19,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
# imported from /converters.

CONVERTERS = {
-"conllubio": conllu2docs,
+"conllubio": conllu_to_docs,
-"conllu": conllu2docs,
+"conllu": conllu_to_docs,
-"conll": conllu2docs,
+"conll": conllu_to_docs,
-"ner": conll_ner2docs,
+"ner": conll_ner_to_docs,
-"iob": iob2docs,
+"iob": iob_to_docs,
-"json": json2docs,
+"json": json_to_docs,
}


@@ -209,6 +210,8 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
continue
else:
locs.append(path)
+# It's good to sort these, in case the ordering messes up cache.
+locs.sort()
return locs


@@ -250,7 +253,7 @@ def _get_converter(msg, converter, input_path):
if converter == "auto":
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
-with input_path.open() as file_:
+with input_path.open(encoding="utf8") as file_:
input_data = file_.read()
converter_autodetect = autodetect_ner_format(input_data)
if converter_autodetect == "ner":
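A hypothetical sketch of calling one of the renamed converters directly (normally this runs through `spacy convert`); the file name is a placeholder and the exact keyword arguments are not shown in this diff:

from spacy.training.converters import conllu_to_docs

with open("train.conllu", encoding="utf8") as file_:
    docs = list(conllu_to_docs(file_.read()))
print(f"Converted {len(docs)} docs")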
@@ -7,6 +7,8 @@ import typer

from ._util import Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
from .. import util


@@ -51,7 +53,11 @@ def debug_config(
msg.divider("Config validation")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
-nlp, _ = util.load_model_from_config(config)
+nlp = util.load_model_from_config(config)
+config = nlp.config.interpolate()
+T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+dot_names = [T["train_corpus"], T["dev_corpus"]]
+util.resolve_dot_names(config, dot_names)
msg.good("Config is valid")
if show_vars:
variables = get_variables(config)
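A sketch of the extra resolution steps added to debug_config above, run outside the CLI; the names come from the diff and the config path is a placeholder:

from spacy import util
from spacy.schemas import ConfigSchemaTraining

config = util.load_config("config.cfg")
nlp = util.load_model_from_config(config)
filled = nlp.config.interpolate()
T = util.registry.resolve(filled["training"], schema=ConfigSchemaTraining)
util.resolve_dot_names(filled, [T["train_corpus"], T["dev_corpus"]])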
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
import typer

from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
-from ..training import Corpus, Example
+from ..training import Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..language import Language
+from ..util import registry, resolve_dot_names
from .. import util


@@ -24,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000


@debug_cli.command(
-"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
@app.command(
"debug-data",
@@ -34,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000
def debug_data_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
-train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@@ -59,8 +60,6 @@ def debug_data_cli(
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
debug_data(
-train_path,
-dev_path,
config_path,
config_overrides=overrides,
ignore_warnings=ignore_warnings,
@@ -71,8 +70,6 @@ def debug_data_cli(


def debug_data(
-train_path: Path,
-dev_path: Path,
config_path: Path,
*,
config_overrides: Dict[str, Any] = {},
@@ -85,56 +82,29 @@ def debug_data(
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
)
# Make sure all files and paths exists if they are needed
-if not train_path.exists():
-msg.fail("Training data not found", train_path, exits=1)
-if not dev_path.exists():
-msg.fail("Development data not found", dev_path, exits=1)
-if not config_path.exists():
-msg.fail("Config file not found", config_path, exists=1)
with show_validation_error(config_path):
cfg = util.load_config(config_path, overrides=config_overrides)
-nlp, config = util.load_model_from_config(cfg)
+nlp = util.load_model_from_config(cfg)
+config = nlp.config.interpolate()
+T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
# Use original config here, not resolved version
sourced_components = get_sourced_components(cfg)
-frozen_components = config["training"]["frozen_components"]
+frozen_components = T["frozen_components"]
resume_components = [p for p in sourced_components if p not in frozen_components]
pipeline = nlp.pipe_names
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-tag_map_path = util.ensure_path(config["training"]["tag_map"])
-tag_map = {}
-if tag_map_path is not None:
-tag_map = srsly.read_json(tag_map_path)
-morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
-morph_rules = {}
-if morph_rules_path is not None:
-morph_rules = srsly.read_json(morph_rules_path)
-# Replace tag map with provided mapping
-nlp.vocab.morphology.load_tag_map(tag_map)
-# Load morph rules
-nlp.vocab.morphology.load_morph_exceptions(morph_rules)

msg.divider("Data file validation")

# Create the gold corpus to be able to better analyze data
-loading_train_error_message = ""
+dot_names = [T["train_corpus"], T["dev_corpus"]]
-loading_dev_error_message = ""
+train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-with msg.loading("Loading corpus..."):
+train_dataset = list(train_corpus(nlp))
-try:
+dev_dataset = list(dev_corpus(nlp))
-train_dataset = list(Corpus(train_path)(nlp))
-except ValueError as e:
-loading_train_error_message = f"Training data cannot be loaded: {e}"
-try:
-dev_dataset = list(Corpus(dev_path)(nlp))
-except ValueError as e:
-loading_dev_error_message = f"Development data cannot be loaded: {e}"
-if loading_train_error_message or loading_dev_error_message:
-if loading_train_error_message:
-msg.fail(loading_train_error_message)
-if loading_dev_error_message:
-msg.fail(loading_dev_error_message)
-sys.exit(1)
msg.good("Corpus is loadable")

+nlp.initialize(lambda: train_dataset)
+msg.good("Pipeline can be initialized with data")

# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
gold_train_unpreprocessed_data = _compile_gold(
@@ -144,10 +114,10 @@ def debug_data(

train_texts = gold_train_data["texts"]
dev_texts = gold_dev_data["texts"]
-frozen_components = config["training"]["frozen_components"]
+frozen_components = T["frozen_components"]

msg.divider("Training stats")
-msg.text(f"Language: {config['nlp']['lang']}")
+msg.text(f"Language: {nlp.lang}")
msg.text(f"Training pipeline: {', '.join(pipeline)}")
if resume_components:
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -201,7 +171,7 @@ def debug_data(
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:0.2f}%)".format(
-n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
+n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
),
)
msg.text(
@@ -354,17 +324,12 @@ def debug_data(
if "tagger" in factory_names:
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_train_data["tags"]]
-tag_map = nlp.vocab.morphology.tag_map
+# TODO: does this need to be updated?
-msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
+msg.info(f"{len(labels)} label(s) in data")
labels_with_counts = _format_labels(
gold_train_data["tags"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
-non_tagmap = [l for l in labels if l not in tag_map]
-if not non_tagmap:
-msg.good(f"All labels present in tag map for language '{nlp.lang}'")
-for label in non_tagmap:
-msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")

if "parser" in factory_names:
has_low_data_warning = False
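After this change, debug-data takes only the config path and reads the corpora from the config; a sketch of the updated invocation (paths are placeholders):

import subprocess

subprocess.run(
    ["python", "-m", "spacy", "debug", "data", "config.cfg",
     "--paths.train", "corpus/train.spacy", "--paths.dev", "corpus/dev.spacy"],
    check=True,
)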
@@ -1,16 +1,24 @@
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, Iterable
from pathlib import Path

+from spacy.training import Example
+from spacy.util import resolve_dot_names
from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
-from thinc.api import Model, data_validation
+from thinc.api import Model, data_validation, set_gpu_allocator
import typer

from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
from .. import util


-@debug_cli.command("model")
+@debug_cli.command(
+"model",
+context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
def debug_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
@@ -34,11 +42,7 @@ def debug_model_cli(

DOCS: https://nightly.spacy.io/api/cli#debug-model
"""
-if use_gpu >= 0:
+setup_gpu(use_gpu)
-msg.info("Using GPU")
-require_gpu(use_gpu)
-else:
-msg.info("Using CPU")
layers = string_to_list(layers, intify=True)
print_settings = {
"dimensions": dimensions,
@@ -53,24 +57,39 @@ def debug_model_cli(
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
-config = util.load_config(config_path, overrides=config_overrides)
+raw_config = util.load_config(
-nlp, config = util.load_model_from_config(config_path)
+config_path, overrides=config_overrides, interpolate=False
-seed = config["training"]["seed"]
+)
+config = raw_config.interpolate()
+allocator = config["training"]["gpu_allocator"]
+if use_gpu >= 0 and allocator:
+set_gpu_allocator(allocator)
+with show_validation_error(config_path):
+nlp = util.load_model_from_config(raw_config)
+config = nlp.config.interpolate()
+T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+seed = T["seed"]
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
pipe = nlp.get_pipe(component)
-if hasattr(pipe, "model"):
+if not hasattr(pipe, "model"):
-model = pipe.model
-else:
msg.fail(
f"The component '{component}' does not specify an object that holds a Model.",
exits=1,
)
-debug_model(model, print_settings=print_settings)
+model = pipe.model
+debug_model(config, T, nlp, model, print_settings=print_settings)


-def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(
+config,
+resolved_train_config,
+nlp,
+model: Model,
+*,
+print_settings: Optional[Dict[str, Any]] = None,
+):
if not isinstance(model, Model):
msg.fail(
f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -87,10 +106,26 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None

# STEP 1: Initializing the model and printing again
X = _get_docs()
-Y = _get_output(model.ops.xp)
# The output vector might differ from the official type of the output layer
with data_validation(False):
-model.initialize(X=X, Y=Y)
+try:
+dot_names = [resolved_train_config["train_corpus"]]
+with show_validation_error():
+(train_corpus,) = resolve_dot_names(config, dot_names)
+nlp.initialize(lambda: train_corpus(nlp))
+msg.info("Initialized the model with the training corpus.")
+except ValueError:
+try:
+_set_output_dim(nO=7, model=model)
+with show_validation_error():
+nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
+msg.info("Initialized the model with dummy data.")
+except Exception:
+msg.fail(
+"Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+exits=1,
+)

if print_settings.get("print_after_init"):
msg.divider(f"STEP 1 - after initialization")
_print_model(model, print_settings)
@@ -98,9 +133,18 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
# STEP 2: Updating the model and printing again
optimizer = Adam(0.001)
set_dropout_rate(model, 0.2)
+# ugly hack to deal with Tok2Vec listeners
+tok2vec = None
+if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
+tok2vec = nlp.get_pipe("tok2vec")
+goldY = None
for e in range(3):
-Y, get_dX = model.begin_update(_get_docs())
+if tok2vec:
-dY = get_gradient(model, Y)
+tok2vec.update([Example.from_dict(x, {}) for x in X])
+Y, get_dX = model.begin_update(X)
+if goldY is None:
+goldY = _simulate_gold(Y)
+dY = get_gradient(goldY, Y, model.ops)
get_dX(dY)
model.finish_update(optimizer)
if print_settings.get("print_after_training"):
@@ -108,15 +152,25 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
_print_model(model, print_settings)

# STEP 3: the final prediction
-prediction = model.predict(_get_docs())
+prediction = model.predict(X)
if print_settings.get("print_prediction"):
msg.divider(f"STEP 3 - prediction")
msg.info(str(prediction))

+msg.good(f"Successfully ended analysis - model looks good.")

-def get_gradient(model, Y):
-goldY = _get_output(model.ops.xp)
+def get_gradient(goldY, Y, ops):
-return Y - goldY
+return ops.asarray(Y) - ops.asarray(goldY)


+def _simulate_gold(element, counter=1):
+if isinstance(element, Iterable):
+for i in range(len(element)):
+element[i] = _simulate_gold(element[i], counter + i)
+return element
+else:
+return 1 / counter


def _sentences():
@@ -133,8 +187,13 @@ def _get_docs(lang: str = "en"):
return list(nlp.pipe(_sentences()))


-def _get_output(xp):
+def _set_output_dim(model, nO):
-return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+# simulating dim inference by directly setting the nO argument of the model
+if model.has_dim("nO") is None:
+model.set_dim("nO", nO)
+if model.has_ref("output_layer"):
+if model.get_ref("output_layer").has_dim("nO") is None:
+model.get_ref("output_layer").set_dim("nO", nO)


def _print_model(model, print_settings):
@@ -88,7 +88,6 @@ def get_compatibility() -> dict:


def get_version(model: str, comp: dict) -> str:
-model = get_base_version(model)
if model not in comp:
msg.fail(
f"No compatible package found for '{model}' (spaCy v{about.__version__})",
@ -3,11 +3,11 @@ from wasabi import Printer
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import require_gpu, fix_random_seed
|
from thinc.api import fix_random_seed
|
||||||
|
|
||||||
from ..training import Corpus
|
from ..training import Corpus
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
|
@ -19,6 +19,7 @@ def evaluate_cli(
|
||||||
model: str = Arg(..., help="Model name or path"),
|
model: str = Arg(..., help="Model name or path"),
|
||||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||||
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
||||||
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||||
|
@@ -37,6 +38,7 @@ def evaluate_cli(

     DOCS: https://nightly.spacy.io/api/cli#evaluate
     """
+    import_code(code_path)
     evaluate(
         model,
         data_path,
@@ -61,8 +63,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
@@ -91,7 +91,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
         meta["source"] = str(model_path.resolve())
     else:
         meta["source"] = str(model_path)
-    return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
+    return {
+        k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
+    }


 def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
@@ -30,12 +30,13 @@ def init_config_cli(
     pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     # fmt: on
 ):
     """
     Generate a starter config.cfg for training. Based on your requirements
     specified via the CLI arguments, this command generates a config with the
-    optimal settings for you use case. This includes the choice of architecture,
+    optimal settings for your use case. This includes the choice of architecture,
     pretrained weights and related hyperparameters.

     DOCS: https://nightly.spacy.io/api/cli#init-config
@@ -43,7 +44,14 @@ def init_config_cli(
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
     pipeline = string_to_list(pipeline)
-    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
+    init_config(
+        output_file,
+        lang=lang,
+        pipeline=pipeline,
+        optimize=optimize,
+        cpu=cpu,
+        pretraining=pretraining,
+    )


 @init_cli.command("fill-config")
@@ -51,7 +59,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
-    pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
+    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
     # fmt: on
 ):
@@ -80,10 +88,10 @@ def fill_config(
     msg = Printer(no_print=no_print)
     with show_validation_error(hint_fill=False):
         config = util.load_config(base_path)
-        nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False)
+        nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
         # Load a second time with validation to be extra sure that the produced
         # config result is a valid config
-        nlp, _ = util.load_model_from_config(nlp.config)
+        nlp = util.load_model_from_config(nlp.config)
     filled = nlp.config
     if pretraining:
         validate_config_for_pretrain(filled, msg)
@@ -109,7 +117,13 @@ def fill_config(


 def init_config(
-    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
+    output_file: Path,
+    *,
+    lang: str,
+    pipeline: List[str],
+    optimize: str,
+    cpu: bool,
+    pretraining: bool = False,
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
@@ -155,9 +169,14 @@ def init_config(
         msg.text(f"- {label}: {value}")
     with show_validation_error(hint_fill=False):
         config = util.load_config_from_str(base_template)
-        nlp, _ = util.load_model_from_config(config, auto_fill=True)
+        nlp = util.load_model_from_config(config, auto_fill=True)
+        config = nlp.config
+        if pretraining:
+            validate_config_for_pretrain(config, msg)
+            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+            config = pretrain_config.merge(config)
     msg.good("Auto-filled config with all values")
-    save_config(nlp.config, output_file, is_stdout=is_stdout)
+    save_config(config, output_file, is_stdout=is_stdout)


 def save_config(
@@ -1,360 +0,0 @@
-from typing import Optional, List, Dict, Any, Union, IO
-import math
-from tqdm import tqdm
-import numpy
-from ast import literal_eval
-from pathlib import Path
-from preshed.counter import PreshCounter
-import tarfile
-import gzip
-import zipfile
-import srsly
-import warnings
-from wasabi import msg, Printer
-import typer
-
-from ._util import app, init_cli, Arg, Opt
-from ..vectors import Vectors
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-
-try:
-    import ftfy
-except ImportError:
-    ftfy = None
-
-
-DEFAULT_OOV_PROB = -20
-
-
-@init_cli.command("vocab")
-@app.command(
-    "init-model",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-    hidden=True,  # hide this from main CLI help but still allow it to work with warning
-)
-def init_model_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    lang: str = Arg(..., help="Pipeline language"),
-    output_dir: Path = Arg(..., help="Pipeline output directory"),
-    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
-    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
-    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
-    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
-    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
-    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
-    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
-    # fmt: on
-):
-    """
-    Create a new blank pipeline directory with vocab and vectors from raw data.
-    If vectors are provided in Word2Vec format, they can be either a .txt or
-    zipped as a .zip or .tar.gz.
-
-    DOCS: https://nightly.spacy.io/api/cli#init-vocab
-    """
-    if ctx.command.name == "init-model":
-        msg.warn(
-            "The init-model command is now called 'init vocab'. You can run "
-            "'python -m spacy init --help' for an overview of the other "
-            "available initialization commands."
-        )
-    init_model(
-        lang,
-        output_dir,
-        freqs_loc=freqs_loc,
-        clusters_loc=clusters_loc,
-        jsonl_loc=jsonl_loc,
-        vectors_loc=vectors_loc,
-        prune_vectors=prune_vectors,
-        truncate_vectors=truncate_vectors,
-        vectors_name=vectors_name,
-        model_name=model_name,
-        base_model=base_model,
-        silent=False,
-    )
-
-
-def init_model(
-    lang: str,
-    output_dir: Path,
-    freqs_loc: Optional[Path] = None,
-    clusters_loc: Optional[Path] = None,
-    jsonl_loc: Optional[Path] = None,
-    vectors_loc: Optional[Path] = None,
-    prune_vectors: int = -1,
-    truncate_vectors: int = 0,
-    vectors_name: Optional[str] = None,
-    model_name: Optional[str] = None,
-    base_model: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent, pretty=not silent)
-    if jsonl_loc is not None:
-        if freqs_loc is not None or clusters_loc is not None:
-            settings = ["-j"]
-            if freqs_loc:
-                settings.append("-f")
-            if clusters_loc:
-                settings.append("-c")
-            msg.warn(
-                "Incompatible arguments",
-                "The -f and -c arguments are deprecated, and not compatible "
-                "with the -j argument, which should specify the same "
-                "information. Either merge the frequencies and clusters data "
-                "into the JSONL-formatted file (recommended), or use only the "
-                "-f and -c files, without the other lexical attributes.",
-            )
-        jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-    else:
-        clusters_loc = ensure_path(clusters_loc)
-        freqs_loc = ensure_path(freqs_loc)
-        if freqs_loc is not None and not freqs_loc.exists():
-            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
-
-    with msg.loading("Creating blank pipeline..."):
-        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
-
-    msg.good("Successfully created blank pipeline")
-    if vectors_loc is not None:
-        add_vectors(
-            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
-        )
-    vec_added = len(nlp.vocab.vectors)
-    lex_added = len(nlp.vocab)
-    msg.good(
-        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
-    )
-    if not output_dir.exists():
-        output_dir.mkdir()
-    nlp.to_disk(output_dir)
-    return nlp
-
-
-def open_file(loc: Union[str, Path]) -> IO:
-    """Handle .gz, .tar.gz or unzipped files"""
-    loc = ensure_path(loc)
-    if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), "r:gz")
-    elif loc.parts[-1].endswith("gz"):
-        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
-    elif loc.parts[-1].endswith("zip"):
-        zip_file = zipfile.ZipFile(str(loc))
-        names = zip_file.namelist()
-        file_ = zip_file.open(names[0])
-        return (line.decode("utf8") for line in file_)
-    else:
-        return loc.open("r", encoding="utf8")
-
-
-def read_attrs_from_deprecated(
-    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
-) -> List[Dict[str, Any]]:
-    if freqs_loc is not None:
-        with msg.loading("Counting frequencies..."):
-            probs, _ = read_freqs(freqs_loc)
-        msg.good("Counted frequencies")
-    else:
-        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
-    if clusters_loc:
-        with msg.loading("Reading clusters..."):
-            clusters = read_clusters(clusters_loc)
-        msg.good("Read clusters")
-    else:
-        clusters = {}
-    lex_attrs = []
-    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    if len(sorted_probs):
-        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-            attrs = {"orth": word, "id": i, "prob": prob}
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See _parse_features.pyx
-            if word in clusters:
-                attrs["cluster"] = int(clusters[word][::-1], 2)
-            else:
-                attrs["cluster"] = 0
-            lex_attrs.append(attrs)
-    return lex_attrs
-
-
-def create_model(
-    lang: str,
-    lex_attrs: List[Dict[str, Any]],
-    name: Optional[str] = None,
-    base_model: Optional[Union[str, Path]] = None,
-) -> Language:
-    if base_model:
-        nlp = load_model(base_model)
-        # keep the tokenizer but remove any existing pipeline components due to
-        # potentially conflicting vectors
-        for pipe in nlp.pipe_names:
-            nlp.remove_pipe(pipe)
-    else:
-        lang_class = get_lang_class(lang)
-        nlp = lang_class()
-    for lexeme in nlp.vocab:
-        lexeme.rank = OOV_RANK
-    for attrs in lex_attrs:
-        if "settings" in attrs:
-            continue
-        lexeme = nlp.vocab[attrs["orth"]]
-        lexeme.set_attrs(**attrs)
-    if len(nlp.vocab):
-        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-    else:
-        oov_prob = DEFAULT_OOV_PROB
-    nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    if name:
-        nlp.meta["name"] = name
-    return nlp
-
-
-def add_vectors(
-    msg: Printer,
-    nlp: Language,
-    vectors_loc: Optional[Path],
-    truncate_vectors: int,
-    prune_vectors: int,
-    name: Optional[str] = None,
-) -> None:
-    vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
-        for lex in nlp.vocab:
-            if lex.rank and lex.rank != OOV_RANK:
-                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
-    else:
-        if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(
-                    msg, vectors_loc, truncate_vectors
-                )
-            msg.good(f"Loaded vectors from {vectors_loc}")
-        else:
-            vectors_data, vector_keys = (None, None)
-        if vector_keys is not None:
-            for word in vector_keys:
-                if word not in nlp.vocab:
-                    nlp.vocab[word]
-        if vectors_data is not None:
-            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
-    if prune_vectors >= 1:
-        nlp.vocab.prune_vectors(prune_vectors)
-
-
-def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
-    f = open_file(vectors_loc)
-    f = ensure_shape(f)
-    shape = tuple(int(size) for size in next(f).split())
-    if truncate_vectors >= 1:
-        shape = (truncate_vectors, shape[1])
-    vectors_data = numpy.zeros(shape=shape, dtype="f")
-    vectors_keys = []
-    for i, line in enumerate(tqdm(f)):
-        line = line.rstrip()
-        pieces = line.rsplit(" ", vectors_data.shape[1])
-        word = pieces.pop(0)
-        if len(pieces) != vectors_data.shape[1]:
-            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
-        vectors_data[i] = numpy.asarray(pieces, dtype="f")
-        vectors_keys.append(word)
-        if i == truncate_vectors - 1:
-            break
-    return vectors_data, vectors_keys
-
-
-def ensure_shape(lines):
-    """Ensure that the first line of the data is the vectors shape.
-
-    If it's not, we read in the data and output the shape as the first result,
-    so that the reader doesn't have to deal with the problem.
-    """
-    first_line = next(lines)
-    try:
-        shape = tuple(int(size) for size in first_line.split())
-    except ValueError:
-        shape = None
-    if shape is not None:
-        # All good, give the data
-        yield first_line
-        yield from lines
-    else:
-        # Figure out the shape, make it the first value, and then give the
-        # rest of the data.
-        width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
-        yield f"{length} {width}"
-        yield from captured
-
-
-def read_freqs(
-    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
-):
-    counts = PreshCounter()
-    total = 0
-    with freqs_loc.open() as f:
-        for i, line in enumerate(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            freq = int(freq)
-            counts.inc(i + 1, freq)
-            total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    probs = {}
-    with freqs_loc.open() as f:
-        for line in tqdm(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            doc_freq = int(doc_freq)
-            freq = int(freq)
-            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-                try:
-                    word = literal_eval(key)
-                except SyntaxError:
-                    # Take odd strings literally.
-                    word = literal_eval(f"'{key}'")
-                smooth_count = counts.smoother(int(freq))
-                probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_loc: Path) -> dict:
-    clusters = {}
-    if ftfy is None:
-        warnings.warn(Warnings.W004)
-    with clusters_loc.open() as f:
-        for line in tqdm(f):
-            try:
-                cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = "0"
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
117
spacy/cli/init_pipeline.py
Normal file
@@ -0,0 +1,117 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command("vectors")
+def init_vectors_cli(
+    # fmt: off
+    lang: str = Arg(..., help="The language of the nlp object to create"),
+    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
+    output_dir: Path = Arg(..., help="Pipeline output directory"),
+    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
+    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    # fmt: on
+):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    msg.info(f"Creating blank nlp object for language '{lang}'")
+    nlp = util.get_lang_class(lang)()
+    if jsonl_loc is not None:
+        update_lexemes(nlp, jsonl_loc)
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
+    nlp.to_disk(output_dir)
+    msg.good(
+        "Saved nlp object with vectors to output directory. You can now use the "
+        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        output_dir.resolve(),
+    )
+
+
+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+
+
+@init_cli.command(
+    "nlp",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,
+)
+def init_pipeline_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the prepared data"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    nlp.to_disk(output_path)
+    msg.good(f"Saved initialized pipeline to {output_path}")
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """Generate JSON files for the labels in the data. This helps speed up the
+    training process, since spaCy won't have to preprocess the data to
+    extract the labels."""
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if not output_path.exists():
+        output_path.mkdir()
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    for name, component in nlp.pipeline:
+        if getattr(component, "label_data", None) is not None:
+            output_file = output_path / f"{name}.json"
+            srsly.write_json(output_file, component.label_data)
+            msg.good(f"Saving {name} labels to {output_file}")
+        else:
+            msg.info(f"No labels found for {name}")
@@ -110,7 +110,7 @@ def package(
     msg.good(f"Successfully created package '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"])
+            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
         msg.good(f"Successfully created zipped Python package", zip_file)

@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import Config
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
+from ._util import import_code, setup_gpu
+from ..training.pretrain import pretrain
+from ..util import load_config


 @app.command(
@@ -31,7 +19,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -61,280 +49,35 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")

     with show_validation_error(config_path):
-        config = util.load_config(
-            config_path,
-            overrides=config_overrides,
-            interpolate=True
+        raw_config = load_config(
+            config_path, overrides=config_overrides, interpolate=False
         )
+    config = raw_config.interpolate()
     if not config.get("pretraining"):
         # TODO: What's the solution here? How do we handle optional blocks?
         msg.fail("The [pretraining] block in your config is empty", exits=1)
     if not output_dir.exists():
         output_dir.mkdir()
         msg.good(f"Created output directory: {output_dir}")
-    config.to_disk(output_dir / "config.cfg")
+    # Save non-interpolated config
+    raw_config.to_disk(output_dir / "config.cfg")
     msg.good("Saved config file in the output directory")

     pretrain(
         config,
         output_dir,
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        silent=False,
     )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int=-1
-):
-    if config["system"].get("seed") is not None:
-        fix_random_seed(config["system"]["seed"])
-    if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
-        use_pytorch_for_gpu_memory()
-    nlp, config = util.load_model_from_config(config)
-    P_cfg = config["pretraining"]
-    corpus = P_cfg["corpus"]
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, config["pretraining"])
-    optimizer = config["pretraining"]["optimizer"]
-
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (
-                batch_id % P_cfg["n_save_every"] == 0
-            ):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
     msg.good("Successfully finished pretrain")


-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
@@ -66,6 +66,7 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
                 branch=asset["git"].get("branch"),
                 sparse=sparse_checkout,
             )
+            msg.good(f"Downloaded asset {dest}")
         else:
             url = asset.get("url")
             if not url:
@@ -114,6 +114,6 @@ def project_document(
         content = f"{before}{content}{after}"
     else:
         msg.warn("Replacing existing file")
-    with output_file.open("w") as f:
+    with output_file.open("w", encoding="utf8") as f:
         f.write(content)
     msg.good("Saved project documentation", output_file)
@@ -134,7 +134,7 @@ def update_dvc_config(


 def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
+    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
 ) -> None:
     """Run a sequence of DVC commands in a subprocess, in order.

@@ -27,19 +27,32 @@ def project_pull_cli(


 def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    # TODO: We don't have tests for this :(. It would take a bit of mockery to
+    # set up. I guess see if it breaks first?
     config = load_project_config(project_dir)
     if remote in config.get("remotes", {}):
         remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
-    for cmd in config.get("commands", []):
-        deps = [project_dir / dep for dep in cmd.get("deps", [])]
-        if any(not dep.exists() for dep in deps):
-            continue
-        cmd_hash = get_command_hash("", "", deps, cmd["script"])
-        for output_path in cmd.get("outputs", []):
-            url = storage.pull(output_path, command_hash=cmd_hash)
-            yield url, output_path
+    commands = list(config.get("commands", []))
+    # We use a while loop here because we don't know how the commands
+    # will be ordered. A command might need dependencies from one that's later
+    # in the list.
+    while commands:
+        for i, cmd in enumerate(list(commands)):
+            deps = [project_dir / dep for dep in cmd.get("deps", [])]
+            if all(dep.exists() for dep in deps):
+                cmd_hash = get_command_hash("", "", deps, cmd["script"])
+                for output_path in cmd.get("outputs", []):
+                    url = storage.pull(output_path, command_hash=cmd_hash)
+                    yield url, output_path

-        out_locs = [project_dir / out for out in cmd.get("outputs", [])]
-        if all(loc.exists() for loc in out_locs):
-            update_lockfile(project_dir, cmd)
+                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
+                if all(loc.exists() for loc in out_locs):
+                    update_lockfile(project_dir, cmd)
+                # We remove the command from the list here, and break, so that
+                # we iterate over the loop again.
+                commands.pop(i)
+                break
+        else:
+            # If we didn't break the for loop, break the while loop.
+            break
@@ -7,7 +7,9 @@ import tarfile
 from pathlib import Path

 from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir
+from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from ...git_info import GIT_VERSION
+from ... import about

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
|
@ -129,7 +131,10 @@ def get_command_hash(
|
||||||
currently installed packages, whatever environment variables have been marked
|
currently installed packages, whatever environment variables have been marked
|
||||||
as relevant, and the command.
|
as relevant, and the command.
|
||||||
"""
|
"""
|
||||||
hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
|
check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||||
|
spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
|
||||||
|
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
||||||
|
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
||||||
hashes.extend(cmd)
|
hashes.extend(cmd)
|
||||||
creation_bytes = "".join(hashes).encode("utf8")
|
creation_bytes = "".join(hashes).encode("utf8")
|
||||||
return hashlib.md5(creation_bytes).hexdigest()
|
return hashlib.md5(creation_bytes).hexdigest()
|
||||||
|
|
|
@@ -4,8 +4,11 @@ from wasabi import msg
 import sys
 import srsly

+from ... import about
+from ...git_info import GIT_VERSION
 from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList
+from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
+from ...util import check_bool_env_var
 from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
 from .._util import get_checksum, project_cli, Arg, Opt, COMMAND

@@ -59,14 +62,16 @@ def project_run(
     for dep in cmd.get("deps", []):
         if not (project_dir / dep).exists():
             err = f"Missing dependency specified by command '{subcommand}': {dep}"
+            err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
             err_kwargs = {"exits": 1} if not dry else {}
-            msg.fail(err, **err_kwargs)
+            msg.fail(err, err_help, **err_kwargs)
+    check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
     with working_dir(project_dir) as current_dir:
-        rerun = check_rerun(current_dir, cmd)
+        msg.divider(subcommand)
+        rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
         if not rerun and not force:
             msg.info(f"Skipping '{cmd['name']}': nothing changed")
         else:
-            msg.divider(subcommand)
             run_commands(cmd["script"], dry=dry)
             if not dry:
                 update_lockfile(current_dir, cmd)
@@ -144,7 +149,7 @@ def run_commands(
         if not silent:
             print(f"Running command: {join_command(command)}")
         if not dry:
-            run_command(command)
+            run_command(command, capture=False)


 def validate_subcommand(
@@ -170,12 +175,19 @@ def validate_subcommand(
     )


-def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    *,
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
+    strict_version (bool):
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -188,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     # Always run commands with no outputs (otherwise they'd always be skipped)
     if not entry.get("outs", []):
         return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)


 def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@ -230,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
|
||||||
"script": command["script"],
|
"script": command["script"],
|
||||||
"deps": deps,
|
"deps": deps,
|
||||||
"outs": [*outs, *outs_nc],
|
"outs": [*outs, *outs_nc],
|
||||||
|
"spacy_version": about.__version__,
|
||||||
|
"spacy_git_version": GIT_VERSION,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
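
The rerun check above treats a project command as stale when its lockfile entry records a
different spaCy version (compared at minor-version granularity) or a different git commit than
the one currently running. As a rough sketch of the kind of comparison is_minor_version_match is
used for here (the real helper lives in spaCy's utilities and may handle more edge cases),
assuming packaging-style version strings:

from packaging.version import Version

def minor_version_matches(locked: str, current: str) -> bool:
    # True when both versions share the same major.minor prefix,
    # e.g. "3.0.0a42" vs. "3.0.1" -> True, "2.3.5" vs. "3.0.0" -> False.
    if not locked or not current:
        return False
    a, b = Version(locked), Version(current)
    return (a.major, a.minor) == (b.major, b.minor)
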
@@ -4,11 +4,15 @@ can help generate the best possible configuration, given a user's requirements.
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null

 [system]
-use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
+{% if use_transformer -%}
+gpu_allocator = "pytorch"
+{% else -%}
+gpu_allocator = null
+{% endif %}

 [nlp]
 lang = "{{ lang }}"
@@ -33,6 +37,22 @@ tokenizer_config = {"use_fast": true}
 window = 128
 stride = 96

+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.morphologizer.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -55,7 +75,8 @@ factory = "parser"

 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
@@ -75,7 +96,8 @@ factory = "ner"

 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = false
@@ -89,6 +111,49 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}

+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.entity_linker.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+{%- endif %}
+{%- endif %}
+
 {# NON-TRANSFORMER PIPELINE #}
 {% else -%}

@@ -106,9 +171,14 @@ factory = "tok2vec"
 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = ${components.tok2vec.model.encode.width}
-rows = {{ 2000 if optimize == "efficiency" else 7000 }}
-also_embed_subwords = {{ "true" if has_letters else "false" }}
-also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
+{% if has_letters -%}
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 2500, 2500, 2500]
+{% else -%}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+{% endif -%}
+include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}

 [components.tok2vec.model.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
@@ -117,6 +187,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3

+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -136,7 +219,8 @@ factory = "parser"

 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
@@ -153,7 +237,8 @@ factory = "ner"

 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
@@ -163,30 +248,78 @@ nO = null
 @architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 {% endif %}

+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+{%- endif %}
+{%- endif %}
 {% endif %}

 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner"] %}
+{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"
 {% endif %}
 {% endfor %}

+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = {{ 500 if hardware == "gpu" else 2000 }}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
-{% endif %}
+{% endif -%}
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"

 [training.optimizer]
 @optimizers = "Adam.v1"


 {% if use_transformer -%}
 [training.optimizer.learn_rate]
 @schedules = "warmup_linear.v1"
@@ -195,16 +328,6 @@ total_steps = 20000
 initial_rate = 5e-5
 {% endif %}

-[training.train_corpus]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-max_length = {{ 500 if hardware == "gpu" else 2000 }}
-
-[training.dev_corpus]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-max_length = 0

 {% if use_transformer %}
 [training.batcher]
 @batchers = "spacy.batch_by_padded.v1"
@@ -224,17 +347,9 @@ stop = 1000
 compound = 1.001
 {% endif %}

-[training.score_weights]
-{%- if "tagger" in components %}
-tag_acc = {{ (1.0 / components|length)|round(2) }}
-{%- endif -%}
-{%- if "parser" in components %}
-dep_uas = 0.0
-dep_las = {{ (1.0 / components|length)|round(2) }}
-sents_f = 0.0
-{%- endif %}
-{%- if "ner" in components %}
-ents_f = {{ (1.0 / components|length)|round(2) }}
-ents_p = 0.0
-ents_r = 0.0
-{%- endif -%}
+[initialize]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
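
The quickstart template above is filled in with variables such as lang, components, optimize,
hardware, has_letters, word_vectors and transformer_data before it is written out as a training
config. A minimal sketch of rendering it directly with Jinja2 (the file name and the variable
values below are illustrative, not the CLI's exact defaults):

from jinja2 import Template

variables = {
    "lang": "en",
    "components": ["tagger", "parser", "ner"],
    "optimize": "efficiency",
    "hardware": "cpu",
    "transformer_data": {},  # empty + hardware="cpu" -> non-transformer branch
    "word_vectors": None,
    "has_letters": True,
}

with open("quickstart_training.jinja", encoding="utf8") as f:
    template = Template(f.read())

config_text = template.render(**variables)
print(config_text)  # a complete training config in .cfg format
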
@@ -32,10 +32,10 @@ es:
   word_vectors: null
   transformer:
     efficiency:
-      name: mrm8488/RuPERTa-base
+      name: dccuchile/bert-base-spanish-wwm-cased
       size_factor: 3
     accuracy:
-      name: mrm8488/RuPERTa-base
+      name: dccuchile/bert-base-spanish-wwm-cased
       size_factor: 3
 sv:
   word_vectors: null
@@ -101,3 +101,21 @@ pl:
     accuracy:
       name: dkleczek/bert-base-polish-cased-v1
       size_factor: 3
+nl:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: pdelobelle/robbert-v2-dutch-base
+      size_factor: 3
+    accuracy:
+      name: pdelobelle/robbert-v2-dutch-base
+      size_factor: 3
+pt:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: neuralmind/bert-base-portuguese-cased
+      size_factor: 3
+    accuracy:
+      name: neuralmind/bert-base-portuguese-cased
+      size_factor: 3
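
The recommendations file maps a language code to suggested word vectors and to a transformer
model per optimization goal. A small sketch of looking up such an entry (assuming the YAML shown
here is saved locally and PyYAML is installed):

import yaml

with open("quickstart_training_recommendations.yml", encoding="utf8") as f:
    recommendations = yaml.safe_load(f)

lang, optimize = "nl", "accuracy"
rec = recommendations[lang]
transformer = rec["transformer"][optimize]
print(transformer["name"], transformer["size_factor"])  # pdelobelle/robbert-v2-dutch-base 3
print(rec["word_vectors"])  # None, per the entry added above
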
@@ -1,23 +1,15 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import srsly
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
-from thinc.api import Config, Optimizer
-import random
 import typer
 import logging
+import sys

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-from ..language import Language
+from ._util import import_code, setup_gpu
+from ..training.loop import train
+from ..training.initialize import init_nlp
 from .. import util
-from ..training.example import Example
-from ..errors import Errors


 @app.command(
@@ -28,10 +20,9 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     """
@@ -48,381 +39,21 @@ def train_cli(

     DOCS: https://nightly.spacy.io/api/cli#train
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
-    verify_cli_args(config_path, output_path)
-    overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
-    train(
-        config_path,
-        output_path=output_path,
-        config_overrides=overrides,
-        use_gpu=use_gpu,
-        resume_training=resume,
-    )
-
-
-def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
-    config_overrides: Dict[str, Any] = {},
-    use_gpu: int = -1,
-    resume_training: bool = False,
-) -> None:
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    msg.info(f"Loading config and nlp from: {config_path}")
-    with show_validation_error(config_path):
-        config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=True
-        )
-    if config.get("training", {}).get("seed") is not None:
-        fix_random_seed(config["training"]["seed"])
-    if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
-        # It feels kind of weird to not have a default for this.
-        use_pytorch_for_gpu_memory()
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error(config_path):
-        nlp, config = util.load_model_from_config(config)
-    if config["training"]["vectors"] is not None:
-        util.load_vectors_into_model(nlp, config["training"]["vectors"])
-    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
-    T_cfg = config["training"]
-    optimizer = T_cfg["optimizer"]
-    train_corpus = T_cfg["train_corpus"]
-    dev_corpus = T_cfg["dev_corpus"]
-    batcher = T_cfg["batcher"]
-    train_logger = T_cfg["logger"]
-    # Components that shouldn't be updated during training
-    frozen_components = T_cfg["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-        # Verify the config after calling 'begin_training' to ensure labels are properly initialized
-        verify_config(nlp)
-
-    if tag_map:
-        # Replace tag map with provided mapping
-        nlp.vocab.morphology.load_tag_map(tag_map)
-    if morph_rules:
-        # Load morph rules
-        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
-            msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
-                exits=1,
-            )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
-
-    # Create iterator, which yields out info after each optimization step.
-    msg.info("Start training")
-    score_weights = T_cfg["score_weights"]
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T_cfg["dropout"],
-        accumulate_gradient=T_cfg["accumulate_gradient"],
-        patience=T_cfg["patience"],
-        max_steps=T_cfg["max_steps"],
-        eval_frequency=T_cfg["eval_frequency"],
-        raw_text=None,
-        exclude=frozen_components,
-    )
-    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
-    print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    update_meta(T_cfg, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-        if output_path is not None:
-            final_model_path = output_path / "model-final"
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    raw_text: List[Dict[str, str]],
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-    nlp: The spaCy pipeline to evaluate.
-    optimizer: The optimizer callable.
-    train_data (Iterable[Batch]): A generator of batches, with the training
-        data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-        data iterable needs to take care of iterating over the epochs and
-        shuffling.
-    evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-        The callback should take no arguments and return a tuple
-        `(main_score, other_scores)`. The main_score should be a float where
-        higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-        epoch (int): How many passes over the data have been completed.
-        step (int): How many steps have been completed.
-        score (float): The main score from the last evaluation.
-        other_scores: : The other scores from the last evaluation.
-        losses: The accumulated losses throughout training.
-        checkpoints: A list of previous results, where each result is a
-            (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    if raw_text:
-        random.shuffle(raw_text)
-        raw_examples = [
-            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
-        ]
-        raw_batches = util.minibatch(raw_examples, size=8)
-
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-            if raw_text:
-                # If raw text is available, perform 'rehearsal' updates,
-                # which use unlabelled data to reduce overfitting.
-                raw_batch = list(next(raw_batches))
-                nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
-
-
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    # TODO: separate checks from loading
-    raw_text = util.ensure_path(config["training"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
-    tag_map = {}
-    morph_rules = {}
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return raw_text, tag_map, morph_rules, weights_data
-
-
-def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     # Make sure all files and paths exists if they are needed
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
-    if output_path is not None:
-        if not output_path.exists():
-            output_path.mkdir()
-            msg.good(f"Created output directory: {output_path}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
+    if output_path is not None and not output_path.exists():
+        output_path.mkdir()
+        msg.good(f"Created output directory: {output_path}")
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    with show_validation_error(config_path, hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    msg.good("Initialized pipeline")
+    msg.divider("Training pipeline")
+    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
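
After this refactor the train CLI is a thin wrapper: it loads the config, builds and initializes
the pipeline via init_nlp, and hands it to the training loop in spacy.training.loop. The same
flow can be driven programmatically; a minimal sketch (the paths are placeholders):

from pathlib import Path
from spacy import util
from spacy.training.initialize import init_nlp
from spacy.training.loop import train

# Load the config without resolving ${...} references yet, as the CLI does.
config = util.load_config(Path("config.cfg"), interpolate=False)
nlp = init_nlp(config, use_gpu=-1)        # build and initialize the pipeline
train(nlp, Path("./output"), use_gpu=-1)  # run the training loop and save the trained pipeline
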
@@ -22,6 +22,11 @@ try:
 except ImportError:
     cupy = None

+try:  # Python 3.8+
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401

 pickle = pickle
@@ -1,18 +1,22 @@
 [paths]
-train = ""
-dev = ""
-raw = null
+train = null
+dev = null
+vectors = null
 init_tok2vec = null

 [system]
 seed = 0
-use_pytorch_for_gpu_memory = false
+gpu_allocator = null

 [nlp]
 lang = null
+# List of pipeline component names, in order. The names should correspond to
+# components defined in the [components block]
 pipeline = []
+# Components that are loaded but disabled by default
 disabled = []
-load_vocab_data = true
+# Optional callbacks to modify the nlp object before it's initialized, after
+# it's created and after the pipeline has been set up
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -20,31 +24,13 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"

+# The pipeline components and their models
 [components]

-# Training hyper-parameters and additional features.
-[training]
-seed = ${system.seed}
-dropout = 0.1
-accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-# Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
-max_epochs = 0
-max_steps = 20000
-eval_frequency = 200
-# Control how scores are printed and checkpoints are evaluated.
-score_weights = {}
-# Names of pipeline components that shouldn't be updated during training
-frozen_components = []
+# Readers for corpora like dev and train.
+[corpora]

-[training.logger]
-@loggers = "spacy.ConsoleLogger.v1"
-
-[training.train_corpus]
+[corpora.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 # Whether to train on sequences with 'gold standard' sentence boundaries
@@ -55,8 +41,12 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Apply some simply data augmentation, where we replace tokens with variations.
+# This is especially useful for punctuation and case replacement, to help
+# generalize beyond corpora that don't/only have smart quotes etc.
+augmenter = null

-[training.dev_corpus]
+[corpora.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 # Whether to train on sequences with 'gold standard' sentence boundaries
@@ -67,6 +57,33 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Optional callback for data augmentation
+augmenter = null
+
+# Training hyper-parameters and additional features.
+[training]
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+# Control how scores are printed and checkpoints are evaluated.
+score_weights = {}
+# Names of pipeline components that shouldn't be updated during training
+frozen_components = []
+# Location in the config where the dev corpus is defined
+dev_corpus = "corpora.dev"
+# Location in the config where the train corpus is defined
+train_corpus = "corpora.train"
+# Optional callback before nlp object is saved to disk after training
+before_to_disk = null
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"

 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
@@ -89,3 +106,19 @@ grad_clip = 1.0
 use_averages = false
 eps = 1e-8
 learn_rate = 0.001
+
+# These settings are used when nlp.initialize() is called (typically before
+# training or pretraining). Components and the tokenizer can each define their
+# own arguments via their initialize methods that are populated by the config.
+# This lets them gather data resources, build label sets etc.
+[initialize]
+vectors = ${paths.vectors}
+# Extra resources for transfer-learning or pseudo-rehearsal
+init_tok2vec = ${paths.init_tok2vec}
+# Data and lookups for vocabulary
+vocab_data = null
+lookups = null
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments for initialize methods of the components (keyed by component)
+components = {}
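
Values such as ${paths.train} and ${system.gpu_allocator} in this default config are variable
references that are only resolved when the config is interpolated. A short sketch using the
Config class from thinc that spaCy's config system builds on (the excerpt below only mirrors a
few of the sections above):

from thinc.api import Config

cfg_text = """
[paths]
train = "corpus/train.spacy"
dev = "corpus/dev.spacy"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
"""

config = Config().from_str(cfg_text, interpolate=False)
print(config["corpora"]["train"]["path"])    # the raw reference "${paths.train}"
resolved = config.interpolate()
print(resolved["corpora"]["train"]["path"])  # "corpus/train.spacy"
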
@@ -1,9 +1,13 @@
+[paths]
+raw_text = null
+
 [pretraining]
 max_epochs = 1000
 dropout = 0.2
 n_save_every = null
 component = "tok2vec"
 layer = ""
+corpus = "corpora.pretrain"

 [pretraining.batcher]
 @batchers = "spacy.batch_by_words.v1"
@@ -12,13 +16,6 @@ discard_oversize = false
 tolerance = 0.2
 get_length = null

-[pretraining.corpus]
-@readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
-min_length = 5
-max_length = 500
-limit = 0
-
 [pretraining.objective]
 type = "characters"
 n_characters = 4
@@ -33,3 +30,12 @@ grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001
+
+[corpora]
+
+[corpora.pretrain]
+@readers = "spacy.JsonlCorpus.v1"
+path = ${paths.raw_text}
+min_length = 5
+max_length = 500
+limit = 0
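
Because the pretraining corpus path is now a ${paths.raw_text} reference, it can be supplied as a
config override instead of being edited into the file. A sketch using spaCy's config loader (the
config file path and the JSONL path are placeholders; the same dotted override syntax also works
as a command-line argument to the pretrain command):

from spacy import util

config = util.load_config(
    "default_config_pretraining.cfg",               # placeholder path to the config shown above
    overrides={"paths.raw_text": "./texts.jsonl"},  # dotted override for the ${paths.raw_text} value
    interpolate=True,
)
print(config["corpora"]["pretrain"]["path"])  # "./texts.jsonl"
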
@@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         warnings.warn(Warnings.W005)
     if options.get("collapse_phrases", False):
         with doc.retokenize() as retokenizer:
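
The displaCy change above swaps the removed Doc.is_parsed flag for the v3 Doc.has_annotation
check. The same guard works in user code; a quick sketch (the model name is a placeholder for
any pipeline that includes a parser):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # placeholder: any pipeline with a parser
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

if doc.has_annotation("DEP"):       # replaces the old doc.is_parsed check
    svg = displacy.render(doc, style="dep")
else:
    print("No dependency parse available for this Doc.")
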
407  spacy/errors.py

@@ -16,8 +16,6 @@ def add_codes(err_cls):

 @add_codes
 class Warnings:
-    W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
-            "using ftfy.fix_text if necessary.")
     W005 = ("Doc object not parsed. This means displaCy won't be able to "
             "generate a dependency visualization for it. Make sure the Doc "
             "was processed with a model that supports dependency parsing, and "
@@ -51,22 +49,23 @@ class Warnings:
     W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
     W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
             "ignoring the duplicate entry.")
-    W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
-            "loaded. (Shape: {shape})")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
+            "you are constructing a parse tree incrementally by setting "
+            "token.head values, you can probably ignore this warning. Consider "
+            "using Doc(words, ..., heads=heads, deps=deps) instead.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
     W028 = ("Doc.from_array was called with a vector of type '{type}', "
-            "but is expecting one of type 'uint64' instead. This may result "
+            "but is expecting one of type uint64 instead. This may result "
             "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
-            "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
     W033 = ("Training a new {model} using a model with no lexeme normalization "
@@ -76,13 +75,18 @@ class Warnings:
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
             "are currently: {langs}")
-    W034 = ("Please install the package spacy-lookups-data in order to include "
-            "the default lexeme normalization table for the language '{lang}'.")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")

     # TODO: fix numbering after merging develop into master
-    W090 = ("Could not locate any binary .spacy files in path '{path}'.")
+    W088 = ("The pipeline component {name} implements a `begin_training` "
+            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
+            "has been renamed to `initialize`, so you likely want to rename the "
+            "component method. See the documentation for details: "
+            "https://nightly.spacy.io/api/language#initialize")
+    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
+            "to `nlp.initialize`.")
+    W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "
@@ -99,37 +103,33 @@ class Warnings:
             "download a newer compatible model or retrain your custom model "
             "with the current spaCy version. For more details and available "
             "updates, run: python -m spacy validate")
-    W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
-            "instead.")
-    W097 = ("No Model config was provided to create the '{name}' component, "
-            "and no default configuration could be found either.")
-    W098 = ("No Model config was provided to create the '{name}' component, "
-            "so a default configuration was used.")
-    W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
-            "but got '{type}' instead, so ignoring it.")
+    W096 = ("The method `nlp.disable_pipes` is now deprecated - use "
+            "`nlp.select_pipes` instead.")
     W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
             "string \"Field1=Value1,Value2|Field2=Value3\".")
-    W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
+    W101 = ("Skipping Doc custom extension '{name}' while merging docs.")
     W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
     W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
-    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
-            "need to match on a stream of documents, you can use nlp.pipe and "
+    W105 = ("As of spaCy v3.0, the `{matcher}.pipe` method is deprecated. If you "
+            "need to match on a stream of documents, you can use `nlp.pipe` and "
             "call the {matcher} on each Doc object.")
+    W107 = ("The property `Doc.{prop}` is deprecated. Use "
+            "`Doc.has_annotation(\"{attr}\")` instead.")


 @add_codes
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
-            "This usually happens when spaCy calls nlp.{method} with custom "
+            "This usually happens when spaCy calls `nlp.{method}` with custom "
             "component name that's not registered on the current language class. "
             "If you're using a custom component, make sure you've added the "
-            "decorator @Language.component (for function components) or "
-            "@Language.factory (for class components).\n\nAvailable "
+            "decorator `@Language.component` (for function components) or "
+            "`@Language.factory` (for class components).\n\nAvailable "
            "factories: {opts}")
     E003 = ("Not a valid pipeline component. Expected callable, but "
             "got {component} (name: '{name}'). If you're using a custom "
@@ -147,14 +147,13 @@ class Errors:
     E008 = ("Can't restore disabled pipeline component '{name}' because it "
             "doesn't exist in the pipeline anymore. If you want to remove "
             "components from the pipeline, you should do it before calling "
-            "`nlp.select_pipes()` or after restoring the disabled components.")
+            "`nlp.select_pipes` or after restoring the disabled components.")
     E010 = ("Word vectors set to length 0. This may be because you don't have "
             "a model installed or loaded, or because your model doesn't "
             "include word vectors. For more info, see the docs:\n"
             "https://nightly.spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E014 = ("Unknown tag ID: {tag}")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add unicode or bytes. Got type: {value_type}")
@@ -170,33 +169,25 @@ class Errors:
             "For example, are all labels added to the model? If you're "
             "training a named entity recognizer, also make sure that none of "
             "your annotated entity spans have leading or trailing whitespace "
-            "or punctuation. "
-            "You can also use the experimental `debug data` command to "
+            "or punctuation. You can also use the `debug data` command to "
             "validate your JSON-formatted training data. For details, run:\n"
             "python -m spacy debug data --help")
     E025 = ("String is too long: {length} characters. Max is 2**30.")
     E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
             "length {length}.")
-    E027 = ("Arguments 'words' and 'spaces' should be sequences of the same "
-            "length, or 'spaces' should be left default at None. spaces "
+    E027 = ("Arguments `words` and `spaces` should be sequences of the same "
+            "length, or `spaces` should be left default at None. `spaces` "
             "should be a sequence of booleans, with True meaning that the "
             "word owns a ' ' character following it.")
-    E028 = ("orths_and_spaces expects either a list of unicode string or a "
-            "list of (unicode, bool) tuples. Got bytes instance: {value}")
-    E029 = ("noun_chunks requires the dependency parse, which requires a "
+    E028 = ("`words` expects a list of unicode strings, but got bytes instance: {value}")
+    E029 = ("`noun_chunks` requires the dependency parse, which requires a "
             "statistical model to be installed and loaded. For more info, see "
             "the documentation:\nhttps://nightly.spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
-            "component to the pipeline with: "
-            "nlp.add_pipe('sentencizer'). "
-            "Alternatively, add the dependency parser, or set sentence "
-            "boundaries by setting doc[i].is_sent_start.")
+            "component to the pipeline with: `nlp.add_pipe('sentencizer')`. "
+            "Alternatively, add the dependency parser or sentence recognizer, "
+            "or set sentence boundaries by setting `doc[i].is_sent_start`.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
-    E032 = ("Conflicting attributes specified in doc.from_array(): "
-            "(HEAD, SENT_START). The HEAD attribute currently sets sentence "
-            "boundaries implicitly, based on the tree structure. This means "
-            "the HEAD attribute would potentially override the sentence "
-            "boundaries set by SENT_START.")
     E033 = ("Cannot load into non-empty Doc of length {length}.")
     E035 = ("Error creating span with start {start} and end {end} for Doc of "
             "length {length}.")
@@ -209,7 +200,7 @@ class Errors:
             "issue here: http://github.com/explosion/spaCy/issues")
     E040 = ("Attempt to access token at {i}, max length {max_length}.")
     E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
-    E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.")
+    E042 = ("Error accessing `doc[{i}].nbor({j})`, for doc of length {length}.")
     E043 = ("Refusing to write to token.sent_start if its document is parsed, "
             "because this may cause inconsistent state.")
     E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
@@ -229,7 +220,7 @@ class Errors:
     E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
             "original string.\nKey: {key}\nOrths: {orths}")
     E057 = ("Stepped slices not supported in Span objects. Try: "
|
||||||
"list(tokens)[start:stop:step] instead.")
|
"`list(tokens)[start:stop:step]` instead.")
|
||||||
E058 = ("Could not retrieve vector for key {key}.")
|
E058 = ("Could not retrieve vector for key {key}.")
|
||||||
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
||||||
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
||||||
|
@ -238,7 +229,7 @@ class Errors:
|
||||||
"and 63 are occupied. You can replace one by specifying the "
|
"and 63 are occupied. You can replace one by specifying the "
|
||||||
"`flag_id` explicitly, e.g. "
|
"`flag_id` explicitly, e.g. "
|
||||||
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
|
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
|
||||||
E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 "
|
E063 = ("Invalid value for `flag_id`: {value}. Flag IDs must be between 1 "
|
||||||
"and 63 (inclusive).")
|
"and 63 (inclusive).")
|
||||||
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
|
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
|
||||||
"string, the lexeme returned had an orth ID that did not match "
|
"string, the lexeme returned had an orth ID that did not match "
|
||||||
|
@ -267,7 +258,7 @@ class Errors:
|
||||||
E085 = ("Can't create lexeme for string '{string}'.")
|
E085 = ("Can't create lexeme for string '{string}'.")
|
||||||
E087 = ("Unknown displaCy style: {style}.")
|
E087 = ("Unknown displaCy style: {style}.")
|
||||||
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
||||||
"v2.x parser and NER models require roughly 1GB of temporary "
|
"parser and NER models require roughly 1GB of temporary "
|
||||||
"memory per 100,000 characters in the input. This means long "
|
"memory per 100,000 characters in the input. This means long "
|
||||||
"texts may cause memory allocation errors. If you're not using "
|
"texts may cause memory allocation errors. If you're not using "
|
||||||
"the parser or NER, it's probably safe to increase the "
|
"the parser or NER, it's probably safe to increase the "
|
||||||
|
@ -284,8 +275,8 @@ class Errors:
|
||||||
E094 = ("Error reading line {line_num} in vectors file {loc}.")
|
E094 = ("Error reading line {line_num} in vectors file {loc}.")
|
||||||
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
||||||
"error. Are you writing to a default function argument?")
|
"error. Are you writing to a default function argument?")
|
||||||
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
E096 = ("Invalid object passed to displaCy: Can only visualize `Doc` or "
|
||||||
"Span objects, or dicts if set to manual=True.")
|
"Span objects, or dicts if set to `manual=True`.")
|
||||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||||
"phrase pattern (string) but got:\n{pattern}")
|
"phrase pattern (string) but got:\n{pattern}")
|
||||||
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
|
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
|
||||||
|
@ -302,11 +293,11 @@ class Errors:
|
||||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||||
"token can only be part of one entity, so make sure the entities "
|
"token can only be part of one entity, so make sure the entities "
|
||||||
"you're setting don't overlap.")
|
"you're setting don't overlap.")
|
||||||
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
||||||
"settings: {opts}")
|
"settings: {opts}")
|
||||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
|
||||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"call begin_training()?")
|
"call `initialize()`?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||||
"of the parent Doc and can't exist on their own. A pickled token "
|
"of the parent Doc and can't exist on their own. A pickled token "
|
||||||
|
@ -323,8 +314,8 @@ class Errors:
|
||||||
E117 = ("The newly split tokens must match the text of the original token. "
|
E117 = ("The newly split tokens must match the text of the original token. "
|
||||||
"New orths: {new}. Old text: {old}.")
|
"New orths: {new}. Old text: {old}.")
|
||||||
E118 = ("The custom extension attribute '{attr}' is not registered on the "
|
E118 = ("The custom extension attribute '{attr}' is not registered on the "
|
||||||
"Token object so it can't be set during retokenization. To "
|
"`Token` object so it can't be set during retokenization. To "
|
||||||
"register an attribute, use the Token.set_extension classmethod.")
|
"register an attribute, use the `Token.set_extension` classmethod.")
|
||||||
E119 = ("Can't set custom extension attribute '{attr}' during "
|
E119 = ("Can't set custom extension attribute '{attr}' during "
|
||||||
"retokenization because it's not writable. This usually means it "
|
"retokenization because it's not writable. This usually means it "
|
||||||
"was registered with a getter function (and no setter) or as a "
|
"was registered with a getter function (and no setter) or as a "
|
||||||
|
@ -348,7 +339,7 @@ class Errors:
|
||||||
E130 = ("You are running a narrow unicode build, which is incompatible "
|
E130 = ("You are running a narrow unicode build, which is incompatible "
|
||||||
"with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
|
"with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
|
||||||
"unicode build instead. You can also rebuild Python and set the "
|
"unicode build instead. You can also rebuild Python and set the "
|
||||||
"--enable-unicode=ucs4 flag.")
|
"`--enable-unicode=ucs4 flag`.")
|
||||||
E131 = ("Cannot write the kb_id of an existing Span object because a Span "
|
E131 = ("Cannot write the kb_id of an existing Span object because a Span "
|
||||||
"is a read-only view of the underlying Token objects stored in "
|
"is a read-only view of the underlying Token objects stored in "
|
||||||
"the Doc. Instead, create a new Span object and specify the "
|
"the Doc. Instead, create a new Span object and specify the "
|
||||||
|
@ -361,27 +352,20 @@ class Errors:
|
||||||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||||
"exceed 1, but found {sum}.")
|
"exceed 1, but found {sum}.")
|
||||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||||
"to provide a valid JSON object as input with either the `text` "
|
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||||
"or `tokens` key. For more info, see the docs:\n"
|
|
||||||
"https://nightly.spacy.io/api/cli#pretrain-jsonl")
|
|
||||||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
|
||||||
"includes either the `text` or `tokens` key. For more info, see "
|
|
||||||
"the docs:\nhttps://nightly.spacy.io/api/cli#pretrain-jsonl")
|
|
||||||
E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
|
|
||||||
"kb.add_entity and kb.add_alias to add entries.")
|
|
||||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||||
"should be of equal length.")
|
"should be of equal length.")
|
||||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||||
"provided {found}.")
|
"provided {found}.")
|
||||||
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
||||||
"by calling add_label, or by providing a representative batch of "
|
"by calling add_label, or by providing a representative batch of "
|
||||||
"examples to the component's begin_training method.")
|
"examples to the component's `initialize` method.")
|
||||||
E145 = ("Error reading `{param}` from input file.")
|
E145 = ("Error reading `{param}` from input file.")
|
||||||
E146 = ("Could not access `{path}`.")
|
E146 = ("Could not access {path}.")
|
||||||
E147 = ("Unexpected error in the {method} functionality of the "
|
E147 = ("Unexpected error in the {method} functionality of the "
|
||||||
"EntityLinker: {msg}. This is likely a bug in spaCy, so feel free "
|
"EntityLinker: {msg}. This is likely a bug in spaCy, so feel free "
|
||||||
"to open an issue.")
|
"to open an issue: https://github.com/explosion/spaCy/issues")
|
||||||
E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that "
|
E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that "
|
||||||
"each entity in `doc.ents` is assigned to a KB identifier.")
|
"each entity in `doc.ents` is assigned to a KB identifier.")
|
||||||
E149 = ("Error deserializing model. Check that the config used to create "
|
E149 = ("Error deserializing model. Check that the config used to create "
|
||||||
|
@ -389,22 +373,18 @@ class Errors:
|
||||||
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
||||||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||||
E152 = ("The attribute {attr} is not supported for token patterns. "
|
E152 = ("The attribute {attr} is not supported for token patterns. "
|
||||||
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
|
||||||
"or EntityRuler for more details.")
|
"or EntityRuler for more details.")
|
||||||
E153 = ("The value type {vtype} is not supported for token patterns. "
|
E153 = ("The value type {vtype} is not supported for token patterns. "
|
||||||
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||||
"or EntityRuler for more details.")
|
"or EntityRuler for more details.")
|
||||||
E154 = ("One of the attributes or values is not supported for token "
|
E154 = ("One of the attributes or values is not supported for token "
|
||||||
"patterns. Please use the option validate=True with Matcher, "
|
"patterns. Please use the option `validate=True` with the Matcher, "
|
||||||
"PhraseMatcher, or EntityRuler for more details.")
|
"PhraseMatcher, or EntityRuler for more details.")
|
||||||
E155 = ("The pipeline needs to include a tagger in order to use "
|
E155 = ("The pipeline needs to include a {pipe} in order to use "
|
||||||
"Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
|
"Matcher or PhraseMatcher with the attribute {attr}. "
|
||||||
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
|
"Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
|
||||||
"instead of list(nlp.tokenizer.pipe()).")
|
"instead of `list(nlp.tokenizer.pipe())`.")
|
||||||
E156 = ("The pipeline needs to include a parser in order to use "
|
|
||||||
"Matcher or PhraseMatcher with the attribute DEP. Try using "
|
|
||||||
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
|
|
||||||
"list(nlp.tokenizer.pipe()).")
|
|
||||||
E157 = ("Can't render negative values for dependency arc start or end. "
|
E157 = ("Can't render negative values for dependency arc start or end. "
|
||||||
"Make sure that you're passing in absolute token indices, not "
|
"Make sure that you're passing in absolute token indices, not "
|
||||||
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
||||||
|
@ -413,16 +393,14 @@ class Errors:
|
||||||
E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
|
E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
|
||||||
E160 = ("Can't find language data file: {path}")
|
E160 = ("Can't find language data file: {path}")
|
||||||
E161 = ("Found an internal inconsistency when predicting entity links. "
|
E161 = ("Found an internal inconsistency when predicting entity links. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue: "
|
||||||
E162 = ("Cannot evaluate textcat model on data with different labels.\n"
|
"https://github.com/explosion/spaCy/issues")
|
||||||
"Labels in model: {model_labels}\nLabels in evaluation "
|
|
||||||
"data: {eval_labels}")
|
|
||||||
E163 = ("cumsum was found to be unstable: its last element does not "
|
E163 = ("cumsum was found to be unstable: its last element does not "
|
||||||
"correspond to sum")
|
"correspond to sum")
|
||||||
E164 = ("x is neither increasing nor decreasing: {}.")
|
E164 = ("x is neither increasing nor decreasing: {x}.")
|
||||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
||||||
"that case.")
|
"that case.")
|
||||||
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
|
||||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||||
E169 = ("Can't find module: {module}")
|
E169 = ("Can't find module: {module}")
|
||||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||||
|
@ -434,10 +412,10 @@ class Errors:
|
||||||
E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you "
|
E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you "
|
||||||
"accidentally passed a single pattern to Matcher.add instead of a "
|
"accidentally passed a single pattern to Matcher.add instead of a "
|
||||||
"list of patterns? If you only want to add one pattern, make sure "
|
"list of patterns? If you only want to add one pattern, make sure "
|
||||||
"to wrap it in a list. For example: matcher.add('{key}', [pattern])")
|
"to wrap it in a list. For example: `matcher.add('{key}', [pattern])`")
|
||||||
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
|
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
|
||||||
"Doc. If you only want to add one pattern, make sure to wrap it "
|
"Doc. If you only want to add one pattern, make sure to wrap it "
|
||||||
"in a list. For example: matcher.add('{key}', [doc])")
|
"in a list. For example: `matcher.add('{key}', [doc])`")
|
||||||
E180 = ("Span attributes can't be declared as required or assigned by "
|
E180 = ("Span attributes can't be declared as required or assigned by "
|
||||||
"components, since spans are only views of the Doc. Use Doc and "
|
"components, since spans are only views of the Doc. Use Doc and "
|
||||||
"Token attributes (or custom extension attributes) only and remove "
|
"Token attributes (or custom extension attributes) only and remove "
|
||||||
|
@ -445,17 +423,16 @@ class Errors:
|
||||||
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
|
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
|
||||||
"Only Doc and Token attributes are supported.")
|
"Only Doc and Token attributes are supported.")
|
||||||
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
|
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
|
||||||
"to define the attribute? For example: {attr}.???")
|
"to define the attribute? For example: `{attr}.???`")
|
||||||
E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
|
E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
|
||||||
"attributes are supported, for example: {solution}")
|
"attributes are supported, for example: {solution}")
|
||||||
E184 = ("Only attributes without underscores are supported in component "
|
E184 = ("Only attributes without underscores are supported in component "
|
||||||
"attribute declarations (because underscore and non-underscore "
|
"attribute declarations (because underscore and non-underscore "
|
||||||
"attributes are connected anyways): {attr} -> {solution}")
|
"attributes are connected anyways): {attr} -> {solution}")
|
||||||
E185 = ("Received invalid attribute in component attribute declaration: "
|
E185 = ("Received invalid attribute in component attribute declaration: "
|
||||||
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
|
"`{obj}.{attr}`\nAttribute '{attr}' does not exist on {obj}.")
|
||||||
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
|
|
||||||
E187 = ("Only unicode strings are supported as labels.")
|
E187 = ("Only unicode strings are supported as labels.")
|
||||||
E189 = ("Each argument to `get_doc` should be of equal length.")
|
E189 = ("Each argument to `Doc.__init__` should be of equal length.")
|
||||||
E190 = ("Token head out of range in `Doc.from_array()` for token index "
|
E190 = ("Token head out of range in `Doc.from_array()` for token index "
|
||||||
"'{index}' with value '{value}' (equivalent to relative head "
|
"'{index}' with value '{value}' (equivalent to relative head "
|
||||||
"index: '{rel_head_index}'). The head indices should be relative "
|
"index: '{rel_head_index}'). The head indices should be relative "
|
||||||
|
@ -469,56 +446,99 @@ class Errors:
|
||||||
"({curr_dim}).")
|
"({curr_dim}).")
|
||||||
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
|
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
|
||||||
E195 = ("Matcher can be called on {good} only, got {got}.")
|
E195 = ("Matcher can be called on {good} only, got {got}.")
|
||||||
E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
|
E196 = ("Refusing to write to `token.is_sent_end`. Sentence boundaries can "
|
||||||
"only be fixed with token.is_sent_start.")
|
"only be fixed with `token.is_sent_start`.")
|
||||||
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
|
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||||
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||||
"can not be combined with adding a pretrained Tok2Vec layer.")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
E201 = ("Span index out of range.")
|
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
||||||
|
"is not set or None. If you've implemented a custom component, make "
|
||||||
|
"sure to store the component model as `self.model` in your "
|
||||||
|
"component's __init__ method.")
|
||||||
|
E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
|
||||||
|
"is not set or None. If you've implemented a custom component, make "
|
||||||
|
"sure to store the current `nlp` object's vocab as `self.vocab` in "
|
||||||
|
"your component's __init__ method.")
|
||||||
|
E900 = ("Could not run the full pipeline for evaluation. If you specified "
|
||||||
|
"frozen components, make sure they were already initialized and "
|
||||||
|
"trained. Full pipeline: {pipeline}")
|
||||||
|
E901 = ("Failed to remove existing output directory: {path}. If your "
|
||||||
|
"config and the components you train change between runs, a "
|
||||||
|
"non-empty output directory can lead to stale pipeline data. To "
|
||||||
|
"solve this, remove the existing directories in the output directory.")
|
||||||
|
E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
|
||||||
|
"Try checking whitespace and delimiters. See "
|
||||||
|
"https://nightly.spacy.io/api/cli#convert")
|
||||||
|
E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
|
||||||
|
"whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
|
||||||
|
E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
|
||||||
|
"dimension refers to the output width, after the linear projection "
|
||||||
|
"has been applied.")
|
||||||
|
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
|
||||||
|
"dimension refers to the width of the vectors table.")
|
||||||
|
E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
|
||||||
|
E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
|
||||||
|
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
|
||||||
|
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
|
||||||
|
E910 = ("Encountered NaN value when computing loss for component '{name}'.")
|
||||||
|
E911 = ("Invalid feature: {feat}. Must be a token attribute.")
|
||||||
|
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
|
||||||
|
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
|
||||||
|
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||||
|
"config.cfg or override it on the CLI?")
|
||||||
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
|
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||||
|
"the modified object in your function?")
|
||||||
|
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
|
||||||
|
"float or int but got: {score_type}. To exclude the score from the "
|
||||||
|
"final score, set its weight to null in the [training.score_weights] "
|
||||||
|
"section of your training config.")
|
||||||
|
E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
|
||||||
|
E917 = ("Received invalid value {value} for `state_type` in "
|
||||||
|
"TransitionBasedParser: only 'parser' or 'ner' are valid options.")
|
||||||
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
|
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
|
||||||
"values are an instance of spacy.vocab.Vocab or True to create one"
|
"values are an instance of `spacy.vocab.Vocab` or True to create one"
|
||||||
" (default).")
|
" (default).")
|
||||||
E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
|
E919 = ("A textcat `positive_label` '{pos_label}' was provided for training "
|
||||||
"data that does not appear to be a binary classification problem "
|
"data that does not appear to be a binary classification problem "
|
||||||
"with two labels. Labels found: {labels}")
|
"with two labels. Labels found: {labels}")
|
||||||
E920 = ("The textcat's 'positive_label' config setting '{pos_label}' "
|
E920 = ("The textcat's `positive_label` setting '{pos_label}' "
|
||||||
"does not match any label in the training data. Labels found: {labels}")
|
"does not match any label in the training data or provided during "
|
||||||
E921 = ("The method 'set_output' can only be called on components that have "
|
"initialization. Available labels: {labels}")
|
||||||
"a Model with a 'resize_output' attribute. Otherwise, the output "
|
E921 = ("The method `set_output` can only be called on components that have "
|
||||||
|
"a Model with a `resize_output` attribute. Otherwise, the output "
|
||||||
"layer can not be dynamically changed.")
|
"layer can not be dynamically changed.")
|
||||||
E922 = ("Component '{name}' has been initialized with an output dimension of "
|
E922 = ("Component '{name}' has been initialized with an output dimension of "
|
||||||
"{nO} - cannot add any more labels.")
|
"{nO} - cannot add any more labels.")
|
||||||
E923 = ("It looks like there is no proper sample data to initialize the "
|
E923 = ("It looks like there is no proper sample data to initialize the "
|
||||||
"Model of component '{name}'. "
|
"Model of component '{name}'. This is likely a bug in spaCy, so "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"feel free to open an issue: https://github.com/explosion/spaCy/issues")
|
||||||
E924 = ("The '{name}' component does not seem to be initialized properly. "
|
E924 = ("The '{name}' component does not seem to be initialized properly. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue: "
|
||||||
|
"https://github.com/explosion/spaCy/issues")
|
||||||
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
|
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
|
||||||
"mapping label names to colors but got: {obj}")
|
"mapping label names to colors but got: {obj}")
|
||||||
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
E926 = ("It looks like you're trying to modify `nlp.{attr}` directly. This "
|
||||||
"doesn't work because it's an immutable computed property. If you "
|
"doesn't work because it's an immutable computed property. If you "
|
||||||
"need to modify the pipeline, use the built-in methods like "
|
"need to modify the pipeline, use the built-in methods like "
|
||||||
"nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe "
|
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
|
||||||
"instead.")
|
"`nlp.enable_pipe` instead.")
|
||||||
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
||||||
"property or default function argument?")
|
"property or default function argument?")
|
||||||
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
|
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
|
||||||
"provided argument {loc} is an existing directory.")
|
"but the provided argument {loc} points to a file.")
|
||||||
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
|
||||||
"not seem to exist.")
|
E930 = ("Received invalid get_examples callback in `{method}`. "
|
||||||
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
|
|
||||||
"Expected function that returns an iterable of Example objects but "
|
"Expected function that returns an iterable of Example objects but "
|
||||||
"got: {obj}")
|
"got: {obj}")
|
||||||
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
|
E931 = ("Encountered {parent} subclass without `{parent}.{method}` "
|
||||||
"'{name}'. If the component is trainable and you want to use this "
|
"method in component '{name}'. If you want to use this "
|
||||||
"method, make sure it's overwritten on the subclass. If your "
|
"method, make sure it's overwritten on the subclass.")
|
||||||
"component isn't trainable, add a method that does nothing or "
|
|
||||||
"don't use the Pipe base class.")
|
|
||||||
E940 = ("Found NaN values in scores.")
|
E940 = ("Found NaN values in scores.")
|
||||||
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
|
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
|
||||||
"model from a shortcut, which is deprecated as of spaCy v3.0. To "
|
"model from a shortcut, which is deprecated as of spaCy v3.0. To "
|
||||||
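Several of the reworded messages in this hunk and the ones above (E109, E143, E930) track the rename of `begin_training` to `initialize`, and E930 spells out what `get_examples` has to be: a zero-argument callable that returns an iterable of `Example` objects. A minimal sketch of that calling pattern against the v3 API described here; the toy training data is invented purely for illustration:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat")

# Invented toy data, just so there is something to pass in.
train_data = [
    ("This is great", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("This is terrible", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data]

# E930 is raised if get_examples is not a callable returning Example objects;
# providing a representative sample like this also lets the component read off
# its labels, which is what E143 asks for.
nlp.initialize(get_examples=lambda: examples)
```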
@@ -527,31 +547,35 @@ class Errors:
"models, see the models directory: https://spacy.io/models. If you "
"want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
-E942 = ("Executing after_{name} callback failed. Expected the function to "
+E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
"you forgot to return the modified object in your function?")
-E943 = ("Executing before_creation callback failed. Expected the function to "
+E943 = ("Executing `before_creation` callback failed. Expected the function to "
"return an uninitialized Language subclass but got: {value}. Maybe "
"you forgot to return the modified object in your function or "
"returned the initialized nlp object instead?")
-E944 = ("Can't copy pipeline component '{name}' from source model '{model}': "
+E944 = ("Can't copy pipeline component '{name}' from source '{model}': "
"not found in pipeline. Available components: {opts}")
E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded "
"nlp object, but got: {source}")
-E947 = ("Matcher.add received invalid 'greedy' argument: expected "
+E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
"a string value from {expected} but got: '{arg}'")
-E948 = ("Matcher.add received invalid 'patterns' argument: expected "
-"a List, but got: {arg_type}")
+E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
+"a list, but got: {arg_type}")
E949 = ("Can only create an alignment when the texts are the same.")
E952 = ("The section '{name}' is not a valid section in the provided config.")
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
-E954 = ("The Tok2Vec listener did not receive a valid input.")
-E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
+E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
+"component.")
+E955 = ("Can't find table(s) {table} for language '{lang}' in "
+"spacy-lookups-data. Make sure you have the package installed or "
+"provide your own lookup tables if no default lookups are available "
+"for your language.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
-E957 = ("Writing directly to Language.factories isn't needed anymore in "
-"spaCy v3. Instead, you can use the @Language.factory decorator "
-"to register your custom component factory or @Language.component "
+E957 = ("Writing directly to `Language.factories` isn't needed anymore in "
+"spaCy v3. Instead, you can use the `@Language.factory` decorator "
+"to register your custom component factory or `@Language.component` "
"to register a simple stateless function component that just takes "
"a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
@@ -569,129 +593,120 @@ class Errors:
"component.\n\n{config}")
E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
"got: {cfg_type}.")
-E963 = ("Can't read component info from @Language.{decorator} decorator. "
+E963 = ("Can't read component info from `@Language.{decorator}` decorator. "
"Maybe you forgot to call it? Make sure you're using "
-"@Language.{decorator}() instead of @Language.{decorator}.")
+"`@Language.{decorator}()` instead of `@Language.{decorator}`.")
E964 = ("The pipeline component factory for '{name}' needs to have the "
"following named arguments, which are passed in by spaCy:\n- nlp: "
"receives the current nlp object and lets you access the vocab\n- "
"name: the name of the component instance, can be used to identify "
"the component, output losses etc.")
-E965 = ("It looks like you're using the @Language.component decorator to "
+E965 = ("It looks like you're using the `@Language.component` decorator to "
"register '{name}' on a class instead of a function component. If "
"you need to register a class or function that *returns* a component "
-"function, use the @Language.factory decorator instead.")
+"function, use the `@Language.factory` decorator instead.")
-E966 = ("nlp.add_pipe now takes the string name of the registered component "
+E966 = ("`nlp.add_pipe` now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component} (name: '{name}').\n\n- If you created your component "
-"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
-"nlp.add_pipe('name') instead.\n\n- If you passed in a component "
-"like TextCategorizer(): call nlp.add_pipe with the string name "
-"instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
-"component: Add the decorator @Language.component (for function "
-"components) or @Language.factory (for class components / factories) "
+"with `nlp.create_pipe('name')`: remove nlp.create_pipe and call "
+"`nlp.add_pipe('name')` instead.\n\n- If you passed in a component "
+"like `TextCategorizer()`: call `nlp.add_pipe` with the string name "
+"instead, e.g. `nlp.add_pipe('textcat')`.\n\n- If you're using a custom "
+"component: Add the decorator `@Language.component` (for function "
+"components) or `@Language.factory` (for class components / factories) "
"to your custom component and assign it a name, e.g. "
-"@Language.component('your_name'). You can then run "
-"nlp.add_pipe('your_name') to add it to the pipeline.")
+"`@Language.component('your_name')`. You can then run "
+"`nlp.add_pipe('your_name')` to add it to the pipeline.")
E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
-E968 = ("nlp.replace_pipe now takes the string name of the registered component "
+E968 = ("`nlp.replace_pipe` now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component}.\n\n- If you created your component with"
-"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
-"nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
-"component like TextCategorizer(): call nlp.replace_pipe with the "
-"string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
+"with `nlp.create_pipe('name')`: remove `nlp.create_pipe` and call "
+"`nlp.replace_pipe('{name}', 'name')` instead.\n\n- If you passed in a "
+"component like `TextCategorizer()`: call `nlp.replace_pipe` with the "
+"string name instead, e.g. `nlp.replace_pipe('{name}', 'textcat')`.\n\n"
"- If you're using a custom component: Add the decorator "
-"@Language.component (for function components) or @Language.factory "
+"`@Language.component` (for function components) or `@Language.factory` "
"(for class components / factories) to your custom component and "
-"assign it a name, e.g. @Language.component('your_name'). You can "
-"then run nlp.replace_pipe('{name}', 'your_name').")
+"assign it a name, e.g. `@Language.component('your_name')`. You can "
+"then run `nlp.replace_pipe('{name}', 'your_name')`.")
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
-E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
+E971 = ("Found incompatible lengths in `Doc.from_array`: {array_length} for the "
"array and {doc_length} for the Doc itself.")
-E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
+E972 = ("`Example.__init__` got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}")
-E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
+E976 = ("The method `Example.from_dict` expects a {type} as {n} argument, "
"but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. "
-"This is likely a bug in spaCy, so feel free to open an issue.")
+"This is likely a bug in spaCy, so feel free to open an issue: "
+"https://github.com/explosion/spaCy/issues")
E978 = ("The {name} method takes a list of Example objects, but got: {types}")
-E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.")
-E981 = ("The offsets of the annotations for 'links' could not be aligned "
+E981 = ("The offsets of the annotations for `links` could not be aligned "
"to token boundaries.")
-E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
"into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
"{keys}")
E984 = ("Invalid component config for '{name}': component block needs either "
-"a key 'factory' specifying the registered function used to "
-"initialize the component, or a key 'source' key specifying a "
-"spaCy model to copy the component from. For example, factory = "
-"\"ner\" will use the 'ner' factory and all other settings in the "
-"block will be passed to it as arguments. Alternatively, source = "
-"\"en_core_web_sm\" will copy the component from that model.\n\n{config}")
+"a key `factory` specifying the registered function used to "
+"initialize the component, or a key `source` key specifying a "
+"spaCy model to copy the component from. For example, `factory = "
+"\"ner\"` will use the 'ner' factory and all other settings in the "
+"block will be passed to it as arguments. Alternatively, `source = "
+"\"en_core_web_sm\"` will copy the component from that model.\n\n{config}")
-E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
+E985 = ("Can't load model from config file: no [nlp] section found.\n\n{config}")
E986 = ("Could not create any training batches: check your input. "
-"Are the train and dev paths defined? "
-"Is 'discard_oversize' set appropriately? ")
-E987 = ("The text of an example training instance is either a Doc or "
-"a string, but found {type} instead.")
-E988 = ("Could not parse any training examples. Ensure the data is "
-"formatted correctly.")
-E989 = ("'nlp.update()' was called with two positional arguments. This "
+"Are the train and dev paths defined? Is `discard_oversize` set appropriately? ")
+E989 = ("`nlp.update()` was called with two positional arguments. This "
"may be due to a backwards-incompatible change to the format "
"of the training data in spaCy 3.0 onwards. The 'update' "
-"function should now be called with a batch of 'Example' "
-"objects, instead of (text, annotation) tuples. ")
+"function should now be called with a batch of Example "
+"objects, instead of `(text, annotation)` tuples. ")
-E991 = ("The function 'select_pipes' should be called with either a "
-"'disable' argument to list the names of the pipe components "
+E991 = ("The function `nlp.select_pipes` should be called with either a "
+"`disable` argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
"specifies which pipes should not be disabled.")
E992 = ("The function `select_pipes` was called with `enable`={enable} "
"and `disable`={disable} but that information is conflicting "
"for the `nlp` pipeline with components {names}.")
-E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
+E993 = ("The config for the nlp object needs to include a key `lang` specifying "
"the code of the language to initialize it with (for example "
-"'en' for English) - this can't be 'None'.\n\n{config}")
+"'en' for English) - this can't be None.\n\n{config}")
-E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
-E999 = ("Unable to merge the `Doc` objects because they do not all share "
+E999 = ("Unable to merge the Doc objects because they do not all share "
"the same `Vocab`.")
-E1000 = ("No pkuseg model available. Provide a pkuseg model when "
-"initializing the pipeline:\n"
-'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
-'nlp = Chinese(config=cfg)')
+E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
+"loaded. Provide the name of a pretrained model or the path to "
+"a model and initialize the pipeline:\n\n"
+'nlp.tokenizer.initialize(pkuseg_model="default")')
E1001 = ("Target token outside of matched span for match with tokens "
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-"Required tables '{tables}', found '{found}'. If you are not "
-"providing custom lookups, make sure you have the package "
-"spacy-lookups-data installed.")
+"Required tables: {tables}. Found: {found}. Maybe you forgot to "
+"call `nlp.initialize()` to load in the data?")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
-"`ORTH` and `NORM`.")
+"ORTH and NORM.")
-E1006 = ("Unable to initialize {name} model with 0 labels.")
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
"that you are providing a list of patterns as `List[List[dict]]`.")
-E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
-"through token.morph_ instead or add the string to the "
-"StringStore with `nlp.vocab.strings.add(string)`.")
-@add_codes
-class TempErrors:
-T003 = ("Resizing pretrained Tagger models is not currently supported.")
-T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
-"issue tracker: http://github.com/explosion/spaCy/issues")
+E1010 = ("Unable to set entity information for token {i} which is included "
+"in more than one span in entities, blocked, missing or outside.")
+E1011 = ("Unsupported default '{default}' in `doc.set_ents`. Available "
+"options: {modes}")
+E1012 = ("Entity spans and blocked/missing/outside spans should be "
+"provided to `doc.set_ents` as lists of Span objects.")
+E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
+"token itself. To set the morph from this MorphAnalysis, set from "
+"the string value with: `token.set_morph(str(other_morph))`.")

# Deprecated model shortcuts, only used in errors and warnings
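The updated E965/E966/E968 messages above describe the spaCy v3 registration flow in prose; as a small illustrative sketch of that flow (the component name here is made up):

```python
import spacy
from spacy.language import Language


@Language.component("my_component")  # register a stateless function component by name
def my_component(doc):
    # A real component would modify or annotate the Doc here.
    return doc


nlp = spacy.blank("en")
nlp.add_pipe("my_component")  # v3: pass the registered string name, not the callable
print(nlp.pipe_names)
```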
@@ -140,7 +140,6 @@ cdef class KnowledgeBase:
self._entries.push_back(entry)
self._aliases_table.push_back(alias)

-cpdef from_disk(self, loc)
cpdef set_entities(self, entity_list, freq_list, vector_list)
62
spacy/kb.pyx
@@ -1,5 +1,7 @@
# cython: infer_types=True, profile=True
-from typing import Iterator
+from typing import Iterator, Iterable
+
+import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
@@ -9,11 +11,11 @@ from libcpp.vector cimport vector

from pathlib import Path
import warnings
-from os import path

from .typedefs cimport hash_t
from .errors import Errors, Warnings
+from . import util
+from .util import SimpleFrozenList, ensure_path

cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
@@ -89,7 +91,6 @@ cdef class KnowledgeBase:
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.vocab = vocab
-self.vocab.strings.add("")
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])

@property
@@ -318,9 +319,30 @@ cdef class KnowledgeBase:

return 0.0

-def to_disk(self, loc):
-cdef Writer writer = Writer(loc)
+def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+path = ensure_path(path)
+if not path.exists():
+path.mkdir(parents=True)
+if not path.is_dir():
+raise ValueError(Errors.E928.format(loc=path))
+serialize = {}
+serialize["contents"] = lambda p: self.write_contents(p)
+serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
+util.to_disk(path, serialize, exclude)
+
+def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+path = ensure_path(path)
+if not path.exists():
+raise ValueError(Errors.E929.format(loc=path))
+if not path.is_dir():
+raise ValueError(Errors.E928.format(loc=path))
+deserialize = {}
+deserialize["contents"] = lambda p: self.read_contents(p)
+deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
+util.from_disk(path, deserialize, exclude)
+
+def write_contents(self, file_path):
+cdef Writer writer = Writer(file_path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)

# dumping the entity vectors in their original order
@@ -359,7 +381,7 @@ cdef class KnowledgeBase:

writer.close()

-cpdef from_disk(self, loc):
+def read_contents(self, file_path):
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
@@ -369,7 +391,7 @@ cdef class KnowledgeBase:
cdef AliasC alias
cdef float vector_element

-cdef Reader reader = Reader(loc)
+cdef Reader reader = Reader(file_path)

# STEP 0: load header and initialize KB
cdef int64_t nr_entities
@@ -450,16 +472,13 @@ cdef class KnowledgeBase:


cdef class Writer:
-def __init__(self, object loc):
-if isinstance(loc, Path):
-loc = bytes(loc)
-if path.exists(loc):
-if path.isdir(loc):
-raise ValueError(Errors.E928.format(loc=loc))
-cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+def __init__(self, path):
+assert isinstance(path, Path)
+content = bytes(path)
+cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
-raise IOError(Errors.E146.format(path=loc))
+raise IOError(Errors.E146.format(path=path))
fseek(self._fp, 0, 0)

def close(self):
@@ -496,14 +515,9 @@ cdef class Writer:


cdef class Reader:
-def __init__(self, object loc):
-if isinstance(loc, Path):
-loc = bytes(loc)
-if not path.exists(loc):
-raise ValueError(Errors.E929.format(loc=loc))
-if path.isdir(loc):
-raise ValueError(Errors.E928.format(loc=loc))
-cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+def __init__(self, path):
+content = bytes(path)
+cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
|
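Below is a minimal usage sketch of the refactored KB serialization, not part of the diff: to_disk() now writes a directory ("contents" plus "strings.json") and from_disk() reads it back. The entity and alias values and the /tmp path are made up for illustration.

from pathlib import Path

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])

# to_disk() now writes a directory instead of a single binary file.
kb.to_disk(Path("/tmp/my_kb"))

kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb2.from_disk(Path("/tmp/my_kb"))
print(kb2.get_entity_strings())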
@@ -1,7 +1,10 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...pipeline import Lemmatizer


 class BengaliDefaults(Language.Defaults):
@@ -17,4 +20,14 @@ class Bengali(Language):
     Defaults = BengaliDefaults


+@Bengali.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule"},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
+
+
 __all__ = ["Bengali"]
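A short sketch, not part of the diff, of how a pipeline would pick up the rule lemmatizer registered above; it assumes the language's lemma tables are available (for example via spacy-lookups-data) so that nlp.initialize() can load them.

import spacy

nlp = spacy.blank("bn")
# "lemmatizer" resolves to the factory registered above; "rule" is the default mode.
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
# In v3 the lemma_rules/lemma_exc/lemma_index tables are loaded at initialization time.
nlp.initialize()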
@@ -16,15 +16,19 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_label = doc.vocab.strings.add("NP")
     np_deps = set(doc.vocab.strings.add(label) for label in labels)
     close_app = doc.vocab.strings.add("nk")
     rbracket = 0
+    prev_end = -1
     for i, word in enumerate(doclike):
         if i < rbracket:
             continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             rbracket = word.i + 1
             # try to extend the span to the right
@@ -32,6 +36,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
             for rdep in doc[word.i].rights:
                 if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
                     rbracket = rdep.i + 1
+            prev_end = rbracket - 1
             yield word.left_edge.i, rbracket, np_label
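A small sketch, not part of the diff, of the behaviour the prev_end guard introduces; it assumes a German pipeline such as de_core_news_sm is installed.

import spacy

nlp = spacy.load("de_core_news_sm")
doc = nlp("Eine Tasse steht auf dem Tisch.")
# With the guard, chunks whose left edge falls inside the previous chunk are skipped,
# so the iterator no longer yields nested noun chunks.
for chunk in doc.noun_chunks:
    print(chunk.text)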
@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language


@@ -29,19 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Greek"]
@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     # Further improvement of the models will eliminate the need for this tag.
     labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -9,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class EnglishDefaults(Language.Defaults):
@@ -28,19 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["English"]
@@ -3,8 +3,7 @@ from ...tokens import Token


 class EnglishLemmatizer(Lemmatizer):
-    """English lemmatizer. Only overrides is_base_form.
-    """
+    """English lemmatizer. Only overrides is_base_form."""

     def is_base_form(self, token: Token) -> bool:
         """
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
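The same is_parsed to has_annotation("DEP") switch repeats across the syntax iterators below; a quick sketch of the new check, not part of the diff:

import spacy

nlp = spacy.blank("en")
doc = nlp("This is a short sentence.")
# Doc.has_annotation replaces the removed Doc.is_parsed flag in v3.
print(doc.has_annotation("DEP"))  # False: a blank pipeline has no parser
try:
    list(doc.noun_chunks)
except ValueError as err:
    print("noun_chunks still requires a dependency parse:", err)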
@@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     if not len(doc):
         return
@@ -58,7 +58,7 @@ def noun_bounds(
                doc, token, np_left_deps, np_right_deps, stop_deps
            )
            filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
-            if list(filter(filter_func, doc[left_bound.i : right.i],)):
+            if list(filter(filter_func, doc[left_bound.i : right.i])):
                break
            else:
                right_bound = right
@@ -1,9 +1,12 @@
+from typing import Optional
+from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...pipeline import Lemmatizer


 class PersianDefaults(Language.Defaults):
@@ -20,4 +23,14 @@ class Persian(Language):
     Defaults = PersianDefaults


+@Persian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule"},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
+
+
 __all__ = ["Persian"]
@@ -19,7 +19,7 @@ def noun_chunks(doclike):
     ]
     doc = doclike.doc  # Ensure works on both Doc and Span.

-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)

     np_deps = [doc.vocab.strings.add(label) for label in labels]
@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language


@@ -32,19 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["French"]
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Tuple

 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -15,17 +15,10 @@ class FrenchLemmatizer(Lemmatizer):
     """

     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         if mode == "rule":
-            return {
-                "required_tables": [
-                    "lemma_lookup",
-                    "lemma_rules",
-                    "lemma_exc",
-                    "lemma_index",
-                ],
-                "optional_tables": [],
-            }
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
         else:
             return super().get_lookups_config(mode)
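A quick illustration, not part of the diff, of the new return shape: get_lookups_config() now returns a (required_tables, optional_tables) tuple instead of a dict.

from spacy.lang.fr.lemmatizer import FrenchLemmatizer

required, optional = FrenchLemmatizer.get_lookups_config("rule")
print(required)  # ['lemma_lookup', 'lemma_rules', 'lemma_exc', 'lemma_index']
print(optional)  # []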
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
@@ -7,8 +7,8 @@ Example sentences to test spaCy and its language models.


 sentences = [
-    "Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
-    "Abu Sayyaf mengeksekusi sandera warga Filipina",
+    "Indonesia merupakan negara kepulauan yang kaya akan budaya.",
+    "Berapa banyak warga yang dibutuhkan saat kerja bakti?",
     "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
     "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
     "Jakarta adalah kota besar yang nyaris tidak pernah tidur."
@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -12,9 +11,11 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util


@@ -130,6 +131,10 @@ class JapaneseTokenizer(DummyTokenizer):
             )
         return sub_tokens_list

+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}

@@ -160,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):


 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
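A sketch, not part of the diff, of what the new score() hook enables: tokenization accuracy can now be reported for the custom Japanese and Korean tokenizers. It assumes the optional SudachiPy dependency for Japanese is installed.

import spacy
from spacy.training import Example

nlp = spacy.blank("ja")
text = "これは文です。"
example = Example(nlp.make_doc(text), nlp.make_doc(text))
# JapaneseTokenizer.score() validates the examples and defers to Scorer.score_tokenization.
scores = nlp.tokenizer.score([example])
print(scores["token_acc"])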
@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -7,8 +6,10 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples
+from ...util import DummyTokenizer, registry, load_config_from_str


 DEFAULT_CONFIG = """
@@ -62,9 +63,13 @@ class KoreanTokenizer(DummyTokenizer):
             lemma = surface
         yield {"surface": surface, "lemma": lemma, "tag": tag}

+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+

 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -1,9 +1,12 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...pipeline import Lemmatizer


 class NorwegianDefaults(Language.Defaults):
@@ -20,4 +23,14 @@ class Norwegian(Language):
     Defaults = NorwegianDefaults


+@Norwegian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule"},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
+
+
 __all__ = ["Norwegian"]
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model

 from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language


@@ -29,19 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Dutch"]
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Tuple

 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -6,16 +6,10 @@ from ...tokens import Token

 class DutchLemmatizer(Lemmatizer):
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         if mode == "rule":
-            return {
-                "required_tables": [
-                    "lemma_lookup",
-                    "lemma_rules",
-                    "lemma_exc",
-                    "lemma_index",
-                ],
-            }
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
         else:
             return super().get_lookups_config(mode)
@@ -8,7 +8,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...lookups import Lookups
 from ...language import Language


@@ -34,19 +33,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Polish"]
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Tuple

 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -11,21 +11,16 @@ class PolishLemmatizer(Lemmatizer):
     # lemmatization, as well as case-sensitive lemmatization for nouns.

     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         if mode == "pos_lookup":
-            return {
-                "required_tables": [
-                    "lemma_lookup_adj",
-                    "lemma_lookup_adp",
-                    "lemma_lookup_adv",
-                    "lemma_lookup_aux",
-                    "lemma_lookup_noun",
-                    "lemma_lookup_num",
-                    "lemma_lookup_part",
-                    "lemma_lookup_pron",
-                    "lemma_lookup_verb",
-                ]
-            }
+            # fmt: off
+            required = [
+                "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
+                "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
+                "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
+            ]
+            # fmt: on
+            return (required, [])
         else:
             return super().get_lookups_config(mode)
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model

 from .stop_words import STOP_WORDS
@@ -7,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class RussianDefaults(Language.Defaults):
@@ -24,8 +22,7 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -33,9 +30,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
+    overwrite: bool = False,
 ):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


 __all__ = ["Russian"]
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple

 from thinc.api import Model

-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...symbols import POS
 from ...tokens import Token
@@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
         try:
             from pymorphy2 import MorphAnalyzer
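A short sketch, not part of the diff, of adding the pymorphy2-backed lemmatizer with the slimmed-down factory config; overwrite replaces the removed lookups setting and defaults to False. It assumes the optional pymorphy2 package is installed.

import spacy

nlp = spacy.blank("ru")
# "pymorphy2" mode needs no lookup tables, only the pymorphy2 analyzer itself.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})
doc = nlp("Кошки спят на диване.")
print([token.lemma_ for token in doc])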
@@ -108,8 +108,8 @@ _num_words = [

 def like_num(text):
     """
     Check if text resembles a number
     """
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")
@@ -1,8 +1,12 @@
+from typing import Optional
+
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...pipeline import Lemmatizer

 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -22,4 +26,14 @@ class Swedish(Language):
     Defaults = SwedishDefaults


+@Swedish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule"},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
+
+
 __all__ = ["Swedish"]
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str


 DEFAULT_CONFIG = """
@@ -42,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer):


 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -1,5 +1,6 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language

@@ -8,6 +9,7 @@ class TurkishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS


 class Turkish(Language):
@@ -32,6 +32,37 @@ _num_words = [
 ]


+_ordinal_words = [
+    "birinci",
+    "ikinci",
+    "üçüncü",
+    "dördüncü",
+    "beşinci",
+    "altıncı",
+    "yedinci",
+    "sekizinci",
+    "dokuzuncu",
+    "onuncu",
+    "yirminci",
+    "otuzuncu",
+    "kırkıncı",
+    "ellinci",
+    "altmışıncı",
+    "yetmişinci",
+    "sekseninci",
+    "doksanıncı",
+    "yüzüncü",
+    "bininci",
+    "mliyonuncu",
+    "milyarıncı",
+    "trilyonuncu",
+    "katrilyonuncu",
+    "kentilyonuncu",
+]
+
+_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
+
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -42,8 +73,20 @@ def like_num(text):
         num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
-    if text.lower() in _num_words:
+
+    text_lower = text.lower()
+
+    # Check cardinal number
+    if text_lower in _num_words:
         return True
+
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith(_ordinal_endings):
+        if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
+            return True
+
     return False
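A few illustrative calls, not part of the diff, against the new ordinal handling:

from spacy.lang.tr.lex_attrs import like_num

print(like_num("üçüncü"))  # True: listed ordinal word
print(like_num("2nci"))    # True: digits followed by an ordinal ending
print(like_num("dokuz"))   # True: cardinal number word
print(like_num("kedi"))    # False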
58
spacy/lang/tr/syntax_iterators.py
Normal file
@@ -0,0 +1,58 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
+
+
+def noun_chunks(doclike):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    # Please see documentation for Turkish NP structure
+    labels = [
+        "nsubj",
+        "iobj",
+        "obj",
+        "obl",
+        "appos",
+        "orphan",
+        "dislocated",
+        "ROOT",
+    ]
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+
+    np_deps = [doc.vocab.strings.add(label) for label in labels]
+    conj = doc.vocab.strings.add("conj")
+    flat = doc.vocab.strings.add("flat")
+    np_label = doc.vocab.strings.add("NP")
+
+    def extend_right(w):  # Playing a trick for flat
+        rindex = w.i + 1
+        for rdep in doc[w.i].rights:  # Extend the span to right if there is a flat
+            if rdep.dep == flat and rdep.pos in (NOUN, PROPN):
+                rindex = rdep.i + 1
+            else:
+                break
+        return rindex
+
+    prev_end = len(doc) + 1
+    for i, word in reversed(list(enumerate(doclike))):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i >= prev_end:
+            continue
+        if word.dep in np_deps:
+            prev_end = word.left_edge.i
+            yield word.left_edge.i, extend_right(word), np_label
+        elif word.dep == conj:
+            cc_token = word.left_edge
+            prev_end = cc_token.i
+            # Shave off cc tokens from the NP
+            yield cc_token.right_edge.i + 1, extend_right(word), np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
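A usage sketch, not part of the diff, for the new Turkish noun_chunks iterator. No pretrained Turkish pipeline ships with spaCy, so the "tr_custom_model" name below is hypothetical and stands in for any pipeline with a Turkish tagger and dependency parser.

import spacy

nlp = spacy.load("tr_custom_model")  # hypothetical pipeline with tagger + parser
doc = nlp("Kedi ve köpek bahçede oyun oynuyor.")
# The iterator walks the tokens right to left, yields non-nested NPs and
# extends each span over "flat" dependents to the right.
for chunk in doc.noun_chunks:
    print(chunk.text)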
@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class UkrainianDefaults(Language.Defaults):
@@ -24,18 +23,13 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-    scores=["lemma_acc"],
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
 ):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


 __all__ = ["Ukrainian"]
@@ -3,7 +3,6 @@ from typing import Optional
 from thinc.api import Model

 from ..ru.lemmatizer import RussianLemmatizer
-from ...lookups import Lookups
 from ...vocab import Vocab


@@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str


 DEFAULT_CONFIG = """
@@ -17,7 +15,7 @@ use_pyvi = true


 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
-def create_vietnamese_tokenizer(use_pyvi: bool = True,):
+def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
         return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)

@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):


 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -1,21 +1,24 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable
 from enum import Enum
 import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config

 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples, Example
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util


-_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
+# fmt: off
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
+# fmt: on

 DEFAULT_CONFIG = """
 [nlp]
@@ -23,6 +26,10 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
+
+[initialize]
+
+[initialize.tokenizer]
 pkuseg_model = null
 pkuseg_user_dict = "default"
 """
@@ -39,41 +46,21 @@ class Segmenter(str, Enum):


 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(
-    segmenter: Segmenter = Segmenter.char,
-    pkuseg_model: Optional[str] = None,
-    pkuseg_user_dict: Optional[str] = "default",
-):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(
-            nlp,
-            segmenter=segmenter,
-            pkuseg_model=pkuseg_model,
-            pkuseg_user_dict=pkuseg_user_dict,
-        )
+        return ChineseTokenizer(nlp, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None,
-    ):
+    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
         self.vocab = nlp.vocab
-        if isinstance(segmenter, Segmenter):  # we might have the Enum here
+        if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
         self.pkuseg_seg = None
         self.jieba_seg = None
-        self.configure_segmenter(segmenter)

-    def configure_segmenter(self, segmenter: str):
         if segmenter not in Segmenter.values():
             warn_msg = Warnings.W103.format(
                 lang="Chinese",
@@ -83,12 +70,23 @@ class ChineseTokenizer(DummyTokenizer):
             )
             warnings.warn(warn_msg)
             self.segmenter = Segmenter.char
-        self.jieba_seg = try_jieba_import(self.segmenter)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.segmenter,
-            pkuseg_model=self.pkuseg_model,
-            pkuseg_user_dict=self.pkuseg_user_dict,
-        )
+        if segmenter == Segmenter.jieba:
+            self.jieba_seg = try_jieba_import()
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: Optional[str] = "default",
+    ):
+        if self.segmenter == Segmenter.pkuseg:
+            if pkuseg_user_dict is None:
+                pkuseg_user_dict = pkuseg_model
+            self.pkuseg_seg = try_pkuseg_import(
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
+            )

     def __call__(self, text: str) -> Doc:
         if self.segmenter == Segmenter.jieba:
@@ -121,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
         if self.segmenter == Segmenter.pkuseg:
             if reset:
                 try:
-                    import pkuseg
+                    import spacy_pkuseg

-                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                    self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
                 except ImportError:
                     msg = (
-                        "pkuseg not installed: unable to reset pkuseg "
+                        "spacy_pkuseg not installed: unable to reset pkuseg "
                         "user dict. Please " + _PKUSEG_INSTALL_MSG
                     )
                     raise ImportError(msg) from None
@@ -136,17 +134,17 @@ class ChineseTokenizer(DummyTokenizer):
         warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
         warnings.warn(warn_msg)

+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
         }

     def _set_config(self, config: Dict[str, Any] = {}) -> None:
         self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")

     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -157,7 +155,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "rb") as fileh:
+                with open(tempdir / "features.msgpack", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
                     pkuseg_weights_b = fileh.read()
@@ -198,22 +196,22 @@ class ChineseTokenizer(DummyTokenizer):
         if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "wb") as fileh:
+                with open(tempdir / "features.msgpack", "wb") as fileh:
                     fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
                     fileh.write(pkuseg_data["weights_b"])
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
                 except ImportError:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
-                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
             if pkuseg_data["processors_data"]:
                 processors_data = pkuseg_data["processors_data"]
                 (user_dict, do_process, common_words, other_words) = processors_data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                 self.pkuseg_seg.postprocesser.do_process = do_process
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -252,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):

         def load_pkuseg_model(path):
             try:
-                import pkuseg
+                import spacy_pkuseg
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
             if path.exists():
-                self.pkuseg_seg = pkuseg.pkuseg(path)
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(path)

         def load_pkuseg_processors(path):
             try:
-                import pkuseg
+                import spacy_pkuseg
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(self._pkuseg_install_msg) from None
             if self.segmenter == Segmenter.pkuseg:
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                 self.pkuseg_seg.postprocesser.do_process = do_process
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -285,7 +283,7 @@ class ChineseTokenizer(DummyTokenizer):


 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
|
@ -296,47 +294,34 @@ class Chinese(Language):
|
||||||
Defaults = ChineseDefaults
|
Defaults = ChineseDefaults
|
||||||
|
|
||||||
|
|
||||||
def try_jieba_import(segmenter: str) -> None:
|
def try_jieba_import() -> None:
|
||||||
try:
|
try:
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
if segmenter == Segmenter.jieba:
|
# segment a short text to have jieba initialize its cache in advance
|
||||||
# segment a short text to have jieba initialize its cache in advance
|
list(jieba.cut("作为", cut_all=False))
|
||||||
list(jieba.cut("作为", cut_all=False))
|
|
||||||
|
|
||||||
return jieba
|
return jieba
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if segmenter == Segmenter.jieba:
|
msg = (
|
||||||
msg = (
|
"Jieba not installed. To use jieba, install it with `pip "
|
||||||
"Jieba not installed. To use jieba, install it with `pip "
|
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
)
|
||||||
)
|
raise ImportError(msg) from None
|
||||||
raise ImportError(msg) from None
|
|
||||||
|
|
||||||
|
|
||||||
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import spacy_pkuseg
|
||||||
|
|
||||||
if pkuseg_model:
|
|
||||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
|
||||||
elif segmenter == Segmenter.pkuseg:
|
|
||||||
msg = (
|
|
||||||
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
|
|
||||||
"was specified. Please provide the name of a pretrained model "
|
|
||||||
"or the path to a model with:\n"
|
|
||||||
'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
|
|
||||||
"nlp = Chinese.from_config(cfg)"
|
|
||||||
)
|
|
||||||
raise ValueError(msg)
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if segmenter == Segmenter.pkuseg:
|
msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
raise ImportError(msg) from None
|
||||||
raise ImportError(msg) from None
|
try:
|
||||||
|
return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
if segmenter == Segmenter.pkuseg:
|
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
raise FileNotFoundError(msg) from None
|
||||||
raise FileNotFoundError(msg) from None
|
|
||||||
|
|
||||||
|
|
||||||
def _get_pkuseg_trie_data(node, path=""):
|
def _get_pkuseg_trie_data(node, path=""):
|
||||||
|
|
|
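A minimal usage sketch (not part of the commit): with the pkuseg model settings removed from the tokenizer config above, the segmenter choice stays in the config while the model is supplied when the tokenizer is initialized. This assumes a spaCy v3 nightly with spacy-pkuseg installed; the "mixed" model name and the initialize() keyword are taken from the documented pattern and should be treated as assumptions.

from spacy.lang.zh import Chinese

# segmenter stays a tokenizer config setting; the pkuseg model does not
cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg"}}}
nlp = Chinese.from_config(cfg)
# assumed keyword arguments for the tokenizer's initialize() hook
nlp.tokenizer.initialize(pkuseg_model="mixed", pkuseg_user_dict="default")
doc = nlp("这是一个句子。")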
@@ -1,5 +1,5 @@
 from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
-from typing import Tuple, Iterator
+from typing import Tuple
 from dataclasses import dataclass
 import random
 import itertools
@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
 import warnings
-from thinc.api import get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
@@ -18,8 +18,9 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
-from .util import create_default_optimizer, registry, SimpleFrozenList
+from .util import registry, SimpleFrozenList, _pipe
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -27,10 +28,12 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)
@@ -86,6 +89,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
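A hedged sketch of how the newly registered spacy.LookupsDataLoader.v1 function can be resolved and called directly; in a training config it would typically be referenced from an [initialize.lookups] block. The table name is illustrative and spacy-lookups-data must be installed for it to resolve.

from spacy.util import registry

# fetch the misc function registered in the hunk above and call it by hand
load_lookups_data = registry.misc.get("spacy.LookupsDataLoader.v1")
lookups = load_lookups_data(lang="en", tables=["lexeme_norm"])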
@@ -148,12 +158,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -163,11 +168,10 @@ class Language:
         self._components = []
         self._disabled = set()
         self.max_length = max_length
-        self.resolved = {}
         # Create the default tokenizer from the default config
         if not create_tokenizer:
             tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
-            create_tokenizer = registry.make_from_config(tokenizer_cfg)["tokenizer"]
+            create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
         self.tokenizer = create_tokenizer(self)

     def __init_subclass__(cls, **kwargs):
@@ -245,9 +249,12 @@ class Language:
         self._config["nlp"]["pipeline"] = list(self.component_names)
         self._config["nlp"]["disabled"] = list(self.disabled)
         self._config["components"] = pipeline
-        if not self._config["training"].get("score_weights"):
-            combined_score_weights = combine_score_weights(score_weights)
-            self._config["training"]["score_weights"] = combined_score_weights
+        # We're merging the existing score weights back into the combined
+        # weights to make sure we're preserving custom settings in the config
+        # but also reflect updates (e.g. new components added)
+        prev_weights = self._config["training"].get("score_weights", {})
+        combined_score_weights = combine_score_weights(score_weights, prev_weights)
+        self._config["training"]["score_weights"] = combined_score_weights
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
         return self._config
@@ -409,7 +416,6 @@ class Language:
         assigns: Iterable[str] = SimpleFrozenList(),
         requires: Iterable[str] = SimpleFrozenList(),
         retokenizes: bool = False,
-        scores: Iterable[str] = SimpleFrozenList(),
         default_score_weights: Dict[str, float] = SimpleFrozenDict(),
         func: Optional[Callable] = None,
     ) -> Callable:
@@ -427,12 +433,11 @@ class Language:
             e.g. "token.ent_id". Used for pipeline analyis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
-        scores (Iterable[str]): All scores set by the component if it's trainable,
-            e.g. ["ents_f", "ents_r", "ents_p"].
         default_score_weights (Dict[str, float]): The scores to report during
             training, and their default weight towards the final score used to
             select the best model. Weights should sum to 1.0 per component and
-            will be combined and normalized for the whole pipeline.
+            will be combined and normalized for the whole pipeline. If None,
+            the score won't be shown in the logs or be weighted.
         func (Optional[Callable]): Factory function if not used as a decorator.

         DOCS: https://nightly.spacy.io/api/language#factory
@@ -463,7 +468,7 @@ class Language:
             if "nlp" not in arg_names or "name" not in arg_names:
                 raise ValueError(Errors.E964.format(name=name))
             # Officially register the factory so we can later call
-            # registry.make_from_config and refer to it in the config as
+            # registry.resolve and refer to it in the config as
             # @factories = "spacy.Language.xyz". We use the class name here so
             # different classes can have different factories.
             registry.factories.register(internal_name, func=factory_func)
@@ -472,7 +477,7 @@ class Language:
                 default_config=default_config,
                 assigns=validate_attrs(assigns),
                 requires=validate_attrs(requires),
-                scores=scores,
+                scores=list(default_score_weights.keys()),
                 default_score_weights=default_score_weights,
                 retokenizes=retokenizes,
             )
@@ -646,8 +651,9 @@ class Language:
         cfg = {factory_name: config}
         # We're calling the internal _fill here to avoid constructing the
         # registered functions twice
-        resolved, filled = registry.resolve(cfg, validate=validate)
-        filled = Config(filled[factory_name])
+        resolved = registry.resolve(cfg, validate=validate)
+        filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
+        filled = Config(filled)
         filled["factory"] = factory_name
         filled.pop("@factories", None)
         # Remove the extra values we added because we don't want to keep passing
@@ -837,7 +843,7 @@ class Language:
         *,
         config: Dict[str, Any] = SimpleFrozenDict(),
         validate: bool = True,
-    ) -> None:
+    ) -> Callable[[Doc], Doc]:
         """Replace a component in the pipeline.

         name (str): Name of the component to replace.
@@ -846,6 +852,7 @@ class Language:
             component. Will be merged with default config, if available.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
+        RETURNS (Callable[[Doc], Doc]): The new pipeline component.

         DOCS: https://nightly.spacy.io/api/language#replace_pipe
         """
@@ -860,9 +867,11 @@ class Language:
         self.remove_pipe(name)
         if not len(self._components) or pipe_index == len(self._components):
             # we have no components to insert before/after, or we're replacing the last component
-            self.add_pipe(factory_name, name=name, config=config, validate=validate)
+            return self.add_pipe(
+                factory_name, name=name, config=config, validate=validate
+            )
         else:
-            self.add_pipe(
+            return self.add_pipe(
                 factory_name,
                 name=name,
                 before=pipe_index,
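The hunk above makes replace_pipe return the newly created component, mirroring add_pipe. A small sketch of the resulting usage on a blank pipeline:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("ner")
# replace_pipe now hands back the new component instead of returning None
new_ner = nlp.replace_pipe("ner", "ner")
assert nlp.get_pipe("ner") is new_ner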
@@ -890,6 +899,10 @@ class Language:
         self._components[i] = (new_name, self._components[i][1])
         self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
         self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
+        # Make sure [initialize] config is adjusted
+        if old_name in self._config["initialize"]["components"]:
+            init_cfg = self._config["initialize"]["components"].pop(old_name)
+            self._config["initialize"]["components"][new_name] = init_cfg

     def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]:
         """Remove a component from the pipeline.
@@ -906,6 +919,9 @@ class Language:
         # because factory may be used for something else
         self._pipe_meta.pop(name)
         self._pipe_configs.pop(name)
+        # Make sure name is removed from the [initialize] config
+        if name in self._config["initialize"]["components"]:
+            self._config["initialize"]["components"].pop(name)
         # Make sure the name is also removed from the set of disabled components
         if name in self.disabled:
             self._disabled.remove(name)
@@ -966,8 +982,9 @@ class Language:
             raise ValueError(Errors.E003.format(component=type(proc), name=name))
         try:
             doc = proc(doc, **component_cfg.get(name, {}))
-        except KeyError:
-            raise ValueError(Errors.E109.format(name=name)) from None
+        except KeyError as e:
+            # This typically happens if a component is not initialized
+            raise ValueError(Errors.E109.format(name=name)) from e
         if doc is None:
             raise ValueError(Errors.E005.format(name=name))
         return doc
@@ -1017,6 +1034,9 @@ class Language:
                 )
             )
         disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)

     def make_doc(self, text: str) -> Doc:
@@ -1061,7 +1081,7 @@ class Language:
         validate_examples(examples, "Language.update")
         if sgd is None:
             if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
             sgd = self._optimizer
         if component_cfg is None:
             component_cfg = {}
@@ -1077,10 +1097,11 @@ class Language:
         for name, proc in self.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(sgd)
+                proc.finish_update(sgd)
         return losses

     def rehearse(
@@ -1119,7 +1140,7 @@ class Language:
         validate_examples(examples, "Language.rehearse")
         if sgd is None:
             if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
             sgd = self._optimizer
         pipes = list(self.pipeline)
         random.shuffle(pipes)
@@ -1149,61 +1170,75 @@ class Language:
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
         sgd: Optional[Optimizer] = None,
-        device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.

         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
+        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
+            provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
         """
         if get_examples is None:
             util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
-        # Populate vocab
         if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="Language", obj=type(get_examples))
-            raise ValueError(err)
-        valid_examples = False
-        for example in get_examples():
-            if not isinstance(example, Example):
-                err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
-                )
-                raise ValueError(err)
-            else:
-                valid_examples = True
-            for word in [t.text for t in example.reference]:
-                _ = self.vocab[word]  # noqa: F841
-        if not valid_examples:
-            err = Errors.E930.format(name="Language", obj="empty list")
-            raise ValueError(err)
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
+            err = Errors.E930.format(
+                method="Language.initialize", obj=type(get_examples)
+            )
+            raise TypeError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        init_vocab(
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        if hasattr(self.tokenizer, "initialize"):
+            tok_settings = validate_init_settings(
+                self.tokenizer.initialize,
+                I["tokenizer"],
+                section="tokenizer",
+                name="tokenizer",
+            )
+            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
         for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
-                    get_examples, pipeline=self.pipeline, sgd=self._optimizer
-                )
+            if hasattr(proc, "initialize"):
+                p_settings = I["components"].get(name, {})
+                p_settings = validate_init_settings(
+                    proc.initialize, p_settings, section="components", name=name
+                )
+                proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
+        self._optimizer = sgd
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
         return self._optimizer

-    def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1
-    ) -> Optimizer:
+    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
         """Continue training a pretrained model.

         Create and return an optimizer, and initialize "rehearsal" for any pipeline
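The hunks above turn begin_training() into a thin, deprecated wrapper (warning W089) around the new initialize(), which reads its settings from the interpolated [initialize] config block and validates them per component. A minimal sketch of the new entry point, assuming a spaCy v3 nightly install:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat")

def get_examples():
    doc = nlp.make_doc("very good")
    return [Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})]

# initialize() replaces begin_training(); the old name still works but warns
optimizer = nlp.initialize(get_examples)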
@@ -1212,22 +1247,20 @@ class Language:
         rehearsal, collect samples of text you want the models to retain performance
         on, and call nlp.rehearse() with a batch of Example objects.

+        sgd (Optional[Optimizer]): An optimizer.
         RETURNS (Optimizer): The optimizer.

         DOCS: https://nightly.spacy.io/api/language#resume_training
         """
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
-            ops = get_current_ops()
-            if self.vocab.vectors.data.shape[1] >= 1:
-                self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        ops = get_current_ops()
+        if self.vocab.vectors.data.shape[1] >= 1:
+            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
         return self._optimizer

     def evaluate(
@@ -1273,10 +1306,7 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
-                docs = _pipe(docs, pipe, kwargs)
-            else:
-                docs = pipe.pipe(docs, **kwargs)
+            docs = _pipe(docs, pipe, kwargs)
         # iterate over the final generator
         if len(self.pipeline):
             docs = list(docs)
@@ -1289,6 +1319,11 @@ class Language:
             results["speed"] = n_words / (end_time - start_time)
         return results

+    def create_optimizer(self):
+        """Create an optimizer, usually using the [training.optimizer] config."""
+        subconfig = {"optimizer": self.config["training"]["optimizer"]}
+        return registry.resolve(subconfig)["optimizer"]
+
     @contextmanager
     def use_params(self, params: Optional[dict]):
         """Replace weights of models in the pipeline with those provided in the
@@ -1378,11 +1413,7 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
-                f = functools.partial(proc.pipe, **kwargs)
-            else:
-                # Apply the function, but yield the doc
-                f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
+            f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
             pipes.append(f)

         if n_process != 1:
@@ -1448,10 +1479,15 @@ class Language:
         """Register 'listeners' within pipeline components, to allow them to
         effectively share weights.
         """
+        # I had though, "Why do we do this inside the Language object? Shouldn't
+        # it be the tok2vec/transformer/etc's job?
+        # The problem is we need to do it during deserialization...And the
+        # components don't receive the pipeline then. So this does have to be
+        # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i:]:
-                    if hasattr(proc2, "model"):
+                for name2, proc2 in self.pipeline[i + 1 :]:
+                    if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)

     @classmethod
@@ -1492,7 +1528,7 @@ class Language:
         ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
-        config_lang = config["nlp"]["lang"]
+        config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(
                 Errors.E958.format(
@@ -1509,15 +1545,19 @@ class Language:
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
         config["components"] = {}
-        resolved, filled = registry.resolve(
-            config, validate=validate, schema=ConfigSchema
-        )
+        if auto_fill:
+            filled = registry.fill(config, validate=validate, schema=ConfigSchema)
+        else:
+            filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
-        create_tokenizer = resolved["nlp"]["tokenizer"]
-        before_creation = resolved["nlp"]["before_creation"]
-        after_creation = resolved["nlp"]["after_creation"]
-        after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]
+        resolved_nlp = registry.resolve(
+            filled["nlp"], validate=validate, schema=ConfigSchemaNlp
+        )
+        create_tokenizer = resolved_nlp["tokenizer"]
+        before_creation = resolved_nlp["before_creation"]
+        after_creation = resolved_nlp["after_creation"]
+        after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
         lang_cls = cls
         if before_creation is not None:
             lang_cls = before_creation(cls)
@@ -1578,7 +1618,6 @@ class Language:
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.config = filled if auto_fill else config
-        nlp.resolved = resolved
         if after_pipeline_creation is not None:
             nlp = after_pipeline_creation(nlp)
         if not isinstance(nlp, cls):
@@ -1773,19 +1812,6 @@ class DisabledPipes(list):
         self[:] = []


-def _pipe(
-    examples: Iterable[Example], proc: Callable[[Doc], Doc], kwargs: Dict[str, Any]
-) -> Iterator[Example]:
-    # We added some args for pipe that __call__ doesn't expect.
-    kwargs = dict(kwargs)
-    for arg in ["batch_size"]:
-        if arg in kwargs:
-            kwargs.pop(arg)
-    for eg in examples:
-        eg = proc(eg, **kwargs)
-        yield eg
-
-
 def _apply_pipes(
     make_doc: Callable[[str], Doc],
     pipes: Iterable[Callable[[Doc], Doc]],
@@ -289,13 +289,12 @@ class Lookups:

         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())

     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
@@ -17,7 +17,8 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
+from ..tokens.morphanalysis cimport MorphAnalysis
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH

 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -124,7 +125,7 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                     self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
@@ -195,7 +196,7 @@ cdef class Matcher:
         else:
             yield doc

-    def __call__(self, object doclike, *, as_spans=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
         """Find all token sequences matching the supplied pattern.

         doclike (Doc or Span): The document to match over.
@@ -215,11 +216,19 @@ cdef class Matcher:
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
         cdef Pool tmp_pool = Pool()
-        if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
-                and not doc.is_tagged:
-            raise ValueError(Errors.E155.format())
-        if DEP in self._seen_attrs and not doc.is_parsed:
-            raise ValueError(Errors.E156.format())
+        if not allow_missing:
+            for attr in (TAG, POS, MORPH, LEMMA, DEP):
+                if attr in self._seen_attrs and not doc.has_annotation(attr):
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
         matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
         final_matches = []
@@ -655,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
@@ -670,6 +679,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     extra_predicates.
     """
     tokens = []
+    string_store = vocab.strings
     for spec in token_specs:
         if not spec:
             # Signifier for 'any token'
@@ -680,7 +690,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates)))
     return tokens
@@ -724,7 +734,7 @@ def _get_attr_values(spec, string_store):
 class _RegexPredicate:
     operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
@@ -742,13 +752,18 @@ class _RegexPredicate:
         return bool(self.value.search(value))


-class _SetMemberPredicate:
-    operators = ("IN", "NOT_IN")
+class _SetPredicate:
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
+        if self.attr == MORPH:
+            # normalize morph strings
+            self.value = set(self.vocab.morphology.add(v) for v in value)
+        else:
+            self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@@ -760,19 +775,32 @@ class _SetPredicate:
             value = get_string_id(token._.get(self.attr))
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)
+
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+            if self.attr == MORPH:
+                # break up MORPH into individual Feat=Val values
+                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            else:
+                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                value = set([value])
         if self.predicate == "IN":
             return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
             return value not in self.value
+        elif self.predicate == "IS_SUBSET":
+            return value <= self.value
+        elif self.predicate == "IS_SUPERSET":
+            return value >= self.value

     def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))


 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -801,11 +829,13 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
     predicate_types = {
         "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
+        "NOT_IN": _SetPredicate,
+        "IS_SUBSET": _SetPredicate,
+        "IS_SUPERSET": _SetPredicate,
         "==": _ComparisonPredicate,
         "!=": _ComparisonPredicate,
         ">=": _ComparisonPredicate,
@@ -833,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
             value_with_upper_keys = {k.upper(): v for k, v in value.items()}
             for type_, cls in predicate_types.items():
                 if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                     # Don't create a redundant predicates.
                     # This helps with efficiency, as we're caching the results.
                     if predicate.key in seen_predicates:
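The hunks above generalize the set predicates to IS_SUBSET/IS_SUPERSET, make them morphology-aware, and add an allow_missing flag to Matcher.__call__. A hedged sketch of a pattern using the new operator; on a blank pipeline there is no morphologizer, so allow_missing=True skips the annotation check and simply yields no matches:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# match tokens whose MORPH contains at least Number=Sing (plus anything else)
pattern = [{"MORPH": {"IS_SUPERSET": ["Number=Sing"]}}]
matcher.add("SING_TOKEN", [pattern])

doc = nlp("A dog barked.")
matches = matcher(doc, allow_missing=True)  # no morph annotation, so no matches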
@@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter

 import warnings

-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
 from ..tokens.span cimport Span
@@ -184,12 +184,22 @@ cdef class PhraseMatcher:
             if len(doc) == 0:
                 continue
             if isinstance(doc, Doc):
-                if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
-                    raise ValueError(Errors.E155.format())
-                if self.attr == DEP and not doc.is_parsed:
-                    raise ValueError(Errors.E156.format())
-                if self._validate and (doc.is_tagged or doc.is_parsed) \
-                        and self.attr not in (DEP, POS, TAG, LEMMA):
+                attrs = (TAG, POS, MORPH, LEMMA, DEP)
+                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+                for attr in attrs:
+                    if self.attr == attr and not has_annotation[attr]:
+                        if attr == TAG:
+                            pipe = "tagger"
+                        elif attr in (POS, MORPH):
+                            pipe = "morphologizer"
+                        elif attr == LEMMA:
+                            pipe = "lemmatizer"
+                        elif attr == DEP:
+                            pipe = "parser"
+                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                        raise ValueError(error_msg)
+                if self._validate and any(has_annotation.values()) \
+                        and self.attr not in attrs:
                     string_attr = self.vocab.strings[self.attr]
                     warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
             keyword = self._convert_to_array(doc)
28
spacy/ml/featureextractor.py
Normal file
@@ -0,0 +1,28 @@
+from typing import List, Union, Callable, Tuple
+from thinc.types import Ints2d
+from thinc.api import Model, registry
+
+from ..tokens import Doc
+
+
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
+    return Model("extract_features", forward, attrs={"columns": columns})
+
+
+def forward(
+    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
+) -> Tuple[List[Ints2d], Callable]:
+    columns = model.attrs["columns"]
+    features: List[Ints2d] = []
+    for doc in docs:
+        if hasattr(doc, "to_array"):
+            attrs = doc.to_array(columns)
+        else:
+            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
+        if attrs.ndim == 1:
+            attrs = attrs.reshape((attrs.shape[0], 1))
+        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
+    return features, backprop
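A short usage sketch for the spacy.FeatureExtractor.v1 layer added above: it returns one uint64 array of attribute columns per Doc, plus a no-op backprop. Calling the thinc Model directly like this is assumed behaviour of the standard Model API, not something this diff demonstrates.

import spacy
from spacy.attrs import LOWER, SHAPE
from spacy.ml.featureextractor import FeatureExtractor

nlp = spacy.blank("en")
docs = [nlp("One document."), nlp("Another one.")]
extractor = FeatureExtractor([LOWER, SHAPE])
features, backprop = extractor(docs, is_train=False)
# features[i] has shape (len(docs[i]), 2): one column per requested attribute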
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Optional, Callable, Iterable
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
@@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:


 @registry.misc.register("spacy.KBFromFile.v1")
-def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
+def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     def kb_from_file(vocab):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.from_disk(kb_path)
@ -2,6 +2,8 @@ from typing import Optional, List
|
||||||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
|
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...compat import Literal
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from .._precomputable_affine import PrecomputableAffine
|
from .._precomputable_affine import PrecomputableAffine
|
||||||
from ..tb_framework import TransitionModel
|
from ..tb_framework import TransitionModel
|
||||||
|
@ -11,7 +13,8 @@ from ...tokens import Doc
|
||||||
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
||||||
def build_tb_parser_model(
|
def build_tb_parser_model(
|
||||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
nr_feature_tokens: int,
|
state_type: Literal["parser", "ner"],
|
||||||
|
extra_state_tokens: bool,
|
||||||
hidden_width: int,
|
hidden_width: int,
|
||||||
maxout_pieces: int,
|
maxout_pieces: int,
|
||||||
use_upper: bool = True,
|
use_upper: bool = True,
|
||||||
|
@ -40,20 +43,12 @@ def build_tb_parser_model(
|
||||||
|
|
||||||
tok2vec (Model[List[Doc], List[Floats2d]]):
|
tok2vec (Model[List[Doc], List[Floats2d]]):
|
||||||
Subnetwork to map tokens into vector representations.
|
Subnetwork to map tokens into vector representations.
|
||||||
nr_feature_tokens (int): The number of tokens in the context to use to
|
state_type (str):
|
||||||
construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
|
String value denoting the type of parser model: "parser" or "ner"
|
||||||
2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
|
extra_state_tokens (bool): Whether or not to use additional tokens in the context
|
||||||
feature sets are designed for the NER. The recommended feature sets are
|
to construct the state vector. Defaults to `False`, which means 3 and 8
|
||||||
3 for NER, and 8 for the dependency parser.
|
for the NER and parser respectively. When set to `True`, this would become 6
|
||||||
|
feature sets (for the NER) or 13 (for the parser).
|
||||||
TODO: This feature should be split into two, state_type: ["deps", "ner"]
|
|
||||||
and extra_state_features: [True, False]. This would map into:
|
|
||||||
|
|
||||||
(deps, False): 8
|
|
||||||
(deps, True): 13
|
|
||||||
(ner, False): 3
|
|
||||||
(ner, True): 6
|
|
||||||
|
|
||||||
hidden_width (int): The width of the hidden layer.
|
hidden_width (int): The width of the hidden layer.
|
||||||
maxout_pieces (int): How many pieces to use in the state prediction layer.
|
maxout_pieces (int): How many pieces to use in the state prediction layer.
|
||||||
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
|
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
|
||||||
|
@ -68,8 +63,14 @@ def build_tb_parser_model(
|
||||||
Usually inferred from data at the beginning of training, or loaded from
|
Usually inferred from data at the beginning of training, or loaded from
|
||||||
disk.
|
disk.
|
||||||
"""
|
"""
|
||||||
|
if state_type == "parser":
|
||||||
|
nr_feature_tokens = 13 if extra_state_tokens else 8
|
||||||
|
elif state_type == "ner":
|
||||||
|
nr_feature_tokens = 6 if extra_state_tokens else 3
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E917.format(value=state_type))
|
||||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
|
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
|
||||||
tok2vec.set_dim("nO", hidden_width)
|
tok2vec.set_dim("nO", hidden_width)
|
||||||
lower = PrecomputableAffine(
|
lower = PrecomputableAffine(
|
||||||
nO=hidden_width if use_upper else nO,
|
nO=hidden_width if use_upper else nO,
|
||||||
|
|
|
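
For reference, the added branch reduces the two new arguments back to the old feature-token counts. A standalone sketch of the same mapping; the helper name is invented for illustration and is not part of the diff:

    def resolve_nr_feature_tokens(state_type: str, extra_state_tokens: bool) -> int:
        # Same mapping as the added code: parser -> 8/13, ner -> 3/6.
        table = {
            ("parser", False): 8,
            ("parser", True): 13,
            ("ner", False): 3,
            ("ner", True): 6,
        }
        return table[(state_type, extra_state_tokens)]

    assert resolve_nr_feature_tokens("ner", False) == 3
    assert resolve_nr_feature_tokens("parser", True) == 13
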
@@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window, FeatureExtractor
+from thinc.api import Relu, residual, expand_window

 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor


 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -23,11 +24,11 @@ def build_simple_cnn_text_classifier(
     """
     with Model.define_operators({">>": chain}):
         if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
+            output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
-            linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
+            linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = (
                 tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
             )
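
Switching from get_dim to maybe_get_dim lets the classifier be built before the tok2vec subnetwork's output width is known: get_dim raises when a dimension is unset, while maybe_get_dim returns None so the width can be inferred later during initialization. A minimal sketch of the difference in plain thinc, not taken from the diff:

    from thinc.api import Linear

    layer = Linear()                          # nO/nI stay unset until initialization
    assert layer.maybe_get_dim("nI") is None  # unset dims come back as None
    # layer.get_dim("nI") would raise here instead, because the dimension is unset.
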
@@ -1,16 +1,17 @@
-from typing import Optional, List
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list
-from thinc.api import FeatureExtractor, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
+from typing import Optional, List, Union
 from thinc.types import Floats2d
+from thinc.api import chain, clone, concatenate, with_array, with_padded
+from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
+from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM

 from ...tokens import Doc
 from ...util import registry
+from ...errors import Errors
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import intify_attr


 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -28,7 +29,7 @@ def build_hash_embed_cnn_tok2vec(
     window_size: int,
     maxout_pieces: int,
     subword_features: bool,
-    pretrained_vectors: Optional[bool]
+    pretrained_vectors: Optional[bool],
 ) -> Model[List[Doc], List[Floats2d]]:
     """Build spaCy's 'standard' tok2vec layer, which uses hash embedding
     with subword features and a CNN with layer-normalized maxout.
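
The module now resolves attribute names through intify_attr instead of importing individual attribute constants; the CharacterEmbed changes further down rely on this to accept either a string name or an integer ID. A minimal standalone sketch, not part of the diff:

    from spacy.attrs import LOWER, intify_attr

    # String attribute names resolve to spaCy's internal integer IDs,
    # so feature="LOWER" and feature=LOWER refer to the same attribute.
    assert intify_attr("LOWER") == LOWER
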
@@ -53,12 +54,18 @@ def build_hash_embed_cnn_tok2vec(
     a language such as Chinese.
     pretrained_vectors (bool): Whether to also use static vectors.
     """
+    if subword_features:
+        attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+        row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
+    else:
+        attrs = ["NORM"]
+        row_sizes = [embed_size]
     return build_Tok2Vec_model(
         embed=MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_embed_subwords=subword_features,
-            also_use_static_vectors=bool(pretrained_vectors),
+            rows=row_sizes,
+            attrs=attrs,
+            include_static_vectors=bool(pretrained_vectors),
         ),
         encode=MaxoutWindowEncoder(
             width=width,
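
For reference, the added branch turns the single embed_size into per-attribute row counts, with the subword tables getting half as many rows as the NORM table. A short illustration with example values, not taken from the diff:

    embed_size = 2000

    # subword_features=True -> NORM, PREFIX, SUFFIX, SHAPE tables,
    # where the subword tables use half the rows of the NORM table.
    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
    row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
    assert row_sizes == [2000, 1000, 1000, 1000]

    # subword_features=False -> only the NORM table, keeping all the rows.
    attrs, row_sizes = ["NORM"], [embed_size]
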
@@ -92,58 +99,59 @@ def build_Tok2Vec_model(

 @registry.architectures.register("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(
-    width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
-):
+    width: int,
+    attrs: List[Union[str, int]],
+    rows: List[int],
+    include_static_vectors: bool,
+) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.

-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
-    varying definitions depending on the Vocab of the Doc object passed in.
-    Vectors from pretrained static vectors can also be incorporated into the
-    concatenated representation.
+    The features used can be configured with the 'attrs' argument. The suggested
+    attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+    account some subword information, without constructing a fully character-based
+    representation. If pretrained vectors are available, they can be included in
+    the representation as well, with the vectors table will be kept static
+    (i.e. it's not updated).
+
+    The `width` parameter specifies the output width of the layer and the widths
+    of all embedding tables. If static vectors are included, a learned linear
+    layer is used to map the vectors to the specified width before concatenating
+    it with the other embedding outputs. A single Maxout layer is then used to
+    reduce the concatenated vectors to the final width.
+
+    The `rows` parameter controls the number of rows used by the `HashEmbed`
+    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+    even for very large vocabularies. A number of rows must be specified for each
+    table, so the `rows` list must be of the same length as the `attrs` parameter.

     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
-    rows (int): The number of rows for the embedding tables. Can be low, due
-        to the hashing trick. Embeddings for prefix, suffix and word shape
-        use half as many rows. Recommended values are between 2000 and 10000.
-    also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
-        features in the embeddings. If not using these, you may need more
-        rows in your hash embeddings, as there will be increased chance of
-        collisions.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    attrs (list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute.
+    rows (List[int]): The number of rows in the embedding tables. Must have the
+        same length as attrs.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    if len(rows) != len(attrs):
+        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
     seed = 7

-    def make_hash_embed(feature):
+    def make_hash_embed(index):
         nonlocal seed
         seed += 1
-        return HashEmbed(
-            width,
-            rows if feature == NORM else rows // 2,
-            column=cols.index(feature),
-            seed=seed,
-            dropout=0.0,
-        )
+        return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0)

-    if also_embed_subwords:
-        embeddings = [
-            make_hash_embed(NORM),
-            make_hash_embed(PREFIX),
-            make_hash_embed(SUFFIX),
-            make_hash_embed(SHAPE),
-        ]
-    else:
-        embeddings = [make_hash_embed(NORM)]
-    concat_size = width * (len(embeddings) + also_use_static_vectors)
-    if also_use_static_vectors:
+    embeddings = [make_hash_embed(i) for i in range(len(attrs))]
+    concat_size = width * (len(embeddings) + include_static_vectors)
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(
-                    FeatureExtractor(cols),
+                    FeatureExtractor(attrs),
                     list2ragged(),
                     with_array(concatenate(*embeddings)),
                 ),
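
Under the new signature, callers pass parallel attrs and rows lists and the layer builds one HashEmbed table per attribute. A minimal construction sketch via the registered architecture name; the concrete numbers are illustrative only and not part of the diff:

    from spacy.util import registry

    MultiHashEmbed = registry.architectures.get("spacy.MultiHashEmbed.v1")
    embed = MultiHashEmbed(
        width=96,
        attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        rows=[5000, 2500, 2500, 2500],  # one row count per attribute; must match len(attrs)
        include_static_vectors=False,
    )
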
@@ -154,7 +162,7 @@ def MultiHashEmbed(
         )
     else:
         model = chain(
-            FeatureExtractor(cols),
+            FeatureExtractor(list(attrs)),
             list2ragged(),
             with_array(concatenate(*embeddings)),
             with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
@@ -164,7 +172,14 @@ def MultiHashEmbed(


 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
+def CharacterEmbed(
+    width: int,
+    rows: int,
+    nM: int,
+    nC: int,
+    include_static_vectors: bool,
+    feature: Union[int, str] = "LOWER",
+) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -177,30 +192,55 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
     of being in an arbitrary position depending on the word length.

     The characters are embedded in a embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.

-    width (int): The width of the output vector and the NORM hash embedding.
-    rows (int): The number of rows in the NORM hash embedding table.
+    feature (int or str): An attribute to embed, to concatenate with the characters.
+    width (int): The width of the output vector and the feature embedding.
+    rows (int): The number of rows in the LOWER hash embedding table.
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in the
         language.
+    include_static_vectors (bool): Whether to also use static word vectors.
+        Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    model = chain(
-        concatenate(
-            chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
-            chain(
-                FeatureExtractor([NORM]),
-                list2ragged(),
-                with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+    feature = intify_attr(feature)
+    if feature is None:
+        raise ValueError(Errors.E911(feat=feature))
+    if include_static_vectors:
+        model = chain(
+            concatenate(
+                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+                chain(
+                    FeatureExtractor([feature]),
+                    list2ragged(),
+                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+                ),
+                StaticVectors(width, dropout=0.0),
             ),
-        ),
-        with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
-        ragged2list(),
-    )
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
+            ragged2list(),
+        )
+    else:
+        model = chain(
+            concatenate(
+                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+                chain(
+                    FeatureExtractor([feature]),
+                    list2ragged(),
+                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+                ),
+            ),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
+            ragged2list(),
+        )
     return model

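
The layer now embeds a configurable attribute (LOWER by default) alongside the character vectors, and can optionally concatenate static vectors. A minimal construction sketch via the registered name; the values are illustrative only, not from the diff:

    from spacy.util import registry

    CharacterEmbed = registry.architectures.get("spacy.CharacterEmbed.v1")
    char_embed = CharacterEmbed(
        width=128,   # output width and width of the feature embedding
        rows=7000,   # rows of the LOWER hash embedding table
        nM=64,       # dimensionality of each character embedding
        nC=8,        # UTF-8 bytes embedded per word
        include_static_vectors=False,
        feature="LOWER",
    )
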
@@ -1,11 +1,11 @@
 from typing import List, Tuple, Callable, Optional, cast

 from thinc.initializers import glorot_uniform_init
 from thinc.util import partial
 from thinc.types import Ragged, Floats2d, Floats1d
 from thinc.api import Model, Ops, registry

 from ..tokens import Doc
+from ..errors import Errors


 @registry.layers("spacy.StaticVectors.v1")
@@ -34,12 +34,11 @@ def StaticVectors(
 def forward(
     model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
 ) -> Tuple[Ragged, Callable]:
-    if not len(docs):
+    if not sum(len(doc) for doc in docs):
         return _handle_empty(model.ops, model.get_dim("nO"))
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
     V = cast(Floats2d, docs[0].vocab.vectors.data)
-    mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
     rows = model.ops.flatten(
         [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
     )
@@ -47,8 +46,11 @@ def forward(
         model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True),
         model.ops.asarray([len(doc) for doc in docs], dtype="i"),
     )
-    if mask is not None:
-        output.data *= mask
+    mask = None
+    if is_train:
+        mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
+        if mask is not None:
+            output.data *= mask

     def backprop(d_output: Ragged) -> List[Doc]:
         if mask is not None:
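
The dropout mask on the static-vectors projection is now only sampled when is_train is true, so inference always sees the unscaled activations. A standalone sketch of the same pattern using a NumPy stand-in, not spaCy code:

    import numpy as np

    def project_with_dropout(output: np.ndarray, is_train: bool, rate: float = 0.1) -> np.ndarray:
        # Mirrors the changed logic: the mask is only sampled while training.
        mask = None
        if is_train:
            mask = (np.random.uniform(size=output.shape[-1]) >= rate) / (1.0 - rate)
            output = output * mask
        return output

    x = np.ones((4, 8), dtype="float32")
    assert np.array_equal(project_with_dropout(x, is_train=False), x)  # inference path untouched
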
@@ -76,16 +78,9 @@ def init(
         nO = Y.data.shape[1]

     if nM is None:
-        raise ValueError(
-            "Cannot initialize StaticVectors layer: nM dimension unset. "
-            "This dimension refers to the width of the vectors table."
-        )
+        raise ValueError(Errors.E905)
     if nO is None:
-        raise ValueError(
-            "Cannot initialize StaticVectors layer: nO dimension unset. "
-            "This dimension refers to the output width, after the linear "
-            "projection has been applied."
-        )
+        raise ValueError(Errors.E904)
     model.set_dim("nM", nM)
     model.set_dim("nO", nO)
     model.set_param("W", init_W(model.ops, (nO, nM)))
@@ -29,7 +29,8 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0
+    # not an empty string so that the PreshMap key is not 0
+    EMPTY_MORPH = symbols.NAMES[symbols._]

     def __init__(self, StringStore strings):
         self.mem = Pool()
@@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:


 def analyze_pipes(
-    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
+    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
 ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
@@ -6,6 +6,7 @@ from .entityruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
 from .tagger import Tagger
@@ -21,6 +22,7 @@ __all__ = [
     "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
+    "TrainablePipe",
     "Pipe",
     "SentenceRecognizer",
     "Sentencizer",
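
With the new export, the base class for trainable components is importable directly from the pipeline package. A tiny sketch; the subclass is a hypothetical placeholder, not part of the diff:

    from spacy.pipeline import TrainablePipe

    class CustomComponent(TrainablePipe):
        # Placeholder subclass; a real component would implement predict,
        # set_annotations, get_loss and so on.
        pass
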
@@ -9,10 +9,11 @@ from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...training.example cimport Example
-from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

+from ...errors import Errors
+
 # Calculate cost as gold/not gold. We don't use scalar value anyway.
 cdef int BINARY_COSTS = 1
 cdef weight_t MIN_SCORE = -90000
@@ -86,7 +87,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls,
                 SENT_START_UNKNOWN,
                 0
             )

         elif is_sent_start is None:
             gs.state_bits[i] = set_state_flag(
                 gs.state_bits[i],
@@ -109,7 +110,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls,
                 IS_SENT_START,
                 0
             )

     for i, (head, label) in enumerate(zip(heads, labels)):
         if head is not None:
             gs.heads[i] = head
@@ -158,7 +159,7 @@ cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) nogil:
         )
         gs.n_kids_in_stack[i] = 0
         gs.n_kids_in_buffer[i] = 0

     for i in range(stcls.stack_depth()):
         s_i = stcls.S(i)
         if not is_head_unknown(gs, s_i):
@@ -403,7 +404,7 @@ cdef class RightArc:
             return 0
         sent_start = st._sent[st.B_(0).l_edge].sent_start
         return sent_start != 1 and st.H(st.S(0)) != st.B(0)

     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:
         st.add_arc(st.S(0), st.B(0), label)
@@ -679,8 +680,7 @@ cdef class ArcEager(TransitionSystem):
             st._sent[i].dep = self.root_label

     def finalize_doc(self, Doc doc):
-        doc.is_parsed = True
-        set_children_from_heads(doc.c, doc.length)
+        set_children_from_heads(doc.c, 0, doc.length)

     def has_gold(self, Example eg, start=0, end=None):
         for word in eg.y[start:end]:
@@ -702,10 +702,10 @@ cdef class ArcEager(TransitionSystem):
                 output[i] = self.c[i].is_valid(st, self.c[i].label)
             else:
                 output[i] = is_valid[self.c[i].move]

     def get_cost(self, StateClass stcls, gold, int i):
         if not isinstance(gold, ArcEagerGold):
-            raise TypeError("Expected ArcEagerGold")
+            raise TypeError(Errors.E909.format(name="ArcEagerGold"))
         cdef ArcEagerGold gold_ = gold
         gold_state = gold_.c
         n_gold = 0
@@ -718,7 +718,7 @@ cdef class ArcEager(TransitionSystem):
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        StateClass stcls, gold) except -1:
         if not isinstance(gold, ArcEagerGold):
-            raise TypeError("Expected ArcEagerGold")
+            raise TypeError(Errors.E909.format(name="ArcEagerGold"))
         cdef ArcEagerGold gold_ = gold
         gold_.update(stcls)
         gold_state = gold_.c
Some files were not shown because too many files have changed in this diff.