mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Merge remote-tracking branch 'upstream/develop' into feature/more-layers-docs
This commit is contained in:
commit
06ef66fd73
107
.github/contributors/bittlingmayer.md
vendored
Normal file
107
.github/contributors/bittlingmayer.md
vendored
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Adam Bittlingmayer |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 12 Aug 2020 |
|
||||||
|
| GitHub username | bittlingmayer |
|
||||||
|
| Website (optional) | |
|
||||||
|
|
106
.github/contributors/graue70.md
vendored
Normal file
106
.github/contributors/graue70.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Thomas |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-08-11 |
|
||||||
|
| GitHub username | graue70 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/holubvl3.md
vendored
Normal file
106
.github/contributors/holubvl3.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Vladimir Holubec |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 30.07.2020 |
|
||||||
|
| GitHub username | holubvl3 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/idoshr.md
vendored
Normal file
106
.github/contributors/idoshr.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Ido Shraga |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 20-09-2020 |
|
||||||
|
| GitHub username | idoshr |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/jgutix.md
vendored
Normal file
106
.github/contributors/jgutix.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Juan Gutiérrez |
|
||||||
|
| Company name (if applicable) | Ojtli |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-08-28 |
|
||||||
|
| GitHub username | jgutix |
|
||||||
|
| Website (optional) | ojtli.app |
|
106
.github/contributors/leyendecker.md
vendored
Normal file
106
.github/contributors/leyendecker.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ---------------------------- |
|
||||||
|
| Name | Gustavo Zadrozny Leyendecker |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | July 29, 2020 |
|
||||||
|
| GitHub username | leyendecker |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/lizhe2004.md
vendored
Normal file
106
.github/contributors/lizhe2004.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Zhe li |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-07-24 |
|
||||||
|
| GitHub username | lizhe2004 |
|
||||||
|
| Website (optional) | http://www.huahuaxia.net|
|
106
.github/contributors/snsten.md
vendored
Normal file
106
.github/contributors/snsten.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Shashank Shekhar |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-08-23 |
|
||||||
|
| GitHub username | snsten |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/solarmist.md
vendored
Normal file
106
.github/contributors/solarmist.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------- |
|
||||||
|
| Name | Joshua Olson |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-07-22 |
|
||||||
|
| GitHub username | solarmist |
|
||||||
|
| Website (optional) | http://blog.solarmist.net |
|
106
.github/contributors/tilusnet.md
vendored
Normal file
106
.github/contributors/tilusnet.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Attila Szász |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 12 Aug 2020 |
|
||||||
|
| GitHub username | tilusnet |
|
||||||
|
| Website (optional) | |
|
|
@ -36,7 +36,7 @@ max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
tolerance = 0.2
|
tolerance = 0.2
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
tolerance = 0.2
|
tolerance = 0.2
|
||||||
|
|
||||||
|
|
38
licenses/3rd_party_licenses.txt
Normal file
38
licenses/3rd_party_licenses.txt
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
Third Party Licenses for spaCy
|
||||||
|
==============================
|
||||||
|
|
||||||
|
NumPy
|
||||||
|
-----
|
||||||
|
|
||||||
|
* Files: setup.py
|
||||||
|
|
||||||
|
Copyright (c) 2005-2020, NumPy Developers.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following
|
||||||
|
disclaimer in the documentation and/or other materials provided
|
||||||
|
with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of the NumPy Developers nor the names of any
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -24,7 +24,7 @@ redirects = [
|
||||||
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
|
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
|
||||||
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
|
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||||
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
|
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||||
{from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
|
{from = "/docs/usage/training-ner", to = "/usage/training", force = true},
|
||||||
{from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
|
{from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
|
||||||
{from = "/docs/usage/data-model", to = "/api", force = true},
|
{from = "/docs/usage/data-model", to = "/api", force = true},
|
||||||
{from = "/docs/usage/cli", to = "/api/cli", force = true},
|
{from = "/docs/usage/cli", to = "/api/cli", force = true},
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a13"
|
__version__ = "3.0.0a14"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -29,9 +29,9 @@ from .project.document import project_document # noqa: F401
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
def link(*args, **kwargs):
|
def link(*args, **kwargs):
|
||||||
"""As of spaCy v3.0, model symlinks are deprecated. You can load models
|
"""As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
|
||||||
using their full names or from a directory path."""
|
pipeline packages using their full names or from a directory path."""
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
|
"As of spaCy v3.0, model symlinks are deprecated. You can load trained "
|
||||||
"using their full names or from a directory path."
|
"pipeline packages using their full names or from a directory path."
|
||||||
)
|
)
|
||||||
|
|
|
@ -25,7 +25,7 @@ COMMAND = "python -m spacy"
|
||||||
NAME = "spacy"
|
NAME = "spacy"
|
||||||
HELP = """spaCy Command-line Interface
|
HELP = """spaCy Command-line Interface
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli
|
DOCS: https://nightly.spacy.io/api/cli
|
||||||
"""
|
"""
|
||||||
PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
|
PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
|
||||||
You'd typically start by cloning a project template to a local directory and
|
You'd typically start by cloning a project template to a local directory and
|
||||||
|
@ -36,7 +36,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
||||||
commands to check and validate your config files, training and evaluation data,
|
commands to check and validate your config files, training and evaluation data,
|
||||||
and custom model implementations.
|
and custom model implementations.
|
||||||
"""
|
"""
|
||||||
INIT_HELP = """Commands for initializing configs and models."""
|
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
|
|
@ -44,7 +44,7 @@ def convert_cli(
|
||||||
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
|
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
|
||||||
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
|
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
|
||||||
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
|
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
|
||||||
model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
|
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||||
|
@ -61,6 +61,8 @@ def convert_cli(
|
||||||
If no output_dir is specified and the output format is JSON, the data
|
If no output_dir is specified and the output format is JSON, the data
|
||||||
is written to stdout, so you can pipe them forward to a JSON file:
|
is written to stdout, so you can pipe them forward to a JSON file:
|
||||||
$ spacy convert some_file.conllu --file-type json > some_file.json
|
$ spacy convert some_file.conllu --file-type json > some_file.json
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#convert
|
||||||
"""
|
"""
|
||||||
if isinstance(file_type, FileTypes):
|
if isinstance(file_type, FileTypes):
|
||||||
# We get an instance of the FileTypes from the CLI so we need its string value
|
# We get an instance of the FileTypes from the CLI so we need its string value
|
||||||
|
@ -261,6 +263,6 @@ def _get_converter(msg, converter, input_path):
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Can't automatically detect NER format. "
|
"Can't automatically detect NER format. "
|
||||||
"Conversion may not succeed. "
|
"Conversion may not succeed. "
|
||||||
"See https://spacy.io/api/cli#convert"
|
"See https://nightly.spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
return converter
|
return converter
|
||||||
|
|
|
@ -31,6 +31,8 @@ def debug_config_cli(
|
||||||
As with the 'train' command, you can override settings from the config
|
As with the 'train' command, you can override settings from the config
|
||||||
as command line options. For instance, --training.batch_size 128 overrides
|
as command line options. For instance, --training.batch_size 128 overrides
|
||||||
the value of "batch_size" in the block "[training]".
|
the value of "batch_size" in the block "[training]".
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#debug-config
|
||||||
"""
|
"""
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
||||||
NEW_LABEL_THRESHOLD = 50
|
NEW_LABEL_THRESHOLD = 50
|
||||||
# Minimum number of expected occurrences of dependency labels
|
# Minimum number of expected occurrences of dependency labels
|
||||||
DEP_LABEL_THRESHOLD = 20
|
DEP_LABEL_THRESHOLD = 20
|
||||||
# Minimum number of expected examples to train a blank model
|
# Minimum number of expected examples to train a new pipeline
|
||||||
BLANK_MODEL_MIN_THRESHOLD = 100
|
BLANK_MODEL_MIN_THRESHOLD = 100
|
||||||
BLANK_MODEL_THRESHOLD = 2000
|
BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
@ -47,6 +47,8 @@ def debug_data_cli(
|
||||||
Analyze, debug and validate your training and development data. Outputs
|
Analyze, debug and validate your training and development data. Outputs
|
||||||
useful stats, and can help you find problems like invalid entity annotations,
|
useful stats, and can help you find problems like invalid entity annotations,
|
||||||
cyclic dependencies, low data labels and more.
|
cyclic dependencies, low data labels and more.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#debug-data
|
||||||
"""
|
"""
|
||||||
if ctx.command.name == "debug-data":
|
if ctx.command.name == "debug-data":
|
||||||
msg.warn(
|
msg.warn(
|
||||||
|
@ -148,7 +150,7 @@ def debug_data(
|
||||||
msg.text(f"Language: {config['nlp']['lang']}")
|
msg.text(f"Language: {config['nlp']['lang']}")
|
||||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||||
if resume_components:
|
if resume_components:
|
||||||
msg.text(f"Components from other models: {', '.join(resume_components)}")
|
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
|
||||||
if frozen_components:
|
if frozen_components:
|
||||||
msg.text(f"Frozen components: {', '.join(frozen_components)}")
|
msg.text(f"Frozen components: {', '.join(frozen_components)}")
|
||||||
msg.text(f"{len(train_dataset)} training docs")
|
msg.text(f"{len(train_dataset)} training docs")
|
||||||
|
@ -164,9 +166,7 @@ def debug_data(
|
||||||
# TODO: make this feedback more fine-grained and report on updated
|
# TODO: make this feedback more fine-grained and report on updated
|
||||||
# components vs. blank components
|
# components vs. blank components
|
||||||
if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
|
if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
|
||||||
text = (
|
text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
|
||||||
f"Low number of examples to train from a blank model ({len(train_dataset)})"
|
|
||||||
)
|
|
||||||
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
|
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
|
||||||
msg.fail(text)
|
msg.fail(text)
|
||||||
else:
|
else:
|
||||||
|
@ -214,7 +214,7 @@ def debug_data(
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the model")
|
msg.info("No word vectors present in the package")
|
||||||
|
|
||||||
if "ner" in factory_names:
|
if "ner" in factory_names:
|
||||||
# Get all unique NER labels present in the data
|
# Get all unique NER labels present in the data
|
||||||
|
|
|
@ -30,6 +30,8 @@ def debug_model_cli(
|
||||||
"""
|
"""
|
||||||
Analyze a Thinc model implementation. Includes checks for internal structure
|
Analyze a Thinc model implementation. Includes checks for internal structure
|
||||||
and activations during training.
|
and activations during training.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#debug-model
|
||||||
"""
|
"""
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
msg.info("Using GPU")
|
msg.info("Using GPU")
|
||||||
|
|
|
@ -17,16 +17,19 @@ from ..errors import OLD_MODEL_SHORTCUTS
|
||||||
def download_cli(
|
def download_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context,
|
ctx: typer.Context,
|
||||||
model: str = Arg(..., help="Name of model to download"),
|
model: str = Arg(..., help="Name of pipeline package to download"),
|
||||||
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Download compatible model from default download path using pip. If --direct
|
Download compatible trained pipeline from the default download path using
|
||||||
flag is set, the command expects the full model name with version.
|
pip. If --direct flag is set, the command expects the full package name with
|
||||||
For direct downloads, the compatibility check will be skipped. All
|
version. For direct downloads, the compatibility check will be skipped. All
|
||||||
additional arguments provided to this command will be passed to `pip install`
|
additional arguments provided to this command will be passed to `pip install`
|
||||||
on model installation.
|
on package installation.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#download
|
||||||
|
AVAILABLE PACKAGES: https://spacy.io/models
|
||||||
"""
|
"""
|
||||||
download(model, direct, *ctx.args)
|
download(model, direct, *ctx.args)
|
||||||
|
|
||||||
|
@ -34,11 +37,11 @@ def download_cli(
|
||||||
def download(model: str, direct: bool = False, *pip_args) -> None:
|
def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
if not is_package("spacy") and "--no-deps" not in pip_args:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Skipping model package dependencies and setting `--no-deps`. "
|
"Skipping pipeline package dependencies and setting `--no-deps`. "
|
||||||
"You don't seem to have the spaCy package itself installed "
|
"You don't seem to have the spaCy package itself installed "
|
||||||
"(maybe because you've built from source?), so installing the "
|
"(maybe because you've built from source?), so installing the "
|
||||||
"model dependencies would cause spaCy to be downloaded, which "
|
"package dependencies would cause spaCy to be downloaded, which "
|
||||||
"probably isn't what you want. If the model package has other "
|
"probably isn't what you want. If the pipeline package has other "
|
||||||
"dependencies, you'll have to install them manually."
|
"dependencies, you'll have to install them manually."
|
||||||
)
|
)
|
||||||
pip_args = pip_args + ("--no-deps",)
|
pip_args = pip_args + ("--no-deps",)
|
||||||
|
@ -53,7 +56,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
if model in OLD_MODEL_SHORTCUTS:
|
if model in OLD_MODEL_SHORTCUTS:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
|
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
|
||||||
f"use the full model name '{OLD_MODEL_SHORTCUTS[model]}' instead."
|
f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
|
||||||
)
|
)
|
||||||
model_name = OLD_MODEL_SHORTCUTS[model]
|
model_name = OLD_MODEL_SHORTCUTS[model]
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
|
@ -61,7 +64,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
msg.good(
|
msg.good(
|
||||||
"Download and installation successful",
|
"Download and installation successful",
|
||||||
f"You can now load the model via spacy.load('{model_name}')",
|
f"You can now load the package via spacy.load('{model_name}')",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -71,16 +74,16 @@ def get_compatibility() -> dict:
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Server error ({r.status_code})",
|
f"Server error ({r.status_code})",
|
||||||
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
f"Couldn't fetch compatibility table. Please find a package for your spaCy "
|
||||||
f"installation (v{about.__version__}), and download it manually. "
|
f"installation (v{about.__version__}), and download it manually. "
|
||||||
f"For more details, see the documentation: "
|
f"For more details, see the documentation: "
|
||||||
f"https://spacy.io/usage/models",
|
f"https://nightly.spacy.io/usage/models",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
comp_table = r.json()
|
comp_table = r.json()
|
||||||
comp = comp_table["spacy"]
|
comp = comp_table["spacy"]
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
msg.fail(f"No compatible packages found for v{version} of spaCy", exits=1)
|
||||||
return comp[version]
|
return comp[version]
|
||||||
|
|
||||||
|
|
||||||
|
@ -88,7 +91,7 @@ def get_version(model: str, comp: dict) -> str:
|
||||||
model = get_base_version(model)
|
model = get_base_version(model)
|
||||||
if model not in comp:
|
if model not in comp:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
|
f"No compatible package found for '{model}' (spaCy v{about.__version__})",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
return comp[model][0]
|
return comp[model][0]
|
||||||
|
|
|
@ -26,13 +26,16 @@ def evaluate_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
|
Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
|
||||||
binary .spacy format. The --gold-preproc option sets up the evaluation
|
data in the binary .spacy format. The --gold-preproc option sets up the
|
||||||
examples with gold-standard sentences and tokens for the predictions. Gold
|
evaluation examples with gold-standard sentences and tokens for the
|
||||||
preprocessing helps the annotations align to the tokenization, and may
|
predictions. Gold preprocessing helps the annotations align to the
|
||||||
result in sequences of more consistent length. However, it may reduce
|
tokenization, and may result in sequences of more consistent length. However,
|
||||||
runtime accuracy due to train/test skew. To render a sample of dependency
|
it may reduce runtime accuracy due to train/test skew. To render a sample of
|
||||||
parses in an HTML file, set an output directory as the displacy_path argument.
|
dependency parses in an HTML file, set an output directory as the
|
||||||
|
displacy_path argument.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#evaluate
|
||||||
"""
|
"""
|
||||||
evaluate(
|
evaluate(
|
||||||
model,
|
model,
|
||||||
|
|
|
@ -12,15 +12,17 @@ from .. import about
|
||||||
@app.command("info")
|
@app.command("info")
|
||||||
def info_cli(
|
def info_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
model: Optional[str] = Arg(None, help="Optional model name"),
|
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
||||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Print info about spaCy installation. If a model is specified as an argument,
|
Print info about spaCy installation. If a pipeline is specified as an argument,
|
||||||
print model information. Flag --markdown prints details in Markdown for easy
|
print its meta information. Flag --markdown prints details in Markdown for easy
|
||||||
copy-pasting to GitHub issues.
|
copy-pasting to GitHub issues.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#info
|
||||||
"""
|
"""
|
||||||
info(model, markdown=markdown, silent=silent)
|
info(model, markdown=markdown, silent=silent)
|
||||||
|
|
||||||
|
@ -30,14 +32,16 @@ def info(
|
||||||
) -> Union[str, dict]:
|
) -> Union[str, dict]:
|
||||||
msg = Printer(no_print=silent, pretty=not silent)
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
if model:
|
if model:
|
||||||
title = f"Info about model '{model}'"
|
title = f"Info about pipeline '{model}'"
|
||||||
data = info_model(model, silent=silent)
|
data = info_model(model, silent=silent)
|
||||||
else:
|
else:
|
||||||
title = "Info about spaCy"
|
title = "Info about spaCy"
|
||||||
data = info_spacy()
|
data = info_spacy()
|
||||||
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
|
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
|
||||||
if "Models" in data and isinstance(data["Models"], dict):
|
if "Pipelines" in data and isinstance(data["Pipelines"], dict):
|
||||||
data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
|
data["Pipelines"] = ", ".join(
|
||||||
|
f"{n} ({v})" for n, v in data["Pipelines"].items()
|
||||||
|
)
|
||||||
markdown_data = get_markdown(data, title=title)
|
markdown_data = get_markdown(data, title=title)
|
||||||
if markdown:
|
if markdown:
|
||||||
if not silent:
|
if not silent:
|
||||||
|
@ -63,7 +67,7 @@ def info_spacy() -> Dict[str, any]:
|
||||||
"Location": str(Path(__file__).parent.parent),
|
"Location": str(Path(__file__).parent.parent),
|
||||||
"Platform": platform.platform(),
|
"Platform": platform.platform(),
|
||||||
"Python version": platform.python_version(),
|
"Python version": platform.python_version(),
|
||||||
"Models": all_models,
|
"Pipelines": all_models,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -81,7 +85,7 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
||||||
model_path = model
|
model_path = model
|
||||||
meta_path = model_path / "meta.json"
|
meta_path = model_path / "meta.json"
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||||
meta = srsly.read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
if model_path.resolve() != model_path:
|
if model_path.resolve() != model_path:
|
||||||
meta["source"] = str(model_path.resolve())
|
meta["source"] = str(model_path.resolve())
|
||||||
|
|
|
@ -27,7 +27,7 @@ def init_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||||
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
|
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
@ -37,6 +37,8 @@ def init_config_cli(
|
||||||
specified via the CLI arguments, this command generates a config with the
|
specified via the CLI arguments, this command generates a config with the
|
||||||
optimal settings for your use case. This includes the choice of architecture,
|
optimal settings for your use case. This includes the choice of architecture,
|
||||||
pretrained weights and related hyperparameters.
|
pretrained weights and related hyperparameters.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#init-config
|
||||||
"""
|
"""
|
||||||
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
||||||
optimize = optimize.value
|
optimize = optimize.value
|
||||||
|
@ -59,6 +61,8 @@ def init_fill_config_cli(
|
||||||
functions for their default values and update the base config. This command
|
functions for their default values and update the base config. This command
|
||||||
can be used with a config generated via the training quickstart widget:
|
can be used with a config generated via the training quickstart widget:
|
||||||
https://nightly.spacy.io/usage/training#quickstart
|
https://nightly.spacy.io/usage/training#quickstart
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#init-fill-config
|
||||||
"""
|
"""
|
||||||
fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
|
fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
|
||||||
|
|
||||||
|
@ -168,7 +172,7 @@ def save_config(
|
||||||
output_file.parent.mkdir(parents=True)
|
output_file.parent.mkdir(parents=True)
|
||||||
config.to_disk(output_file, interpolate=False)
|
config.to_disk(output_file, interpolate=False)
|
||||||
msg.good("Saved config", output_file)
|
msg.good("Saved config", output_file)
|
||||||
msg.text("You can now add your data and train your model:")
|
msg.text("You can now add your data and train your pipeline:")
|
||||||
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
|
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
|
||||||
if not no_print:
|
if not no_print:
|
||||||
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
||||||
|
|
|
@ -28,7 +28,7 @@ except ImportError:
|
||||||
DEFAULT_OOV_PROB = -20
|
DEFAULT_OOV_PROB = -20
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("model")
|
@init_cli.command("vocab")
|
||||||
@app.command(
|
@app.command(
|
||||||
"init-model",
|
"init-model",
|
||||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
@ -37,8 +37,8 @@ DEFAULT_OOV_PROB = -20
|
||||||
def init_model_cli(
|
def init_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
lang: str = Arg(..., help="Model language"),
|
lang: str = Arg(..., help="Pipeline language"),
|
||||||
output_dir: Path = Arg(..., help="Model output directory"),
|
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||||
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||||
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||||
|
@ -46,19 +46,22 @@ def init_model_cli(
|
||||||
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||||
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
|
model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
|
||||||
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
|
base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create a new model from raw data. If vectors are provided in Word2Vec format,
|
Create a new blank pipeline directory with vocab and vectors from raw data.
|
||||||
they can be either a .txt or zipped as a .zip or .tar.gz.
|
If vectors are provided in Word2Vec format, they can be either a .txt or
|
||||||
|
zipped as a .zip or .tar.gz.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#init-vocab
|
||||||
"""
|
"""
|
||||||
if ctx.command.name == "init-model":
|
if ctx.command.name == "init-model":
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"The init-model command is now available via the 'init model' "
|
"The init-model command is now called 'init vocab'. You can run "
|
||||||
"subcommand (without the hyphen). You can run python -m spacy init "
|
"'python -m spacy init --help' for an overview of the other "
|
||||||
"--help for an overview of the other available initialization commands."
|
"available initialization commands."
|
||||||
)
|
)
|
||||||
init_model(
|
init_model(
|
||||||
lang,
|
lang,
|
||||||
|
@ -115,10 +118,10 @@ def init_model(
|
||||||
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
||||||
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
|
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
|
||||||
|
|
||||||
with msg.loading("Creating model..."):
|
with msg.loading("Creating blank pipeline..."):
|
||||||
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
||||||
|
|
||||||
msg.good("Successfully created model")
|
msg.good("Successfully created blank pipeline")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
add_vectors(
|
add_vectors(
|
||||||
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
|
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
|
||||||
|
@ -242,7 +245,8 @@ def add_vectors(
|
||||||
if vectors_data is not None:
|
if vectors_data is not None:
|
||||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||||
if name is None:
|
if name is None:
|
||||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
|
# TODO: Is this correct? Does this matter?
|
||||||
|
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
|
||||||
else:
|
else:
|
||||||
nlp.vocab.vectors.name = name
|
nlp.vocab.vectors.name = name
|
||||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||||
|
|
|
@ -14,23 +14,25 @@ from .. import about
|
||||||
@app.command("package")
|
@app.command("package")
|
||||||
def package_cli(
|
def package_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
|
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
||||||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||||
no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
|
no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
|
||||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
|
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Generate an installable Python package for a model. Includes model data,
|
Generate an installable Python package for a pipeline. Includes binary data,
|
||||||
meta and required installation files. A new directory will be created in the
|
meta and required installation files. A new directory will be created in the
|
||||||
specified output directory, and model data will be copied over. If
|
specified output directory, and the data will be copied over. If
|
||||||
--create-meta is set and a meta.json already exists in the output directory,
|
--create-meta is set and a meta.json already exists in the output directory,
|
||||||
the existing values will be used as the defaults in the command-line prompt.
|
the existing values will be used as the defaults in the command-line prompt.
|
||||||
After packaging, "python setup.py sdist" is run in the package directory,
|
After packaging, "python setup.py sdist" is run in the package directory,
|
||||||
which will create a .tar.gz archive that can be installed via "pip install".
|
which will create a .tar.gz archive that can be installed via "pip install".
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#package
|
||||||
"""
|
"""
|
||||||
package(
|
package(
|
||||||
input_dir,
|
input_dir,
|
||||||
|
@ -59,14 +61,14 @@ def package(
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not input_path or not input_path.exists():
|
if not input_path or not input_path.exists():
|
||||||
msg.fail("Can't locate model data", input_path, exits=1)
|
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
msg.fail("Output directory not found", output_path, exits=1)
|
msg.fail("Output directory not found", output_path, exits=1)
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||||
meta_path = meta_path or input_dir / "meta.json"
|
meta_path = meta_path or input_dir / "meta.json"
|
||||||
if not meta_path.exists() or not meta_path.is_file():
|
if not meta_path.exists() or not meta_path.is_file():
|
||||||
msg.fail("Can't load model meta.json", meta_path, exits=1)
|
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
|
||||||
meta = srsly.read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
meta = get_meta(input_dir, meta)
|
meta = get_meta(input_dir, meta)
|
||||||
if version is not None:
|
if version is not None:
|
||||||
|
@ -77,7 +79,7 @@ def package(
|
||||||
meta = generate_meta(meta, msg)
|
meta = generate_meta(meta, msg)
|
||||||
errors = validate(ModelMetaSchema, meta)
|
errors = validate(ModelMetaSchema, meta)
|
||||||
if errors:
|
if errors:
|
||||||
msg.fail("Invalid model meta.json")
|
msg.fail("Invalid pipeline meta.json")
|
||||||
print("\n".join(errors))
|
print("\n".join(errors))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
model_name = meta["lang"] + "_" + meta["name"]
|
model_name = meta["lang"] + "_" + meta["name"]
|
||||||
|
@ -118,7 +120,7 @@ def get_meta(
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
meta = {
|
meta = {
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"name": "model",
|
"name": "pipeline",
|
||||||
"version": "0.0.0",
|
"version": "0.0.0",
|
||||||
"description": "",
|
"description": "",
|
||||||
"author": "",
|
"author": "",
|
||||||
|
@ -143,10 +145,10 @@ def get_meta(
|
||||||
def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
|
def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
|
||||||
meta = existing_meta or {}
|
meta = existing_meta or {}
|
||||||
settings = [
|
settings = [
|
||||||
("lang", "Model language", meta.get("lang", "en")),
|
("lang", "Pipeline language", meta.get("lang", "en")),
|
||||||
("name", "Model name", meta.get("name", "model")),
|
("name", "Pipeline name", meta.get("name", "pipeline")),
|
||||||
("version", "Model version", meta.get("version", "0.0.0")),
|
("version", "Package version", meta.get("version", "0.0.0")),
|
||||||
("description", "Model description", meta.get("description", None)),
|
("description", "Package description", meta.get("description", None)),
|
||||||
("author", "Author", meta.get("author", None)),
|
("author", "Author", meta.get("author", None)),
|
||||||
("email", "Author email", meta.get("email", None)),
|
("email", "Author email", meta.get("email", None)),
|
||||||
("url", "Author website", meta.get("url", None)),
|
("url", "Author website", meta.get("url", None)),
|
||||||
|
@ -154,8 +156,8 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
|
||||||
]
|
]
|
||||||
msg.divider("Generating meta.json")
|
msg.divider("Generating meta.json")
|
||||||
msg.text(
|
msg.text(
|
||||||
"Enter the package settings for your model. The following information "
|
"Enter the package settings for your pipeline. The following information "
|
||||||
"will be read from your model data: pipeline, vectors."
|
"will be read from your pipeline data: pipeline, vectors."
|
||||||
)
|
)
|
||||||
for setting, desc, default in settings:
|
for setting, desc, default in settings:
|
||||||
response = get_raw_input(desc, default)
|
response = get_raw_input(desc, default)
|
||||||
|
|
|
@ -31,7 +31,7 @@ def pretrain_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
|
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
|
||||||
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
|
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||||
|
@ -57,6 +57,8 @@ def pretrain_cli(
|
||||||
To load the weights back in during 'spacy train', you need to ensure
|
To load the weights back in during 'spacy train', you need to ensure
|
||||||
all settings are the same between pretraining and training. Ideally,
|
all settings are the same between pretraining and training. Ideally,
|
||||||
this is done by using the same config file for both commands.
|
this is done by using the same config file for both commands.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#pretrain
|
||||||
"""
|
"""
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
|
@ -377,9 +379,8 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
|
||||||
if resume_path:
|
if resume_path:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Output directory is not empty.",
|
"Output directory is not empty.",
|
||||||
"If you're resuming a run from a previous model in this directory, "
|
"If you're resuming a run in this directory, the old weights "
|
||||||
"the old models for the consecutive epochs will be overwritten "
|
"for the consecutive epochs will be overwritten with the new ones.",
|
||||||
"with the new ones.",
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
|
|
|
@ -19,7 +19,7 @@ from ..util import load_model
|
||||||
def profile_cli(
|
def profile_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read current calling context
|
ctx: typer.Context, # This is only used to read current calling context
|
||||||
model: str = Arg(..., help="Model to load"),
|
model: str = Arg(..., help="Trained pipeline to load"),
|
||||||
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
|
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
|
||||||
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
|
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
@ -29,6 +29,8 @@ def profile_cli(
|
||||||
Input should be formatted as one JSON object per line with a key "text".
|
Input should be formatted as one JSON object per line with a key "text".
|
||||||
It can either be provided as a JSONL file, or be read from sys.stdin.
|
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#debug-profile
|
||||||
"""
|
"""
|
||||||
if ctx.parent.command.name == NAME: # called as top-level command
|
if ctx.parent.command.name == NAME: # called as top-level command
|
||||||
msg.warn(
|
msg.warn(
|
||||||
|
@ -60,9 +62,9 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
|
||||||
inputs, _ = zip(*imdb_train)
|
inputs, _ = zip(*imdb_train)
|
||||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
||||||
inputs = inputs[:n_inputs]
|
inputs = inputs[:n_inputs]
|
||||||
with msg.loading(f"Loading model '{model}'..."):
|
with msg.loading(f"Loading pipeline '{model}'..."):
|
||||||
nlp = load_model(model)
|
nlp = load_model(model)
|
||||||
msg.good(f"Loaded model '{model}'")
|
msg.good(f"Loaded pipeline '{model}'")
|
||||||
texts = list(itertools.islice(inputs, n_texts))
|
texts = list(itertools.islice(inputs, n_texts))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
|
|
|
@ -20,6 +20,8 @@ def project_assets_cli(
|
||||||
defined in the "assets" section of the project.yml. If a checksum is
|
defined in the "assets" section of the project.yml. If a checksum is
|
||||||
provided in the project.yml, the file is only downloaded if no local file
|
provided in the project.yml, the file is only downloaded if no local file
|
||||||
with the same checksum exists.
|
with the same checksum exists.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-assets
|
||||||
"""
|
"""
|
||||||
project_assets(project_dir)
|
project_assets(project_dir)
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,8 @@ def project_clone_cli(
|
||||||
only download the files from the given subdirectory. The GitHub repo
|
only download the files from the given subdirectory. The GitHub repo
|
||||||
defaults to the official spaCy template repo, but can be customized
|
defaults to the official spaCy template repo, but can be customized
|
||||||
(including using a private repo).
|
(including using a private repo).
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-clone
|
||||||
"""
|
"""
|
||||||
if dest is None:
|
if dest is None:
|
||||||
dest = Path.cwd() / name
|
dest = Path.cwd() / name
|
||||||
|
|
|
@ -43,6 +43,8 @@ def project_document_cli(
|
||||||
hidden markers are added so you can add custom content before or after the
|
hidden markers are added so you can add custom content before or after the
|
||||||
auto-generated section and only the auto-generated docs will be replaced
|
auto-generated section and only the auto-generated docs will be replaced
|
||||||
when you re-run the command.
|
when you re-run the command.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-document
|
||||||
"""
|
"""
|
||||||
project_document(project_dir, output_file, no_emoji=no_emoji)
|
project_document(project_dir, output_file, no_emoji=no_emoji)
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,10 @@ def project_update_dvc_cli(
|
||||||
"""Auto-generate Data Version Control (DVC) config. A DVC
|
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||||
project can only define one pipeline, so you need to specify one workflow
|
project can only define one pipeline, so you need to specify one workflow
|
||||||
defined in the project.yml. If no workflow is specified, the first defined
|
defined in the project.yml. If no workflow is specified, the first defined
|
||||||
workflow is used. The DVC config will only be updated if the project.yml changed.
|
workflow is used. The DVC config will only be updated if the project.yml
|
||||||
|
changed.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-dvc
|
||||||
"""
|
"""
|
||||||
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,9 @@ def project_pull_cli(
|
||||||
"""Retrieve available precomputed outputs from a remote storage.
|
"""Retrieve available precomputed outputs from a remote storage.
|
||||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
You can alias remotes in your project.yml by mapping them to storage paths.
|
||||||
A storage can be anything that the smart-open library can upload to, e.g.
|
A storage can be anything that the smart-open library can upload to, e.g.
|
||||||
gcs, aws, ssh, local directories etc
|
AWS, Google Cloud Storage, SSH, local directories etc.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-pull
|
||||||
"""
|
"""
|
||||||
for url, output_path in project_pull(project_dir, remote):
|
for url, output_path in project_pull(project_dir, remote):
|
||||||
if url is not None:
|
if url is not None:
|
||||||
|
@ -38,5 +40,6 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||||
yield url, output_path
|
yield url, output_path
|
||||||
|
|
||||||
if cmd.get("outptus") and all(loc.exists() for loc in cmd["outputs"]):
|
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
||||||
|
if all(loc.exists() for loc in out_locs):
|
||||||
update_lockfile(project_dir, cmd)
|
update_lockfile(project_dir, cmd)
|
||||||
|
|
|
@ -13,9 +13,12 @@ def project_push_cli(
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
|
"""Persist outputs to a remote storage. You can alias remotes in your
|
||||||
by mapping them to storage paths. A storage can be anything that the smart-open
|
project.yml by mapping them to storage paths. A storage can be anything that
|
||||||
library can upload to, e.g. gcs, aws, ssh, local directories etc
|
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
|
||||||
|
local directories etc.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-push
|
||||||
"""
|
"""
|
||||||
for output_path, url in project_push(project_dir, remote):
|
for output_path, url in project_push(project_dir, remote):
|
||||||
if url is None:
|
if url is None:
|
||||||
|
@ -42,10 +45,19 @@ def project_push(project_dir: Path, remote: str):
|
||||||
)
|
)
|
||||||
for output_path in cmd.get("outputs", []):
|
for output_path in cmd.get("outputs", []):
|
||||||
output_loc = project_dir / output_path
|
output_loc = project_dir / output_path
|
||||||
if output_loc.exists():
|
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||||
url = storage.push(
|
url = storage.push(
|
||||||
output_path,
|
output_path,
|
||||||
command_hash=cmd_hash,
|
command_hash=cmd_hash,
|
||||||
content_hash=get_content_hash(output_loc),
|
content_hash=get_content_hash(output_loc),
|
||||||
)
|
)
|
||||||
yield output_path, url
|
yield output_path, url
|
||||||
|
|
||||||
|
|
||||||
|
def _is_not_empty_dir(loc: Path):
|
||||||
|
if not loc.is_dir():
|
||||||
|
return True
|
||||||
|
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
|
@ -24,6 +24,8 @@ def project_run_cli(
|
||||||
name is specified, all commands in the workflow are run, in order. If
|
name is specified, all commands in the workflow are run, in order. If
|
||||||
commands define dependencies and/or outputs, they will only be re-run if
|
commands define dependencies and/or outputs, they will only be re-run if
|
||||||
state has changed.
|
state has changed.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#project-run
|
||||||
"""
|
"""
|
||||||
if show_help or not subcommand:
|
if show_help or not subcommand:
|
||||||
print_run_help(project_dir, subcommand)
|
print_run_help(project_dir, subcommand)
|
||||||
|
|
|
@ -29,7 +29,7 @@ name = "{{ transformer["name"] }}"
|
||||||
tokenizer_config = {"use_fast": true}
|
tokenizer_config = {"use_fast": true}
|
||||||
|
|
||||||
[components.transformer.model.get_spans]
|
[components.transformer.model.get_spans]
|
||||||
@span_getters = "strided_spans.v1"
|
@span_getters = "spacy-transformers.strided_spans.v1"
|
||||||
window = 128
|
window = 128
|
||||||
stride = 96
|
stride = 96
|
||||||
|
|
||||||
|
@ -186,11 +186,14 @@ accumulate_gradient = {{ transformer["size_factor"] }}
|
||||||
[training.optimizer]
|
[training.optimizer]
|
||||||
@optimizers = "Adam.v1"
|
@optimizers = "Adam.v1"
|
||||||
|
|
||||||
|
|
||||||
|
{% if use_transformer -%}
|
||||||
[training.optimizer.learn_rate]
|
[training.optimizer.learn_rate]
|
||||||
@schedules = "warmup_linear.v1"
|
@schedules = "warmup_linear.v1"
|
||||||
warmup_steps = 250
|
warmup_steps = 250
|
||||||
total_steps = 20000
|
total_steps = 20000
|
||||||
initial_rate = 5e-5
|
initial_rate = 5e-5
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
[training.train_corpus]
|
[training.train_corpus]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
|
@ -204,13 +207,13 @@ max_length = 0
|
||||||
|
|
||||||
{% if use_transformer %}
|
{% if use_transformer %}
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "batch_by_padded.v1"
|
@batchers = "spacy.batch_by_padded.v1"
|
||||||
discard_oversize = true
|
discard_oversize = true
|
||||||
size = 2000
|
size = 2000
|
||||||
buffer = 256
|
buffer = 256
|
||||||
{%- else %}
|
{%- else %}
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
tolerance = 0.2
|
tolerance = 0.2
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ def train_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
|
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||||
|
@ -34,7 +34,7 @@ def train_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Train or update a spaCy model. Requires data in spaCy's binary format. To
|
Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
|
||||||
convert data from other formats, use the `spacy convert` command. The
|
convert data from other formats, use the `spacy convert` command. The
|
||||||
config file includes all settings and hyperparameters used during training.
|
config file includes all settings and hyperparameters used during training.
|
||||||
To override settings in the config, e.g. settings that point to local
|
To override settings in the config, e.g. settings that point to local
|
||||||
|
@ -44,6 +44,8 @@ def train_cli(
|
||||||
lets you pass in a Python file that's imported before training. It can be
|
lets you pass in a Python file that's imported before training. It can be
|
||||||
used to register custom functions and architectures that can then be
|
used to register custom functions and architectures that can then be
|
||||||
referenced in the config.
|
referenced in the config.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#train
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
||||||
verify_cli_args(config_path, output_path)
|
verify_cli_args(config_path, output_path)
|
||||||
|
@ -113,12 +115,12 @@ def train(
|
||||||
# Load morph rules
|
# Load morph rules
|
||||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||||
|
|
||||||
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
|
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
|
||||||
if weights_data is not None:
|
if weights_data is not None:
|
||||||
tok2vec_path = config["pretraining"].get("tok2vec_model", None)
|
tok2vec_path = config["pretraining"].get("tok2vec_model", None)
|
||||||
if tok2vec_path is None:
|
if tok2vec_path is None:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"To use a pretrained tok2vec model, the config needs to specify which "
|
f"To use pretrained tok2vec weights, the config needs to specify which "
|
||||||
f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
|
f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
@ -183,7 +185,7 @@ def train(
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
else:
|
else:
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
msg.good(f"Saved model to output directory {final_model_path}")
|
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(iterator, batcher, max_epochs: int):
|
def create_train_batches(iterator, batcher, max_epochs: int):
|
||||||
|
|
|
@ -13,9 +13,11 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||||
@app.command("validate")
|
@app.command("validate")
|
||||||
def validate_cli():
|
def validate_cli():
|
||||||
"""
|
"""
|
||||||
Validate the currently installed models and spaCy version. Checks if the
|
Validate the currently installed pipeline packages and spaCy version. Checks
|
||||||
installed models are compatible and shows upgrade instructions if available.
|
if the installed packages are compatible and shows upgrade instructions if
|
||||||
Should be run after `pip install -U spacy`.
|
available. Should be run after `pip install -U spacy`.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#validate
|
||||||
"""
|
"""
|
||||||
validate()
|
validate()
|
||||||
|
|
||||||
|
@ -25,13 +27,13 @@ def validate() -> None:
|
||||||
spacy_version = get_base_version(about.__version__)
|
spacy_version = get_base_version(about.__version__)
|
||||||
current_compat = compat.get(spacy_version, {})
|
current_compat = compat.get(spacy_version, {})
|
||||||
if not current_compat:
|
if not current_compat:
|
||||||
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
|
msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
|
||||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
spacy_dir = Path(__file__).parent.parent
|
spacy_dir = Path(__file__).parent.parent
|
||||||
|
|
||||||
msg.divider(f"Installed models (spaCy v{about.__version__})")
|
msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
|
||||||
msg.info(f"spaCy installation: {spacy_dir}")
|
msg.info(f"spaCy installation: {spacy_dir}")
|
||||||
|
|
||||||
if model_pkgs:
|
if model_pkgs:
|
||||||
|
@ -47,15 +49,15 @@ def validate() -> None:
|
||||||
rows.append((data["name"], data["spacy"], version, comp))
|
rows.append((data["name"], data["spacy"], version, comp))
|
||||||
msg.table(rows, header=header)
|
msg.table(rows, header=header)
|
||||||
else:
|
else:
|
||||||
msg.text("No models found in your current environment.", exits=0)
|
msg.text("No pipeline packages found in your current environment.", exits=0)
|
||||||
if update_models:
|
if update_models:
|
||||||
msg.divider("Install updates")
|
msg.divider("Install updates")
|
||||||
msg.text("Use the following commands to update the model packages:")
|
msg.text("Use the following commands to update the packages:")
|
||||||
cmd = "python -m spacy download {}"
|
cmd = "python -m spacy download {}"
|
||||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||||
if na_models:
|
if na_models:
|
||||||
msg.info(
|
msg.info(
|
||||||
f"The following models are custom spaCy models or not "
|
f"The following packages are custom spaCy pipelines or not "
|
||||||
f"available for spaCy v{about.__version__}:",
|
f"available for spaCy v{about.__version__}:",
|
||||||
", ".join(na_models),
|
", ".join(na_models),
|
||||||
)
|
)
|
||||||
|
|
|
@ -69,7 +69,7 @@ max_length = 2000
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
tolerance = 0.2
|
tolerance = 0.2
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
"""
|
"""
|
||||||
spaCy's built in visualization suite for dependencies and named entities.
|
spaCy's built in visualization suite for dependencies and named entities.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy
|
DOCS: https://nightly.spacy.io/api/top-level#displacy
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
||||||
import warnings
|
import warnings
|
||||||
|
@ -37,8 +37,8 @@ def render(
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (str): Rendered HTML markup.
|
RETURNS (str): Rendered HTML markup.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
DOCS: https://nightly.spacy.io/api/top-level#displacy.render
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
factories = {
|
factories = {
|
||||||
"dep": (DependencyRenderer, parse_deps),
|
"dep": (DependencyRenderer, parse_deps),
|
||||||
|
@ -88,8 +88,8 @@ def serve(
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
host (str): Host to serve visualisation.
|
host (str): Host to serve visualisation.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
DOCS: https://nightly.spacy.io/api/top-level#displacy.serve
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
|
|
||||||
|
|
|
@ -249,6 +249,12 @@ class EntityRenderer:
|
||||||
colors = dict(DEFAULT_LABEL_COLORS)
|
colors = dict(DEFAULT_LABEL_COLORS)
|
||||||
user_colors = registry.displacy_colors.get_all()
|
user_colors = registry.displacy_colors.get_all()
|
||||||
for user_color in user_colors.values():
|
for user_color in user_colors.values():
|
||||||
|
if callable(user_color):
|
||||||
|
# Since this comes from the function registry, we want to make
|
||||||
|
# sure we support functions that *return* a dict of colors
|
||||||
|
user_color = user_color()
|
||||||
|
if not isinstance(user_color, dict):
|
||||||
|
raise ValueError(Errors.E925.format(obj=type(user_color)))
|
||||||
colors.update(user_color)
|
colors.update(user_color)
|
||||||
colors.update(options.get("colors", {}))
|
colors.update(options.get("colors", {}))
|
||||||
self.default_color = DEFAULT_ENTITY_COLOR
|
self.default_color = DEFAULT_ENTITY_COLOR
|
||||||
|
@ -323,7 +329,11 @@ class EntityRenderer:
|
||||||
else:
|
else:
|
||||||
markup += entity
|
markup += entity
|
||||||
offset = end
|
offset = end
|
||||||
markup += escape_html(text[offset:])
|
fragments = text[offset:].split("\n")
|
||||||
|
for i, fragment in enumerate(fragments):
|
||||||
|
markup += escape_html(fragment)
|
||||||
|
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||||
|
markup += "</br>"
|
||||||
markup = TPL_ENTS.format(content=markup, dir=self.direction)
|
markup = TPL_ENTS.format(content=markup, dir=self.direction)
|
||||||
if title:
|
if title:
|
||||||
markup = TPL_TITLE.format(title=title) + markup
|
markup = TPL_TITLE.format(title=title) + markup
|
||||||
|
|
|
@ -22,7 +22,7 @@ class Warnings:
|
||||||
"generate a dependency visualization for it. Make sure the Doc "
|
"generate a dependency visualization for it. Make sure the Doc "
|
||||||
"was processed with a model that supports dependency parsing, and "
|
"was processed with a model that supports dependency parsing, and "
|
||||||
"not just a language class like `English()`. For more info, see "
|
"not just a language class like `English()`. For more info, see "
|
||||||
"the docs:\nhttps://spacy.io/usage/models")
|
"the docs:\nhttps://nightly.spacy.io/usage/models")
|
||||||
W006 = ("No entities to visualize found in Doc object. If this is "
|
W006 = ("No entities to visualize found in Doc object. If this is "
|
||||||
"surprising to you, make sure the Doc was processed using a model "
|
"surprising to you, make sure the Doc was processed using a model "
|
||||||
"that supports named entity recognition, and check the `doc.ents` "
|
"that supports named entity recognition, and check the `doc.ents` "
|
||||||
|
@ -76,6 +76,10 @@ class Warnings:
|
||||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||||
"package installed. The languages with lexeme normalization tables "
|
"package installed. The languages with lexeme normalization tables "
|
||||||
"are currently: {langs}")
|
"are currently: {langs}")
|
||||||
|
W034 = ("Please install the package spacy-lookups-data in order to include "
|
||||||
|
"the default lexeme normalization table for the language '{lang}'.")
|
||||||
|
W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
|
||||||
|
"attribute or operator.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
W090 = ("Could not locate any binary .spacy files in path '{path}'.")
|
W090 = ("Could not locate any binary .spacy files in path '{path}'.")
|
||||||
|
@ -147,7 +151,7 @@ class Errors:
|
||||||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||||
"a model installed or loaded, or because your model doesn't "
|
"a model installed or loaded, or because your model doesn't "
|
||||||
"include word vectors. For more info, see the docs:\n"
|
"include word vectors. For more info, see the docs:\n"
|
||||||
"https://spacy.io/usage/models")
|
"https://nightly.spacy.io/usage/models")
|
||||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||||
E014 = ("Unknown tag ID: {tag}")
|
E014 = ("Unknown tag ID: {tag}")
|
||||||
|
@ -181,7 +185,7 @@ class Errors:
|
||||||
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
||||||
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
||||||
"statistical model to be installed and loaded. For more info, see "
|
"statistical model to be installed and loaded. For more info, see "
|
||||||
"the documentation:\nhttps://spacy.io/usage/models")
|
"the documentation:\nhttps://nightly.spacy.io/usage/models")
|
||||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||||
"component to the pipeline with: "
|
"component to the pipeline with: "
|
||||||
"nlp.add_pipe('sentencizer'). "
|
"nlp.add_pipe('sentencizer'). "
|
||||||
|
@ -284,17 +288,17 @@ class Errors:
|
||||||
"Span objects, or dicts if set to manual=True.")
|
"Span objects, or dicts if set to manual=True.")
|
||||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||||
"phrase pattern (string) but got:\n{pattern}")
|
"phrase pattern (string) but got:\n{pattern}")
|
||||||
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
|
||||||
E099 = ("First node of pattern should be a root node. The root should "
|
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
|
||||||
"only contain NODE_NAME.")
|
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
|
||||||
E100 = ("Nodes apart from the root should contain NODE_NAME, NBOR_NAME and "
|
E100 = ("Nodes other than the anchor node should all contain LEFT_ID, "
|
||||||
"NBOR_RELOP.")
|
"REL_OP and RIGHT_ID.")
|
||||||
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
|
||||||
"have been declared in previous edges.")
|
"have been declared in previous edges.")
|
||||||
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
||||||
"tokens to merge. If you want to find the longest non-overlapping "
|
"tokens to merge. If you want to find the longest non-overlapping "
|
||||||
"spans, you can use the util.filter_spans helper:\n"
|
"spans, you can use the util.filter_spans helper:\n"
|
||||||
"https://spacy.io/api/top-level#util.filter_spans")
|
"https://nightly.spacy.io/api/top-level#util.filter_spans")
|
||||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||||
"token can only be part of one entity, so make sure the entities "
|
"token can only be part of one entity, so make sure the entities "
|
||||||
"you're setting don't overlap.")
|
"you're setting don't overlap.")
|
||||||
|
@ -364,10 +368,10 @@ class Errors:
|
||||||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
||||||
"to provide a valid JSON object as input with either the `text` "
|
"to provide a valid JSON object as input with either the `text` "
|
||||||
"or `tokens` key. For more info, see the docs:\n"
|
"or `tokens` key. For more info, see the docs:\n"
|
||||||
"https://spacy.io/api/cli#pretrain-jsonl")
|
"https://nightly.spacy.io/api/cli#pretrain-jsonl")
|
||||||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||||
"includes either the `text` or `tokens` key. For more info, see "
|
"includes either the `text` or `tokens` key. For more info, see "
|
||||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
"the docs:\nhttps://nightly.spacy.io/api/cli#pretrain-jsonl")
|
||||||
E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
|
E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
|
||||||
"kb.add_entity and kb.add_alias to add entries.")
|
"kb.add_entity and kb.add_alias to add entries.")
|
||||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||||
|
@ -474,8 +478,13 @@ class Errors:
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
||||||
|
"can not be combined with adding a pretrained Tok2Vec layer.")
|
||||||
|
E201 = ("Span index out of range.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
|
||||||
|
"mapping label names to colors but got: {obj}")
|
||||||
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
||||||
"doesn't work because it's an immutable computed property. If you "
|
"doesn't work because it's an immutable computed property. If you "
|
||||||
"need to modify the pipeline, use the built-in methods like "
|
"need to modify the pipeline, use the built-in methods like "
|
||||||
|
@ -652,6 +661,9 @@ class Errors:
|
||||||
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
||||||
"`ORTH` and `NORM`.")
|
"`ORTH` and `NORM`.")
|
||||||
E1006 = ("Unable to initialize {name} model with 0 labels.")
|
E1006 = ("Unable to initialize {name} model with 0 labels.")
|
||||||
|
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
|
||||||
|
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
|
||||||
|
"that you are providing a list of patterns as `List[List[dict]]`.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -11,7 +11,7 @@ ItemT = TypeVar("ItemT")
|
||||||
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
|
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
|
||||||
|
|
||||||
|
|
||||||
@registry.batchers("batch_by_padded.v1")
|
@registry.batchers("spacy.batch_by_padded.v1")
|
||||||
def configure_minibatch_by_padded_size(
|
def configure_minibatch_by_padded_size(
|
||||||
*,
|
*,
|
||||||
size: Sizing,
|
size: Sizing,
|
||||||
|
@ -46,7 +46,7 @@ def configure_minibatch_by_padded_size(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.batchers("batch_by_words.v1")
|
@registry.batchers("spacy.batch_by_words.v1")
|
||||||
def configure_minibatch_by_words(
|
def configure_minibatch_by_words(
|
||||||
*,
|
*,
|
||||||
size: Sizing,
|
size: Sizing,
|
||||||
|
@ -70,7 +70,7 @@ def configure_minibatch_by_words(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.batchers("batch_by_sequence.v1")
|
@registry.batchers("spacy.batch_by_sequence.v1")
|
||||||
def configure_minibatch(
|
def configure_minibatch(
|
||||||
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
|
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
|
||||||
) -> BatcherT:
|
) -> BatcherT:
|
||||||
|
|
|
@ -106,7 +106,7 @@ def conll_ner2docs(
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The token-per-line NER file is not formatted correctly. "
|
"The token-per-line NER file is not formatted correctly. "
|
||||||
"Try checking whitespace and delimiters. See "
|
"Try checking whitespace and delimiters. See "
|
||||||
"https://spacy.io/api/cli#convert"
|
"https://nightly.spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
length = len(cols[0])
|
length = len(cols[0])
|
||||||
words.extend(cols[0])
|
words.extend(cols[0])
|
||||||
|
|
|
@ -44,7 +44,7 @@ def read_iob(raw_sents, vocab, n_sents):
|
||||||
sent_tags = ["-"] * len(sent_words)
|
sent_tags = ["-"] * len(sent_words)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
words.extend(sent_words)
|
words.extend(sent_words)
|
||||||
tags.extend(sent_tags)
|
tags.extend(sent_tags)
|
||||||
|
|
|
@ -38,7 +38,7 @@ class Corpus:
|
||||||
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
||||||
Defaults to 0, which indicates no limit.
|
Defaults to 0, which indicates no limit.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/corpus
|
DOCS: https://nightly.spacy.io/api/corpus
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -83,7 +83,7 @@ class Corpus:
|
||||||
nlp (Language): The current nlp object.
|
nlp (Language): The current nlp object.
|
||||||
YIELDS (Example): The examples.
|
YIELDS (Example): The examples.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/corpus#call
|
DOCS: https://nightly.spacy.io/api/corpus#call
|
||||||
"""
|
"""
|
||||||
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
|
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
|
||||||
if self.gold_preproc:
|
if self.gold_preproc:
|
||||||
|
|
|
@ -21,7 +21,7 @@ cdef class Candidate:
|
||||||
algorithm which will disambiguate the various candidates to the correct one.
|
algorithm which will disambiguate the various candidates to the correct one.
|
||||||
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb/#candidate_init
|
DOCS: https://nightly.spacy.io/api/kb/#candidate_init
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||||
|
@ -79,7 +79,7 @@ cdef class KnowledgeBase:
|
||||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||||
to support entity linking of named entities to real-world concepts.
|
to support entity linking of named entities to real-world concepts.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb
|
DOCS: https://nightly.spacy.io/api/kb
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, entity_vector_length):
|
def __init__(self, Vocab vocab, entity_vector_length):
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(Language.Defaults):
|
class CzechDefaults(Language.Defaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class Czech(Language):
|
class Czech(Language):
|
||||||
|
|
38
spacy/lang/cs/examples.py
Normal file
38
spacy/lang/cs/examples.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
>>> from spacy.lang.cs.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Máma mele maso.",
|
||||||
|
"Příliš žluťoučký kůň úpěl ďábelské ódy.",
|
||||||
|
"ArcGIS je geografický informační systém určený pro práci s prostorovými daty.",
|
||||||
|
"Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat.",
|
||||||
|
"Dnes je krásné počasí.",
|
||||||
|
"Nestihl autobus, protože pozdě vstal z postele.",
|
||||||
|
"Než budeš jíst, jdi si umýt ruce.",
|
||||||
|
"Dnes je neděle.",
|
||||||
|
"Škola začíná v 8:00.",
|
||||||
|
"Poslední autobus jede v jedenáct hodin večer.",
|
||||||
|
"V roce 2020 se téměř zastavila světová ekonomika.",
|
||||||
|
"Praha je hlavní město České republiky.",
|
||||||
|
"Kdy půjdeš ven?",
|
||||||
|
"Kam pojedete na dovolenou?",
|
||||||
|
"Kolik stojí iPhone 12?",
|
||||||
|
"Průměrná mzda je 30000 Kč.",
|
||||||
|
"1. ledna 1993 byla založena Česká republika.",
|
||||||
|
"Co se stalo 21.8.1968?",
|
||||||
|
"Moje telefonní číslo je 712 345 678.",
|
||||||
|
"Můj pes má blechy.",
|
||||||
|
"Když bude přes noc více než 20°, tak nás čeká tropická noc.",
|
||||||
|
"Kolik bylo letos tropických nocí?",
|
||||||
|
"Jak to mám udělat?",
|
||||||
|
"Bydlíme ve čtvrtém patře.",
|
||||||
|
"Vysílají 30. sezonu seriálu Simpsonovi.",
|
||||||
|
"Adresa ČVUT je Thákurova 7, 166 29, Praha 6.",
|
||||||
|
"Jaké PSČ má Praha 1?",
|
||||||
|
"PSČ Prahy 1 je 110 00.",
|
||||||
|
"Za 20 minut jede vlak.",
|
||||||
|
]
|
61
spacy/lang/cs/lex_attrs.py
Normal file
61
spacy/lang/cs/lex_attrs.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"nula",
|
||||||
|
"jedna",
|
||||||
|
"dva",
|
||||||
|
"tři",
|
||||||
|
"čtyři",
|
||||||
|
"pět",
|
||||||
|
"šest",
|
||||||
|
"sedm",
|
||||||
|
"osm",
|
||||||
|
"devět",
|
||||||
|
"deset",
|
||||||
|
"jedenáct",
|
||||||
|
"dvanáct",
|
||||||
|
"třináct",
|
||||||
|
"čtrnáct",
|
||||||
|
"patnáct",
|
||||||
|
"šestnáct",
|
||||||
|
"sedmnáct",
|
||||||
|
"osmnáct",
|
||||||
|
"devatenáct",
|
||||||
|
"dvacet",
|
||||||
|
"třicet",
|
||||||
|
"čtyřicet",
|
||||||
|
"padesát",
|
||||||
|
"šedesát",
|
||||||
|
"sedmdesát",
|
||||||
|
"osmdesát",
|
||||||
|
"devadesát",
|
||||||
|
"sto",
|
||||||
|
"tisíc",
|
||||||
|
"milion",
|
||||||
|
"miliarda",
|
||||||
|
"bilion",
|
||||||
|
"biliarda",
|
||||||
|
"trilion",
|
||||||
|
"triliarda",
|
||||||
|
"kvadrilion",
|
||||||
|
"kvadriliarda",
|
||||||
|
"kvintilion",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text.lower() in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
|
@ -1,14 +1,23 @@
|
||||||
# Source: https://github.com/Alir3z4/stop-words
|
# Source: https://github.com/Alir3z4/stop-words
|
||||||
|
# Source: https://github.com/stopwords-iso/stopwords-cs/blob/master/stopwords-cs.txt
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
ačkoli
|
a
|
||||||
|
aby
|
||||||
ahoj
|
ahoj
|
||||||
|
ačkoli
|
||||||
ale
|
ale
|
||||||
|
alespoň
|
||||||
anebo
|
anebo
|
||||||
|
ani
|
||||||
|
aniž
|
||||||
ano
|
ano
|
||||||
|
atd.
|
||||||
|
atp.
|
||||||
asi
|
asi
|
||||||
aspoň
|
aspoň
|
||||||
|
až
|
||||||
během
|
během
|
||||||
bez
|
bez
|
||||||
beze
|
beze
|
||||||
|
@ -21,12 +30,14 @@ budeš
|
||||||
budete
|
budete
|
||||||
budou
|
budou
|
||||||
budu
|
budu
|
||||||
|
by
|
||||||
byl
|
byl
|
||||||
byla
|
byla
|
||||||
byli
|
byli
|
||||||
bylo
|
bylo
|
||||||
byly
|
byly
|
||||||
bys
|
bys
|
||||||
|
být
|
||||||
čau
|
čau
|
||||||
chce
|
chce
|
||||||
chceme
|
chceme
|
||||||
|
@ -35,14 +46,21 @@ chcete
|
||||||
chci
|
chci
|
||||||
chtějí
|
chtějí
|
||||||
chtít
|
chtít
|
||||||
chut'
|
chuť
|
||||||
chuti
|
chuti
|
||||||
co
|
co
|
||||||
|
což
|
||||||
|
cz
|
||||||
|
či
|
||||||
|
článek
|
||||||
|
článku
|
||||||
|
články
|
||||||
čtrnáct
|
čtrnáct
|
||||||
čtyři
|
čtyři
|
||||||
dál
|
dál
|
||||||
dále
|
dále
|
||||||
daleko
|
daleko
|
||||||
|
další
|
||||||
děkovat
|
děkovat
|
||||||
děkujeme
|
děkujeme
|
||||||
děkuji
|
děkuji
|
||||||
|
@ -50,6 +68,7 @@ den
|
||||||
deset
|
deset
|
||||||
devatenáct
|
devatenáct
|
||||||
devět
|
devět
|
||||||
|
dnes
|
||||||
do
|
do
|
||||||
dobrý
|
dobrý
|
||||||
docela
|
docela
|
||||||
|
@ -57,9 +76,15 @@ dva
|
||||||
dvacet
|
dvacet
|
||||||
dvanáct
|
dvanáct
|
||||||
dvě
|
dvě
|
||||||
|
email
|
||||||
|
ho
|
||||||
hodně
|
hodně
|
||||||
|
i
|
||||||
já
|
já
|
||||||
jak
|
jak
|
||||||
|
jakmile
|
||||||
|
jako
|
||||||
|
jakož
|
||||||
jde
|
jde
|
||||||
je
|
je
|
||||||
jeden
|
jeden
|
||||||
|
@ -69,25 +94,39 @@ jedno
|
||||||
jednou
|
jednou
|
||||||
jedou
|
jedou
|
||||||
jeho
|
jeho
|
||||||
|
jehož
|
||||||
|
jej
|
||||||
její
|
její
|
||||||
jejich
|
jejich
|
||||||
|
jejichž
|
||||||
|
jehož
|
||||||
|
jelikož
|
||||||
jemu
|
jemu
|
||||||
jen
|
jen
|
||||||
jenom
|
jenom
|
||||||
|
jenž
|
||||||
|
jež
|
||||||
ještě
|
ještě
|
||||||
jestli
|
jestli
|
||||||
jestliže
|
jestliže
|
||||||
|
ještě
|
||||||
|
ji
|
||||||
jí
|
jí
|
||||||
jich
|
jich
|
||||||
jím
|
jím
|
||||||
|
jim
|
||||||
jimi
|
jimi
|
||||||
jinak
|
jinak
|
||||||
jsem
|
jiné
|
||||||
|
již
|
||||||
jsi
|
jsi
|
||||||
jsme
|
jsme
|
||||||
|
jsem
|
||||||
jsou
|
jsou
|
||||||
jste
|
jste
|
||||||
|
k
|
||||||
kam
|
kam
|
||||||
|
každý
|
||||||
kde
|
kde
|
||||||
kdo
|
kdo
|
||||||
kdy
|
kdy
|
||||||
|
@ -96,10 +135,13 @@ ke
|
||||||
kolik
|
kolik
|
||||||
kromě
|
kromě
|
||||||
která
|
která
|
||||||
|
kterak
|
||||||
|
kterou
|
||||||
které
|
které
|
||||||
kteří
|
kteří
|
||||||
který
|
který
|
||||||
kvůli
|
kvůli
|
||||||
|
ku
|
||||||
má
|
má
|
||||||
mají
|
mají
|
||||||
málo
|
málo
|
||||||
|
@ -110,8 +152,10 @@ máte
|
||||||
mé
|
mé
|
||||||
mě
|
mě
|
||||||
mezi
|
mezi
|
||||||
|
mi
|
||||||
mí
|
mí
|
||||||
mít
|
mít
|
||||||
|
mne
|
||||||
mně
|
mně
|
||||||
mnou
|
mnou
|
||||||
moc
|
moc
|
||||||
|
@ -134,6 +178,7 @@ nás
|
||||||
náš
|
náš
|
||||||
naše
|
naše
|
||||||
naši
|
naši
|
||||||
|
načež
|
||||||
ne
|
ne
|
||||||
ně
|
ně
|
||||||
nebo
|
nebo
|
||||||
|
@ -141,6 +186,7 @@ nebyl
|
||||||
nebyla
|
nebyla
|
||||||
nebyli
|
nebyli
|
||||||
nebyly
|
nebyly
|
||||||
|
nechť
|
||||||
něco
|
něco
|
||||||
nedělá
|
nedělá
|
||||||
nedělají
|
nedělají
|
||||||
|
@ -150,6 +196,7 @@ neděláš
|
||||||
neděláte
|
neděláte
|
||||||
nějak
|
nějak
|
||||||
nejsi
|
nejsi
|
||||||
|
nejsou
|
||||||
někde
|
někde
|
||||||
někdo
|
někdo
|
||||||
nemají
|
nemají
|
||||||
|
@ -157,15 +204,22 @@ nemáme
|
||||||
nemáte
|
nemáte
|
||||||
neměl
|
neměl
|
||||||
němu
|
němu
|
||||||
|
němuž
|
||||||
není
|
není
|
||||||
nestačí
|
nestačí
|
||||||
|
ně
|
||||||
nevadí
|
nevadí
|
||||||
|
nové
|
||||||
|
nový
|
||||||
|
noví
|
||||||
než
|
než
|
||||||
nic
|
nic
|
||||||
nich
|
nich
|
||||||
|
ní
|
||||||
ním
|
ním
|
||||||
nimi
|
nimi
|
||||||
nula
|
nula
|
||||||
|
o
|
||||||
od
|
od
|
||||||
ode
|
ode
|
||||||
on
|
on
|
||||||
|
@ -179,22 +233,37 @@ pak
|
||||||
patnáct
|
patnáct
|
||||||
pět
|
pět
|
||||||
po
|
po
|
||||||
|
pod
|
||||||
|
pokud
|
||||||
pořád
|
pořád
|
||||||
|
pouze
|
||||||
potom
|
potom
|
||||||
pozdě
|
pozdě
|
||||||
|
pravé
|
||||||
před
|
před
|
||||||
|
přede
|
||||||
přes
|
přes
|
||||||
přese
|
přece
|
||||||
pro
|
pro
|
||||||
proč
|
proč
|
||||||
prosím
|
prosím
|
||||||
prostě
|
prostě
|
||||||
|
proto
|
||||||
proti
|
proti
|
||||||
|
první
|
||||||
|
právě
|
||||||
protože
|
protože
|
||||||
|
při
|
||||||
|
přičemž
|
||||||
rovně
|
rovně
|
||||||
|
s
|
||||||
se
|
se
|
||||||
sedm
|
sedm
|
||||||
sedmnáct
|
sedmnáct
|
||||||
|
si
|
||||||
|
sice
|
||||||
|
skoro
|
||||||
|
sic
|
||||||
šest
|
šest
|
||||||
šestnáct
|
šestnáct
|
||||||
skoro
|
skoro
|
||||||
|
@ -203,41 +272,69 @@ smí
|
||||||
snad
|
snad
|
||||||
spolu
|
spolu
|
||||||
sta
|
sta
|
||||||
|
svůj
|
||||||
|
své
|
||||||
|
svá
|
||||||
|
svých
|
||||||
|
svým
|
||||||
|
svými
|
||||||
|
svůj
|
||||||
sté
|
sté
|
||||||
sto
|
sto
|
||||||
|
strana
|
||||||
ta
|
ta
|
||||||
tady
|
tady
|
||||||
tak
|
tak
|
||||||
takhle
|
takhle
|
||||||
taky
|
taky
|
||||||
|
také
|
||||||
|
takže
|
||||||
tam
|
tam
|
||||||
tamhle
|
támhle
|
||||||
tamhleto
|
támhleto
|
||||||
tamto
|
tamto
|
||||||
tě
|
tě
|
||||||
tebe
|
tebe
|
||||||
tebou
|
tebou
|
||||||
ted'
|
teď
|
||||||
tedy
|
tedy
|
||||||
ten
|
ten
|
||||||
|
tento
|
||||||
|
této
|
||||||
ti
|
ti
|
||||||
|
tím
|
||||||
|
tímto
|
||||||
tisíc
|
tisíc
|
||||||
tisíce
|
tisíce
|
||||||
to
|
to
|
||||||
tobě
|
tobě
|
||||||
tohle
|
tohle
|
||||||
|
tohoto
|
||||||
|
tom
|
||||||
|
tomto
|
||||||
|
tomu
|
||||||
|
tomuto
|
||||||
toto
|
toto
|
||||||
třeba
|
třeba
|
||||||
tři
|
tři
|
||||||
třináct
|
třináct
|
||||||
trošku
|
trošku
|
||||||
|
trochu
|
||||||
|
tu
|
||||||
|
tuto
|
||||||
tvá
|
tvá
|
||||||
tvé
|
tvé
|
||||||
tvoje
|
tvoje
|
||||||
tvůj
|
tvůj
|
||||||
ty
|
ty
|
||||||
|
tyto
|
||||||
|
těm
|
||||||
|
těma
|
||||||
|
těmi
|
||||||
|
u
|
||||||
určitě
|
určitě
|
||||||
už
|
už
|
||||||
|
v
|
||||||
vám
|
vám
|
||||||
vámi
|
vámi
|
||||||
vás
|
vás
|
||||||
|
@ -247,13 +344,19 @@ vaši
|
||||||
ve
|
ve
|
||||||
večer
|
večer
|
||||||
vedle
|
vedle
|
||||||
|
více
|
||||||
vlastně
|
vlastně
|
||||||
|
však
|
||||||
|
všechen
|
||||||
všechno
|
všechno
|
||||||
všichni
|
všichni
|
||||||
vůbec
|
vůbec
|
||||||
vy
|
vy
|
||||||
vždy
|
vždy
|
||||||
|
z
|
||||||
|
zda
|
||||||
za
|
za
|
||||||
|
zde
|
||||||
zač
|
zač
|
||||||
zatímco
|
zatímco
|
||||||
ze
|
ze
|
||||||
|
|
0
spacy/lang/cs/test_text.py
Normal file
0
spacy/lang/cs/test_text.py
Normal file
|
@ -8,6 +8,14 @@ _num_words = [
|
||||||
"fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
|
"fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
|
||||||
"million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
|
"million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
|
||||||
]
|
]
|
||||||
|
_ordinal_words = [
|
||||||
|
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
|
||||||
|
"ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth",
|
||||||
|
"fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
|
||||||
|
"twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
|
||||||
|
"eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
|
||||||
|
"trillionth", "quadrillionth", "gajillionth", "bazillionth"
|
||||||
|
]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,7 +29,14 @@ def like_num(text: str) -> bool:
|
||||||
num, denom = text.split("/")
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.lower() in _num_words:
|
text_lower = text.lower()
|
||||||
|
if text_lower in _num_words:
|
||||||
|
return True
|
||||||
|
# Check ordinal number
|
||||||
|
if text_lower in _ordinal_words:
|
||||||
|
return True
|
||||||
|
if text_lower.endswith("th"):
|
||||||
|
if text_lower[:-2].isdigit():
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
|
@ -19,8 +19,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||||
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
||||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
||||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
||||||
token = doc[0]
|
for token in doclike:
|
||||||
while token and token.i < len(doclike):
|
|
||||||
if token.pos in [PROPN, NOUN, PRON]:
|
if token.pos in [PROPN, NOUN, PRON]:
|
||||||
left, right = noun_bounds(
|
left, right = noun_bounds(
|
||||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
class HebrewDefaults(Language.Defaults):
|
class HebrewDefaults(Language.Defaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||||
|
|
||||||
|
|
||||||
|
|
95
spacy/lang/he/lex_attrs.py
Normal file
95
spacy/lang/he/lex_attrs.py
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"אפס",
|
||||||
|
"אחד",
|
||||||
|
"אחת",
|
||||||
|
"שתיים",
|
||||||
|
"שתים",
|
||||||
|
"שניים",
|
||||||
|
"שנים",
|
||||||
|
"שלוש",
|
||||||
|
"שלושה",
|
||||||
|
"ארבע",
|
||||||
|
"ארבעה",
|
||||||
|
"חמש",
|
||||||
|
"חמישה",
|
||||||
|
"שש",
|
||||||
|
"שישה",
|
||||||
|
"שבע",
|
||||||
|
"שבעה",
|
||||||
|
"שמונה",
|
||||||
|
"תשע",
|
||||||
|
"תשעה",
|
||||||
|
"עשר",
|
||||||
|
"עשרה",
|
||||||
|
"אחד עשר",
|
||||||
|
"אחת עשרה",
|
||||||
|
"שנים עשר",
|
||||||
|
"שתים עשרה",
|
||||||
|
"שלושה עשר",
|
||||||
|
"שלוש עשרה",
|
||||||
|
"ארבעה עשר",
|
||||||
|
"ארבע עשרה",
|
||||||
|
"חמישה עשר",
|
||||||
|
"חמש עשרה",
|
||||||
|
"ששה עשר",
|
||||||
|
"שש עשרה",
|
||||||
|
"שבעה עשר",
|
||||||
|
"שבע עשרה",
|
||||||
|
"שמונה עשר",
|
||||||
|
"שמונה עשרה",
|
||||||
|
"תשעה עשר",
|
||||||
|
"תשע עשרה",
|
||||||
|
"עשרים",
|
||||||
|
"שלושים",
|
||||||
|
"ארבעים",
|
||||||
|
"חמישים",
|
||||||
|
"שישים",
|
||||||
|
"שבעים",
|
||||||
|
"שמונים",
|
||||||
|
"תשעים",
|
||||||
|
"מאה",
|
||||||
|
"אלף",
|
||||||
|
"מליון",
|
||||||
|
"מליארד",
|
||||||
|
"טריליון",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
_ordinal_words = [
|
||||||
|
"ראשון",
|
||||||
|
"שני",
|
||||||
|
"שלישי",
|
||||||
|
"רביעי",
|
||||||
|
"חמישי",
|
||||||
|
"שישי",
|
||||||
|
"שביעי",
|
||||||
|
"שמיני",
|
||||||
|
"תשיעי",
|
||||||
|
"עשירי",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
    """Return True if *text* looks like a number.

    A token counts as number-like when, after dropping one leading sign or
    approximation marker and all "," / "." separators, it is either made up
    of digits only, a simple "digits/digits" fraction, or listed in the
    module's cardinal (`_num_words`) or ordinal (`_ordinal_words`) tables.
    """
    # Strip a single leading "+", "-", "±" or "~", then the separators.
    candidate = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    candidate = candidate.replace(",", "").replace(".", "")
    if candidate.isdigit():
        return True

    # Fractions: exactly one "/" with digits on both sides.
    numerator, slash, denominator = candidate.partition("/")
    if slash and "/" not in denominator:
        if numerator.isdigit() and denominator.isdigit():
            return True

    # Spelled-out cardinals first, then ordinals.
    return candidate in _num_words or candidate in _ordinal_words
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
|
@ -39,7 +39,6 @@ STOP_WORDS = set(
|
||||||
בין
|
בין
|
||||||
עם
|
עם
|
||||||
עד
|
עד
|
||||||
נגר
|
|
||||||
על
|
על
|
||||||
אל
|
אל
|
||||||
מול
|
מול
|
||||||
|
@ -58,7 +57,7 @@ STOP_WORDS = set(
|
||||||
עליך
|
עליך
|
||||||
עלינו
|
עלינו
|
||||||
עליכם
|
עליכם
|
||||||
לעיכן
|
עליכן
|
||||||
עליהם
|
עליהם
|
||||||
עליהן
|
עליהן
|
||||||
כל
|
כל
|
||||||
|
@ -67,8 +66,8 @@ STOP_WORDS = set(
|
||||||
כך
|
כך
|
||||||
ככה
|
ככה
|
||||||
כזה
|
כזה
|
||||||
|
כזאת
|
||||||
זה
|
זה
|
||||||
זות
|
|
||||||
אותי
|
אותי
|
||||||
אותה
|
אותה
|
||||||
אותם
|
אותם
|
||||||
|
@ -91,7 +90,7 @@ STOP_WORDS = set(
|
||||||
איתכן
|
איתכן
|
||||||
יהיה
|
יהיה
|
||||||
תהיה
|
תהיה
|
||||||
היתי
|
הייתי
|
||||||
היתה
|
היתה
|
||||||
היה
|
היה
|
||||||
להיות
|
להיות
|
||||||
|
@ -101,8 +100,6 @@ STOP_WORDS = set(
|
||||||
עצמם
|
עצמם
|
||||||
עצמן
|
עצמן
|
||||||
עצמנו
|
עצמנו
|
||||||
עצמהם
|
|
||||||
עצמהן
|
|
||||||
מי
|
מי
|
||||||
מה
|
מה
|
||||||
איפה
|
איפה
|
||||||
|
@ -153,6 +150,7 @@ STOP_WORDS = set(
|
||||||
לאו
|
לאו
|
||||||
אי
|
אי
|
||||||
כלל
|
כלל
|
||||||
|
בעד
|
||||||
נגד
|
נגד
|
||||||
אם
|
אם
|
||||||
עם
|
עם
|
||||||
|
@ -196,7 +194,6 @@ STOP_WORDS = set(
|
||||||
אשר
|
אשר
|
||||||
ואילו
|
ואילו
|
||||||
למרות
|
למרות
|
||||||
אס
|
|
||||||
כמו
|
כמו
|
||||||
כפי
|
כפי
|
||||||
אז
|
אז
|
||||||
|
@ -204,8 +201,8 @@ STOP_WORDS = set(
|
||||||
כן
|
כן
|
||||||
לכן
|
לכן
|
||||||
לפיכך
|
לפיכך
|
||||||
מאד
|
|
||||||
עז
|
עז
|
||||||
|
מאוד
|
||||||
מעט
|
מעט
|
||||||
מעטים
|
מעטים
|
||||||
במידה
|
במידה
|
||||||
|
|
|
@ -15,4 +15,6 @@ sentences = [
|
||||||
"फ्रांस के राष्ट्रपति कौन हैं?",
|
"फ्रांस के राष्ट्रपति कौन हैं?",
|
||||||
"संयुक्त राज्यों की राजधानी क्या है?",
|
"संयुक्त राज्यों की राजधानी क्या है?",
|
||||||
"बराक ओबामा का जन्म कब हुआ था?",
|
"बराक ओबामा का जन्म कब हुआ था?",
|
||||||
|
"जवाहरलाल नेहरू भारत के पहले प्रधानमंत्री हैं।",
|
||||||
|
"राजेंद्र प्रसाद, भारत के पहले राष्ट्रपति, दो कार्यकाल के लिए कार्यालय रखने वाले एकमात्र व्यक्ति हैं।",
|
||||||
]
|
]
|
||||||
|
|
|
@ -254,7 +254,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
||||||
return text_dtokens, text_spaces
|
return text_dtokens, text_spaces
|
||||||
|
|
||||||
# align words and dtokens by referring text, and insert gap tokens for the space char spans
|
# align words and dtokens by referring text, and insert gap tokens for the space char spans
|
||||||
for word, dtoken in zip(words, dtokens):
|
for i, (word, dtoken) in enumerate(zip(words, dtokens)):
|
||||||
# skip all space tokens
|
# skip all space tokens
|
||||||
if word.isspace():
|
if word.isspace():
|
||||||
continue
|
continue
|
||||||
|
@ -275,7 +275,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
||||||
text_spaces.append(False)
|
text_spaces.append(False)
|
||||||
text_pos += len(word)
|
text_pos += len(word)
|
||||||
# poll a space char after the word
|
# poll a space char after the word
|
||||||
if text_pos < len(text) and text[text_pos] == " ":
|
if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ":
|
||||||
text_spaces[-1] = True
|
text_spaces[-1] = True
|
||||||
text_pos += 1
|
text_pos += 1
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ from .. import attrs
|
||||||
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
||||||
_tlds = set(
|
_tlds = set(
|
||||||
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
|
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
|
||||||
"name|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|"
|
"name|pro|tel|travel|xyz|icu|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|"
|
||||||
"ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|"
|
"ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|"
|
||||||
"cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|"
|
"cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|"
|
||||||
"ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|"
|
"ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|"
|
||||||
|
|
|
@ -1,7 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
|
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
16
spacy/lang/sa/__init__.py
Normal file
16
spacy/lang/sa/__init__.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
class SanskritDefaults(Language.Defaults):
    """Language defaults for Sanskrit: stop words and lexical attribute getters."""

    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
|
class Sanskrit(Language):
    """Language subclass for Sanskrit, using the language code "sa"."""

    Defaults = SanskritDefaults
    lang = "sa"
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Sanskrit"]
|
15
spacy/lang/sa/examples.py
Normal file
15
spacy/lang/sa/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.sa.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।",
|
||||||
|
"मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।",
|
||||||
|
"यस्य बुद्धिर्बलं तस्य निर्बुद्धेस्तु कुतो बलम्?",
|
||||||
|
"परो अपि हितवान् बन्धुः बन्धुः अपि अहितः परः ।",
|
||||||
|
"अहितः देहजः व्याधिः हितम् आरण्यं औषधम् ॥",
|
||||||
|
]
|
127
spacy/lang/sa/lex_attrs.py
Normal file
127
spacy/lang/sa/lex_attrs.py
Normal file
|
@ -0,0 +1,127 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
# reference 1: https://en.wikibooks.org/wiki/Sanskrit/Numbers
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"एकः",
|
||||||
|
"द्वौ",
|
||||||
|
"त्रयः",
|
||||||
|
"चत्वारः",
|
||||||
|
"पञ्च",
|
||||||
|
"षट्",
|
||||||
|
"सप्त",
|
||||||
|
"अष्ट",
|
||||||
|
"नव",
|
||||||
|
"दश",
|
||||||
|
"एकादश",
|
||||||
|
"द्वादश",
|
||||||
|
"त्रयोदश",
|
||||||
|
"चतुर्दश",
|
||||||
|
"पञ्चदश",
|
||||||
|
"षोडश",
|
||||||
|
"सप्तदश",
|
||||||
|
"अष्टादश",
|
||||||
|
"एकान्नविंशति",
|
||||||
|
"विंशति",
|
||||||
|
"एकाविंशति",
|
||||||
|
"द्वाविंशति",
|
||||||
|
"त्रयोविंशति",
|
||||||
|
"चतुर्विंशति",
|
||||||
|
"पञ्चविंशति",
|
||||||
|
"षड्विंशति",
|
||||||
|
"सप्तविंशति",
|
||||||
|
"अष्टाविंशति",
|
||||||
|
"एकान्नत्रिंशत्",
|
||||||
|
"त्रिंशत्",
|
||||||
|
"एकत्रिंशत्",
|
||||||
|
"द्वात्रिंशत्",
|
||||||
|
"त्रयत्रिंशत्",
|
||||||
|
"चतुस्त्रिंशत्",
|
||||||
|
"पञ्चत्रिंशत्",
|
||||||
|
"षट्त्रिंशत्",
|
||||||
|
"सप्तत्रिंशत्",
|
||||||
|
"अष्टात्रिंशत्",
|
||||||
|
"एकोनचत्वारिंशत्",
|
||||||
|
"चत्वारिंशत्",
|
||||||
|
"एकचत्वारिंशत्",
|
||||||
|
"द्वाचत्वारिंशत्",
|
||||||
|
"त्रयश्चत्वारिंशत्",
|
||||||
|
"चतुश्चत्वारिंशत्",
|
||||||
|
"पञ्चचत्वारिंशत्",
|
||||||
|
"षट्चत्वारिंशत्",
|
||||||
|
"सप्तचत्वारिंशत्",
|
||||||
|
"अष्टाचत्वारिंशत्",
|
||||||
|
"एकोनपञ्चाशत्",
|
||||||
|
"पञ्चाशत्",
|
||||||
|
"एकपञ्चाशत्",
|
||||||
|
"द्विपञ्चाशत्",
|
||||||
|
"त्रिपञ्चाशत्",
|
||||||
|
"चतुःपञ्चाशत्",
|
||||||
|
"पञ्चपञ्चाशत्",
|
||||||
|
"षट्पञ्चाशत्",
|
||||||
|
"सप्तपञ्चाशत्",
|
||||||
|
"अष्टपञ्चाशत्",
|
||||||
|
"एकोनषष्ठिः",
|
||||||
|
"षष्ठिः",
|
||||||
|
"एकषष्ठिः",
|
||||||
|
"द्विषष्ठिः",
|
||||||
|
"त्रिषष्ठिः",
|
||||||
|
"चतुःषष्ठिः",
|
||||||
|
"पञ्चषष्ठिः",
|
||||||
|
"षट्षष्ठिः",
|
||||||
|
"सप्तषष्ठिः",
|
||||||
|
"अष्टषष्ठिः",
|
||||||
|
"एकोनसप्ततिः",
|
||||||
|
"सप्ततिः",
|
||||||
|
"एकसप्ततिः",
|
||||||
|
"द्विसप्ततिः",
|
||||||
|
"त्रिसप्ततिः",
|
||||||
|
"चतुःसप्ततिः",
|
||||||
|
"पञ्चसप्ततिः",
|
||||||
|
"षट्सप्ततिः",
|
||||||
|
"सप्तसप्ततिः",
|
||||||
|
"अष्टसप्ततिः",
|
||||||
|
"एकोनाशीतिः",
|
||||||
|
"अशीतिः",
|
||||||
|
"एकाशीतिः",
|
||||||
|
"द्वशीतिः",
|
||||||
|
"त्र्यशीतिः",
|
||||||
|
"चतुरशीतिः",
|
||||||
|
"पञ्चाशीतिः",
|
||||||
|
"षडशीतिः",
|
||||||
|
"सप्ताशीतिः",
|
||||||
|
"अष्टाशीतिः",
|
||||||
|
"एकोननवतिः",
|
||||||
|
"नवतिः",
|
||||||
|
"एकनवतिः",
|
||||||
|
"द्विनवतिः",
|
||||||
|
"त्रिनवतिः",
|
||||||
|
"चतुर्नवतिः",
|
||||||
|
"पञ्चनवतिः",
|
||||||
|
"षण्णवतिः",
|
||||||
|
"सप्तनवतिः",
|
||||||
|
"अष्टनवतिः",
|
||||||
|
"एकोनशतम्",
|
||||||
|
"शतम्",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
    """Tell whether *text* resembles a number.

    One leading "+", "-", "±" or "~" is ignored, and "," and "." are removed
    before checking. The remainder is number-like if it consists of digits
    only, is a "digits/digits" fraction, or appears in `_num_words`.
    """
    stripped = text
    if stripped.startswith(("+", "-", "±", "~")):
        stripped = stripped[1:]
    stripped = stripped.replace(",", "").replace(".", "")

    # Plain digit strings.
    if stripped.isdigit():
        return True

    # Fractions with exactly one slash and digits on each side.
    pieces = stripped.split("/")
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True

    # Spelled-out Sanskrit numerals.
    return stripped in _num_words
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
515
spacy/lang/sa/stop_words.py
Normal file
515
spacy/lang/sa/stop_words.py
Normal file
|
@ -0,0 +1,515 @@
|
||||||
|
# Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
अहम्
|
||||||
|
आवाम्
|
||||||
|
वयम्
|
||||||
|
माम् मा
|
||||||
|
आवाम्
|
||||||
|
अस्मान् नः
|
||||||
|
मया
|
||||||
|
आवाभ्याम्
|
||||||
|
अस्माभिस्
|
||||||
|
मह्यम् मे
|
||||||
|
आवाभ्याम् नौ
|
||||||
|
अस्मभ्यम् नः
|
||||||
|
मत्
|
||||||
|
आवाभ्याम्
|
||||||
|
अस्मत्
|
||||||
|
मम मे
|
||||||
|
आवयोः
|
||||||
|
अस्माकम् नः
|
||||||
|
मयि
|
||||||
|
आवयोः
|
||||||
|
अस्मासु
|
||||||
|
त्वम्
|
||||||
|
युवाम्
|
||||||
|
यूयम्
|
||||||
|
त्वाम् त्वा
|
||||||
|
युवाम् वाम्
|
||||||
|
युष्मान् वः
|
||||||
|
त्वया
|
||||||
|
युवाभ्याम्
|
||||||
|
युष्माभिः
|
||||||
|
तुभ्यम् ते
|
||||||
|
युवाभ्याम् वाम्
|
||||||
|
युष्मभ्यम् वः
|
||||||
|
त्वत्
|
||||||
|
युवाभ्याम्
|
||||||
|
युष्मत्
|
||||||
|
तव ते
|
||||||
|
युवयोः वाम्
|
||||||
|
युष्माकम् वः
|
||||||
|
त्वयि
|
||||||
|
युवयोः
|
||||||
|
युष्मासु
|
||||||
|
सः
|
||||||
|
तौ
|
||||||
|
ते
|
||||||
|
तम्
|
||||||
|
तौ
|
||||||
|
तान्
|
||||||
|
तेन
|
||||||
|
ताभ्याम्
|
||||||
|
तैः
|
||||||
|
तस्मै
|
||||||
|
ताभ्याम्
|
||||||
|
तेभ्यः
|
||||||
|
तस्मात्
|
||||||
|
ताभ्याम्
|
||||||
|
तेभ्यः
|
||||||
|
तस्य
|
||||||
|
तयोः
|
||||||
|
तेषाम्
|
||||||
|
तस्मिन्
|
||||||
|
तयोः
|
||||||
|
तेषु
|
||||||
|
सा
|
||||||
|
ते
|
||||||
|
ताः
|
||||||
|
ताम्
|
||||||
|
ते
|
||||||
|
ताः
|
||||||
|
तया
|
||||||
|
ताभ्याम्
|
||||||
|
ताभिः
|
||||||
|
तस्यै
|
||||||
|
ताभ्याम्
|
||||||
|
ताभ्यः
|
||||||
|
तस्याः
|
||||||
|
ताभ्याम्
|
||||||
|
ताभ्यः
|
||||||
|
तस्य
|
||||||
|
तयोः
|
||||||
|
तासाम्
|
||||||
|
तस्याम्
|
||||||
|
तयोः
|
||||||
|
तासु
|
||||||
|
तत्
|
||||||
|
ते
|
||||||
|
तानि
|
||||||
|
तत्
|
||||||
|
ते
|
||||||
|
तानि
|
||||||
|
तया
|
||||||
|
ताभ्याम्
|
||||||
|
ताभिः
|
||||||
|
तस्यै
|
||||||
|
ताभ्याम्
|
||||||
|
ताभ्यः
|
||||||
|
तस्याः
|
||||||
|
ताभ्याम्
|
||||||
|
ताभ्यः
|
||||||
|
तस्य
|
||||||
|
तयोः
|
||||||
|
तासाम्
|
||||||
|
तस्याम्
|
||||||
|
तयोः
|
||||||
|
तासु
|
||||||
|
अयम्
|
||||||
|
इमौ
|
||||||
|
इमे
|
||||||
|
इमम्
|
||||||
|
इमौ
|
||||||
|
इमान्
|
||||||
|
अनेन
|
||||||
|
आभ्याम्
|
||||||
|
एभिः
|
||||||
|
अस्मै
|
||||||
|
आभ्याम्
|
||||||
|
एभ्यः
|
||||||
|
अस्मात्
|
||||||
|
आभ्याम्
|
||||||
|
एभ्यः
|
||||||
|
अस्य
|
||||||
|
अनयोः
|
||||||
|
एषाम्
|
||||||
|
अस्मिन्
|
||||||
|
अनयोः
|
||||||
|
एषु
|
||||||
|
इयम्
|
||||||
|
इमे
|
||||||
|
इमाः
|
||||||
|
इमाम्
|
||||||
|
इमे
|
||||||
|
इमाः
|
||||||
|
अनया
|
||||||
|
आभ्याम्
|
||||||
|
आभिः
|
||||||
|
अस्यै
|
||||||
|
आभ्याम्
|
||||||
|
आभ्यः
|
||||||
|
अस्याः
|
||||||
|
आभ्याम्
|
||||||
|
आभ्यः
|
||||||
|
अस्याः
|
||||||
|
अनयोः
|
||||||
|
आसाम्
|
||||||
|
अस्याम्
|
||||||
|
अनयोः
|
||||||
|
आसु
|
||||||
|
इदम्
|
||||||
|
इमे
|
||||||
|
इमानि
|
||||||
|
इदम्
|
||||||
|
इमे
|
||||||
|
इमानि
|
||||||
|
अनेन
|
||||||
|
आभ्याम्
|
||||||
|
एभिः
|
||||||
|
अस्मै
|
||||||
|
आभ्याम्
|
||||||
|
एभ्यः
|
||||||
|
अस्मात्
|
||||||
|
आभ्याम्
|
||||||
|
एभ्यः
|
||||||
|
अस्य
|
||||||
|
अनयोः
|
||||||
|
एषाम्
|
||||||
|
अस्मिन्
|
||||||
|
अनयोः
|
||||||
|
एषु
|
||||||
|
एषः
|
||||||
|
एतौ
|
||||||
|
एते
|
||||||
|
एतम् एनम्
|
||||||
|
एतौ एनौ
|
||||||
|
एतान् एनान्
|
||||||
|
एतेन
|
||||||
|
एताभ्याम्
|
||||||
|
एतैः
|
||||||
|
एतस्मै
|
||||||
|
एताभ्याम्
|
||||||
|
एतेभ्यः
|
||||||
|
एतस्मात्
|
||||||
|
एताभ्याम्
|
||||||
|
एतेभ्यः
|
||||||
|
एतस्य
|
||||||
|
एतस्मिन्
|
||||||
|
एतेषाम्
|
||||||
|
एतस्मिन्
|
||||||
|
एतस्मिन्
|
||||||
|
एतेषु
|
||||||
|
एषा
|
||||||
|
एते
|
||||||
|
एताः
|
||||||
|
एताम् एनाम्
|
||||||
|
एते एने
|
||||||
|
एताः एनाः
|
||||||
|
एतया एनया
|
||||||
|
एताभ्याम्
|
||||||
|
एताभिः
|
||||||
|
एतस्यै
|
||||||
|
एताभ्याम्
|
||||||
|
एताभ्यः
|
||||||
|
एतस्याः
|
||||||
|
एताभ्याम्
|
||||||
|
एताभ्यः
|
||||||
|
एतस्याः
|
||||||
|
एतयोः एनयोः
|
||||||
|
एतासाम्
|
||||||
|
एतस्याम्
|
||||||
|
एतयोः एनयोः
|
||||||
|
एतासु
|
||||||
|
एतत् एतद्
|
||||||
|
एते
|
||||||
|
एतानि
|
||||||
|
एतत् एतद् एनत् एनद्
|
||||||
|
एते एने
|
||||||
|
एतानि एनानि
|
||||||
|
एतेन एनेन
|
||||||
|
एताभ्याम्
|
||||||
|
एतैः
|
||||||
|
एतस्मै
|
||||||
|
एताभ्याम्
|
||||||
|
एतेभ्यः
|
||||||
|
एतस्मात्
|
||||||
|
एताभ्याम्
|
||||||
|
एतेभ्यः
|
||||||
|
एतस्य
|
||||||
|
एतयोः एनयोः
|
||||||
|
एतेषाम्
|
||||||
|
एतस्मिन्
|
||||||
|
एतयोः एनयोः
|
||||||
|
एतेषु
|
||||||
|
असौ
|
||||||
|
अमू
|
||||||
|
अमी
|
||||||
|
अमूम्
|
||||||
|
अमू
|
||||||
|
अमून्
|
||||||
|
अमुना
|
||||||
|
अमूभ्याम्
|
||||||
|
अमीभिः
|
||||||
|
अमुष्मै
|
||||||
|
अमूभ्याम्
|
||||||
|
अमीभ्यः
|
||||||
|
अमुष्मात्
|
||||||
|
अमूभ्याम्
|
||||||
|
अमीभ्यः
|
||||||
|
अमुष्य
|
||||||
|
अमुयोः
|
||||||
|
अमीषाम्
|
||||||
|
अमुष्मिन्
|
||||||
|
अमुयोः
|
||||||
|
अमीषु
|
||||||
|
असौ
|
||||||
|
अमू
|
||||||
|
अमूः
|
||||||
|
अमूम्
|
||||||
|
अमू
|
||||||
|
अमूः
|
||||||
|
अमुया
|
||||||
|
अमूभ्याम्
|
||||||
|
अमूभिः
|
||||||
|
अमुष्यै
|
||||||
|
अमूभ्याम्
|
||||||
|
अमूभ्यः
|
||||||
|
अमुष्याः
|
||||||
|
अमूभ्याम्
|
||||||
|
अमूभ्यः
|
||||||
|
अमुष्याः
|
||||||
|
अमुयोः
|
||||||
|
अमूषाम्
|
||||||
|
अमुष्याम्
|
||||||
|
अमुयोः
|
||||||
|
अमूषु
|
||||||
|
अमु
|
||||||
|
अमुनी
|
||||||
|
अमूनि
|
||||||
|
अमु
|
||||||
|
अमुनी
|
||||||
|
अमूनि
|
||||||
|
अमुना
|
||||||
|
अमूभ्याम्
|
||||||
|
अमीभिः
|
||||||
|
अमुष्मै
|
||||||
|
अमूभ्याम्
|
||||||
|
अमीभ्यः
|
||||||
|
अमुष्मात्
|
||||||
|
अमूभ्याम्
|
||||||
|
अमीभ्यः
|
||||||
|
अमुष्य
|
||||||
|
अमुयोः
|
||||||
|
अमीषाम्
|
||||||
|
अमुष्मिन्
|
||||||
|
अमुयोः
|
||||||
|
अमीषु
|
||||||
|
कः
|
||||||
|
कौ
|
||||||
|
के
|
||||||
|
कम्
|
||||||
|
कौ
|
||||||
|
कान्
|
||||||
|
केन
|
||||||
|
काभ्याम्
|
||||||
|
कैः
|
||||||
|
कस्मै
|
||||||
|
काभ्याम्
|
||||||
|
केभ्य
|
||||||
|
कस्मात्
|
||||||
|
काभ्याम्
|
||||||
|
केभ्य
|
||||||
|
कस्य
|
||||||
|
कयोः
|
||||||
|
केषाम्
|
||||||
|
कस्मिन्
|
||||||
|
कयोः
|
||||||
|
केषु
|
||||||
|
का
|
||||||
|
के
|
||||||
|
काः
|
||||||
|
काम्
|
||||||
|
के
|
||||||
|
काः
|
||||||
|
कया
|
||||||
|
काभ्याम्
|
||||||
|
काभिः
|
||||||
|
कस्यै
|
||||||
|
काभ्याम्
|
||||||
|
काभ्यः
|
||||||
|
कस्याः
|
||||||
|
काभ्याम्
|
||||||
|
काभ्यः
|
||||||
|
कस्याः
|
||||||
|
कयोः
|
||||||
|
कासाम्
|
||||||
|
कस्याम्
|
||||||
|
कयोः
|
||||||
|
कासु
|
||||||
|
किम्
|
||||||
|
के
|
||||||
|
कानि
|
||||||
|
किम्
|
||||||
|
के
|
||||||
|
कानि
|
||||||
|
केन
|
||||||
|
काभ्याम्
|
||||||
|
कैः
|
||||||
|
कस्मै
|
||||||
|
काभ्याम्
|
||||||
|
केभ्य
|
||||||
|
कस्मात्
|
||||||
|
काभ्याम्
|
||||||
|
केभ्य
|
||||||
|
कस्य
|
||||||
|
कयोः
|
||||||
|
केषाम्
|
||||||
|
कस्मिन्
|
||||||
|
कयोः
|
||||||
|
केषु
|
||||||
|
भवान्
|
||||||
|
भवन्तौ
|
||||||
|
भवन्तः
|
||||||
|
भवन्तम्
|
||||||
|
भवन्तौ
|
||||||
|
भवतः
|
||||||
|
भवता
|
||||||
|
भवद्भ्याम्
|
||||||
|
भवद्भिः
|
||||||
|
भवते
|
||||||
|
भवद्भ्याम्
|
||||||
|
भवद्भ्यः
|
||||||
|
भवतः
|
||||||
|
भवद्भ्याम्
|
||||||
|
भवद्भ्यः
|
||||||
|
भवतः
|
||||||
|
भवतोः
|
||||||
|
भवताम्
|
||||||
|
भवति
|
||||||
|
भवतोः
|
||||||
|
भवत्सु
|
||||||
|
भवती
|
||||||
|
भवत्यौ
|
||||||
|
भवत्यः
|
||||||
|
भवतीम्
|
||||||
|
भवत्यौ
|
||||||
|
भवतीः
|
||||||
|
भवत्या
|
||||||
|
भवतीभ्याम्
|
||||||
|
भवतीभिः
|
||||||
|
भवत्यै
|
||||||
|
भवतीभ्याम्
|
||||||
|
भवतीभिः
|
||||||
|
भवत्याः
|
||||||
|
भवतीभ्याम्
|
||||||
|
भवतीभिः
|
||||||
|
भवत्याः
|
||||||
|
भवत्योः
|
||||||
|
भवतीनाम्
|
||||||
|
भवत्याम्
|
||||||
|
भवत्योः
|
||||||
|
भवतीषु
|
||||||
|
भवत्
|
||||||
|
भवती
|
||||||
|
भवन्ति
|
||||||
|
भवत्
|
||||||
|
भवती
|
||||||
|
भवन्ति
|
||||||
|
भवता
|
||||||
|
भवद्भ्याम्
|
||||||
|
भवद्भिः
|
||||||
|
भवते
|
||||||
|
भवद्भ्याम्
|
||||||
|
भवद्भ्यः
|
||||||
|
भवतः
|
||||||
|
भवद्भ्याम्
|
||||||
|
भवद्भ्यः
|
||||||
|
भवतः
|
||||||
|
भवतोः
|
||||||
|
भवताम्
|
||||||
|
भवति
|
||||||
|
भवतोः
|
||||||
|
भवत्सु
|
||||||
|
अये
|
||||||
|
अरे
|
||||||
|
अरेरे
|
||||||
|
अविधा
|
||||||
|
असाधुना
|
||||||
|
अस्तोभ
|
||||||
|
अहह
|
||||||
|
अहावस्
|
||||||
|
आम्
|
||||||
|
आर्यहलम्
|
||||||
|
आह
|
||||||
|
आहो
|
||||||
|
इस्
|
||||||
|
उम्
|
||||||
|
उवे
|
||||||
|
काम्
|
||||||
|
कुम्
|
||||||
|
चमत्
|
||||||
|
टसत्
|
||||||
|
दृन्
|
||||||
|
धिक्
|
||||||
|
पाट्
|
||||||
|
फत्
|
||||||
|
फाट्
|
||||||
|
फुडुत्
|
||||||
|
बत
|
||||||
|
बाल्
|
||||||
|
वट्
|
||||||
|
व्यवस्तोभति व्यवस्तुभ्
|
||||||
|
षाट्
|
||||||
|
स्तोभ
|
||||||
|
हुम्मा
|
||||||
|
हूम्
|
||||||
|
अति
|
||||||
|
अधि
|
||||||
|
अनु
|
||||||
|
अप
|
||||||
|
अपि
|
||||||
|
अभि
|
||||||
|
अव
|
||||||
|
आ
|
||||||
|
उद्
|
||||||
|
उप
|
||||||
|
नि
|
||||||
|
निर्
|
||||||
|
परा
|
||||||
|
परि
|
||||||
|
प्र
|
||||||
|
प्रति
|
||||||
|
वि
|
||||||
|
सम्
|
||||||
|
अथवा उत
|
||||||
|
अन्यथा
|
||||||
|
इव
|
||||||
|
च
|
||||||
|
चेत् यदि
|
||||||
|
तु परन्तु
|
||||||
|
यतः करणेन हि यतस् यदर्थम् यदर्थे यर्हि यथा यत्कारणम् येन ही हिन
|
||||||
|
यथा यतस्
|
||||||
|
यद्यपि
|
||||||
|
यात् अवधेस् यावति
|
||||||
|
येन प्रकारेण
|
||||||
|
स्थाने
|
||||||
|
अह
|
||||||
|
एव
|
||||||
|
एवम्
|
||||||
|
कच्चित्
|
||||||
|
कु
|
||||||
|
कुवित्
|
||||||
|
कूपत्
|
||||||
|
च
|
||||||
|
चण्
|
||||||
|
चेत्
|
||||||
|
तत्र
|
||||||
|
नकिम्
|
||||||
|
नह
|
||||||
|
नुनम्
|
||||||
|
नेत्
|
||||||
|
भूयस्
|
||||||
|
मकिम्
|
||||||
|
मकिर्
|
||||||
|
यत्र
|
||||||
|
युगपत्
|
||||||
|
वा
|
||||||
|
शश्वत्
|
||||||
|
सूपत्
|
||||||
|
ह
|
||||||
|
हन्त
|
||||||
|
हि
|
||||||
|
""".split()
|
||||||
|
)
|
|
@ -34,13 +34,13 @@ URL_PATTERN = (
|
||||||
r"|"
|
r"|"
|
||||||
# host & domain names
|
# host & domain names
|
||||||
# mods: match is case-sensitive, so include [A-Z]
|
# mods: match is case-sensitive, so include [A-Z]
|
||||||
"(?:" # noqa: E131
|
r"(?:" # noqa: E131
|
||||||
"(?:"
|
r"(?:"
|
||||||
"[A-Za-z0-9\u00a1-\uffff]"
|
r"[A-Za-z0-9\u00a1-\uffff]"
|
||||||
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
||||||
")?"
|
r")?"
|
||||||
"[A-Za-z0-9\u00a1-\uffff]\."
|
r"[A-Za-z0-9\u00a1-\uffff]\."
|
||||||
")+"
|
r")+"
|
||||||
# TLD identifier
|
# TLD identifier
|
||||||
# mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
|
# mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
|
||||||
# strings like "lower.Upper", which can be split on "." by infixes in some
|
# strings like "lower.Upper", which can be split on "." by infixes in some
|
||||||
|
@ -128,6 +128,8 @@ emoticons = set(
|
||||||
:-]
|
:-]
|
||||||
[:
|
[:
|
||||||
[-:
|
[-:
|
||||||
|
[=
|
||||||
|
=]
|
||||||
:o)
|
:o)
|
||||||
(o:
|
(o:
|
||||||
:}
|
:}
|
||||||
|
@ -159,6 +161,8 @@ emoticons = set(
|
||||||
=|
|
=|
|
||||||
:|
|
:|
|
||||||
:-|
|
:-|
|
||||||
|
]=
|
||||||
|
=[
|
||||||
:1
|
:1
|
||||||
:P
|
:P
|
||||||
:-P
|
:-P
|
||||||
|
|
|
@ -3,7 +3,6 @@ from typing import Tuple, Iterator
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
import weakref
|
|
||||||
import functools
|
import functools
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
@ -95,7 +94,7 @@ class Language:
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (str): Two-letter language ID, i.e. ISO code.
|
lang (str): Two-letter language ID, i.e. ISO code.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://nightly.spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Defaults = BaseDefaults
|
Defaults = BaseDefaults
|
||||||
|
@ -130,7 +129,7 @@ class Language:
|
||||||
create_tokenizer (Callable): Function that takes the nlp object and
|
create_tokenizer (Callable): Function that takes the nlp object and
|
||||||
returns a tokenizer.
|
returns a tokenizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#init
|
DOCS: https://nightly.spacy.io/api/language#init
|
||||||
"""
|
"""
|
||||||
# We're only calling this to import all factories provided via entry
|
# We're only calling this to import all factories provided via entry
|
||||||
# points. The factory decorator applied to these functions takes care
|
# points. The factory decorator applied to these functions takes care
|
||||||
|
@ -185,14 +184,14 @@ class Language:
|
||||||
|
|
||||||
RETURNS (Dict[str, Any]): The meta.
|
RETURNS (Dict[str, Any]): The meta.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#meta
|
DOCS: https://nightly.spacy.io/api/language#meta
|
||||||
"""
|
"""
|
||||||
spacy_version = util.get_model_version_range(about.__version__)
|
spacy_version = util.get_model_version_range(about.__version__)
|
||||||
if self.vocab.lang:
|
if self.vocab.lang:
|
||||||
self._meta.setdefault("lang", self.vocab.lang)
|
self._meta.setdefault("lang", self.vocab.lang)
|
||||||
else:
|
else:
|
||||||
self._meta.setdefault("lang", self.lang)
|
self._meta.setdefault("lang", self.lang)
|
||||||
self._meta.setdefault("name", "model")
|
self._meta.setdefault("name", "pipeline")
|
||||||
self._meta.setdefault("version", "0.0.0")
|
self._meta.setdefault("version", "0.0.0")
|
||||||
self._meta.setdefault("spacy_version", spacy_version)
|
self._meta.setdefault("spacy_version", spacy_version)
|
||||||
self._meta.setdefault("description", "")
|
self._meta.setdefault("description", "")
|
||||||
|
@ -211,6 +210,7 @@ class Language:
|
||||||
# TODO: Adding this back to prevent breaking people's code etc., but
|
# TODO: Adding this back to prevent breaking people's code etc., but
|
||||||
# we should consider removing it
|
# we should consider removing it
|
||||||
self._meta["pipeline"] = list(self.pipe_names)
|
self._meta["pipeline"] = list(self.pipe_names)
|
||||||
|
self._meta["components"] = list(self.component_names)
|
||||||
self._meta["disabled"] = list(self.disabled)
|
self._meta["disabled"] = list(self.disabled)
|
||||||
return self._meta
|
return self._meta
|
||||||
|
|
||||||
|
@ -225,7 +225,7 @@ class Language:
|
||||||
|
|
||||||
RETURNS (thinc.api.Config): The config.
|
RETURNS (thinc.api.Config): The config.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#config
|
DOCS: https://nightly.spacy.io/api/language#config
|
||||||
"""
|
"""
|
||||||
self._config.setdefault("nlp", {})
|
self._config.setdefault("nlp", {})
|
||||||
self._config.setdefault("training", {})
|
self._config.setdefault("training", {})
|
||||||
|
@ -433,7 +433,7 @@ class Language:
|
||||||
will be combined and normalized for the whole pipeline.
|
will be combined and normalized for the whole pipeline.
|
||||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#factory
|
DOCS: https://nightly.spacy.io/api/language#factory
|
||||||
"""
|
"""
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise ValueError(Errors.E963.format(decorator="factory"))
|
raise ValueError(Errors.E963.format(decorator="factory"))
|
||||||
|
@ -513,7 +513,7 @@ class Language:
|
||||||
Used for pipeline analysis.
|
Used for pipeline analysis.
|
||||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#component
|
DOCS: https://nightly.spacy.io/api/language#component
|
||||||
"""
|
"""
|
||||||
if name is not None and not isinstance(name, str):
|
if name is not None and not isinstance(name, str):
|
||||||
raise ValueError(Errors.E963.format(decorator="component"))
|
raise ValueError(Errors.E963.format(decorator="component"))
|
||||||
|
@ -579,7 +579,7 @@ class Language:
|
||||||
name (str): Name of pipeline component to get.
|
name (str): Name of pipeline component to get.
|
||||||
RETURNS (callable): The pipeline component.
|
RETURNS (callable): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#get_pipe
|
DOCS: https://nightly.spacy.io/api/language#get_pipe
|
||||||
"""
|
"""
|
||||||
for pipe_name, component in self._components:
|
for pipe_name, component in self._components:
|
||||||
if pipe_name == name:
|
if pipe_name == name:
|
||||||
|
@ -608,7 +608,7 @@ class Language:
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#create_pipe
|
DOCS: https://nightly.spacy.io/api/language#create_pipe
|
||||||
"""
|
"""
|
||||||
name = name if name is not None else factory_name
|
name = name if name is not None else factory_name
|
||||||
if not isinstance(config, dict):
|
if not isinstance(config, dict):
|
||||||
|
@ -722,7 +722,7 @@ class Language:
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#add_pipe
|
DOCS: https://nightly.spacy.io/api/language#add_pipe
|
||||||
"""
|
"""
|
||||||
if not isinstance(factory_name, str):
|
if not isinstance(factory_name, str):
|
||||||
bad_val = repr(factory_name)
|
bad_val = repr(factory_name)
|
||||||
|
@ -820,7 +820,7 @@ class Language:
|
||||||
name (str): Name of the component.
|
name (str): Name of the component.
|
||||||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#has_pipe
|
DOCS: https://nightly.spacy.io/api/language#has_pipe
|
||||||
"""
|
"""
|
||||||
return name in self.pipe_names
|
return name in self.pipe_names
|
||||||
|
|
||||||
|
@ -841,7 +841,7 @@ class Language:
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#replace_pipe
|
DOCS: https://nightly.spacy.io/api/language#replace_pipe
|
||||||
"""
|
"""
|
||||||
if name not in self.pipe_names:
|
if name not in self.pipe_names:
|
||||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||||
|
@ -870,7 +870,7 @@ class Language:
|
||||||
old_name (str): Name of the component to rename.
|
old_name (str): Name of the component to rename.
|
||||||
new_name (str): New name of the component.
|
new_name (str): New name of the component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#rename_pipe
|
DOCS: https://nightly.spacy.io/api/language#rename_pipe
|
||||||
"""
|
"""
|
||||||
if old_name not in self.component_names:
|
if old_name not in self.component_names:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -891,7 +891,7 @@ class Language:
|
||||||
name (str): Name of the component to remove.
|
name (str): Name of the component to remove.
|
||||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#remove_pipe
|
DOCS: https://nightly.spacy.io/api/language#remove_pipe
|
||||||
"""
|
"""
|
||||||
if name not in self.component_names:
|
if name not in self.component_names:
|
||||||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||||||
|
@ -944,7 +944,7 @@ class Language:
|
||||||
keyword arguments for specific components.
|
keyword arguments for specific components.
|
||||||
RETURNS (Doc): A container for accessing the annotations.
|
RETURNS (Doc): A container for accessing the annotations.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#call
|
DOCS: https://nightly.spacy.io/api/language#call
|
||||||
"""
|
"""
|
||||||
if len(text) > self.max_length:
|
if len(text) > self.max_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -993,7 +993,7 @@ class Language:
|
||||||
disable (str or iterable): The name(s) of the pipes to disable
|
disable (str or iterable): The name(s) of the pipes to disable
|
||||||
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
|
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#select_pipes
|
DOCS: https://nightly.spacy.io/api/language#select_pipes
|
||||||
"""
|
"""
|
||||||
if enable is None and disable is None:
|
if enable is None and disable is None:
|
||||||
raise ValueError(Errors.E991)
|
raise ValueError(Errors.E991)
|
||||||
|
@ -1044,7 +1044,7 @@ class Language:
|
||||||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary
|
RETURNS (Dict[str, float]): The updated losses dictionary
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#update
|
DOCS: https://nightly.spacy.io/api/language#update
|
||||||
"""
|
"""
|
||||||
if _ is not None:
|
if _ is not None:
|
||||||
raise ValueError(Errors.E989)
|
raise ValueError(Errors.E989)
|
||||||
|
@ -1106,7 +1106,7 @@ class Language:
|
||||||
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
|
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
|
||||||
>>> nlp.rehearse(raw_batch)
|
>>> nlp.rehearse(raw_batch)
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#rehearse
|
DOCS: https://nightly.spacy.io/api/language#rehearse
|
||||||
"""
|
"""
|
||||||
if len(examples) == 0:
|
if len(examples) == 0:
|
||||||
return
|
return
|
||||||
|
@ -1153,7 +1153,7 @@ class Language:
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#begin_training
|
DOCS: https://nightly.spacy.io/api/language#begin_training
|
||||||
"""
|
"""
|
||||||
# TODO: throw warning when get_gold_tuples is provided instead of get_examples
|
# TODO: throw warning when get_gold_tuples is provided instead of get_examples
|
||||||
if get_examples is None:
|
if get_examples is None:
|
||||||
|
@ -1200,7 +1200,7 @@ class Language:
|
||||||
sgd (Optional[Optimizer]): An optimizer.
|
sgd (Optional[Optimizer]): An optimizer.
|
||||||
RETURNS (Optimizer): The optimizer.
|
RETURNS (Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#resume_training
|
DOCS: https://nightly.spacy.io/api/language#resume_training
|
||||||
"""
|
"""
|
||||||
if device >= 0: # TODO: do we need this here?
|
if device >= 0: # TODO: do we need this here?
|
||||||
require_gpu(device)
|
require_gpu(device)
|
||||||
|
@ -1236,7 +1236,7 @@ class Language:
|
||||||
for the scorer.
|
for the scorer.
|
||||||
RETURNS (Scorer): The scorer containing the evaluation results.
|
RETURNS (Scorer): The scorer containing the evaluation results.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#evaluate
|
DOCS: https://nightly.spacy.io/api/language#evaluate
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Language.evaluate")
|
validate_examples(examples, "Language.evaluate")
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
|
@ -1275,7 +1275,7 @@ class Language:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_params(self, params: dict):
|
def use_params(self, params: Optional[dict]):
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
"""Replace weights of models in the pipeline with those provided in the
|
||||||
params dictionary. Can be used as a contextmanager, in which case,
|
params dictionary. Can be used as a contextmanager, in which case,
|
||||||
models go back to their original weights after the block.
|
models go back to their original weights after the block.
|
||||||
|
@ -1286,8 +1286,11 @@ class Language:
|
||||||
>>> with nlp.use_params(optimizer.averages):
|
>>> with nlp.use_params(optimizer.averages):
|
||||||
>>> nlp.to_disk("/tmp/checkpoint")
|
>>> nlp.to_disk("/tmp/checkpoint")
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#use_params
|
DOCS: https://nightly.spacy.io/api/language#use_params
|
||||||
"""
|
"""
|
||||||
|
if not params:
|
||||||
|
yield
|
||||||
|
else:
|
||||||
contexts = [
|
contexts = [
|
||||||
pipe.use_params(params)
|
pipe.use_params(params)
|
||||||
for name, pipe in self.pipeline
|
for name, pipe in self.pipeline
|
||||||
|
@ -1330,7 +1333,7 @@ class Language:
|
||||||
n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
|
n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
|
||||||
YIELDS (Doc): Documents in the order of the original text.
|
YIELDS (Doc): Documents in the order of the original text.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#pipe
|
DOCS: https://nightly.spacy.io/api/language#pipe
|
||||||
"""
|
"""
|
||||||
if n_process == -1:
|
if n_process == -1:
|
||||||
n_process = mp.cpu_count()
|
n_process = mp.cpu_count()
|
||||||
|
@ -1374,8 +1377,6 @@ class Language:
|
||||||
docs = (self.make_doc(text) for text in texts)
|
docs = (self.make_doc(text) for text in texts)
|
||||||
for pipe in pipes:
|
for pipe in pipes:
|
||||||
docs = pipe(docs)
|
docs = pipe(docs)
|
||||||
|
|
||||||
nr_seen = 0
|
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
|
@ -1466,7 +1467,7 @@ class Language:
|
||||||
the types expected by the factory.
|
the types expected by the factory.
|
||||||
RETURNS (Language): The initialized Language class.
|
RETURNS (Language): The initialized Language class.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#from_config
|
DOCS: https://nightly.spacy.io/api/language#from_config
|
||||||
"""
|
"""
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
config = Config(
|
config = Config(
|
||||||
|
@ -1579,7 +1580,7 @@ class Language:
|
||||||
it doesn't exist.
|
it doesn't exist.
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#to_disk
|
DOCS: https://nightly.spacy.io/api/language#to_disk
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
serializers = {}
|
serializers = {}
|
||||||
|
@ -1608,7 +1609,7 @@ class Language:
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (Language): The modified `Language` object.
|
RETURNS (Language): The modified `Language` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#from_disk
|
DOCS: https://nightly.spacy.io/api/language#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def deserialize_meta(path: Path) -> None:
|
def deserialize_meta(path: Path) -> None:
|
||||||
|
@ -1656,7 +1657,7 @@ class Language:
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized form of the `Language` object.
|
RETURNS (bytes): The serialized form of the `Language` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#to_bytes
|
DOCS: https://nightly.spacy.io/api/language#to_bytes
|
||||||
"""
|
"""
|
||||||
serializers = {}
|
serializers = {}
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
|
@ -1680,7 +1681,7 @@ class Language:
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (Language): The `Language` object.
|
RETURNS (Language): The `Language` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#from_bytes
|
DOCS: https://nightly.spacy.io/api/language#from_bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def deserialize_meta(b):
|
def deserialize_meta(b):
|
||||||
|
|
|
@ -30,7 +30,7 @@ cdef class Lexeme:
|
||||||
tag, dependency parse, or lemma (lemmatization depends on the
|
tag, dependency parse, or lemma (lemmatization depends on the
|
||||||
part-of-speech tag).
|
part-of-speech tag).
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lexeme
|
DOCS: https://nightly.spacy.io/api/lexeme
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, attr_t orth):
|
def __init__(self, Vocab vocab, attr_t orth):
|
||||||
"""Create a Lexeme object.
|
"""Create a Lexeme object.
|
||||||
|
|
|
@ -57,7 +57,7 @@ class Table(OrderedDict):
|
||||||
data (dict): The dictionary.
|
data (dict): The dictionary.
|
||||||
name (str): Optional table name for reference.
|
name (str): Optional table name for reference.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.from_dict
|
DOCS: https://nightly.spacy.io/api/lookups#table.from_dict
|
||||||
"""
|
"""
|
||||||
self = cls(name=name)
|
self = cls(name=name)
|
||||||
self.update(data)
|
self.update(data)
|
||||||
|
@ -69,7 +69,7 @@ class Table(OrderedDict):
|
||||||
name (str): Optional table name for reference.
|
name (str): Optional table name for reference.
|
||||||
data (dict): Initial data, used to hint Bloom Filter.
|
data (dict): Initial data, used to hint Bloom Filter.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.init
|
DOCS: https://nightly.spacy.io/api/lookups#table.init
|
||||||
"""
|
"""
|
||||||
OrderedDict.__init__(self)
|
OrderedDict.__init__(self)
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -135,7 +135,7 @@ class Table(OrderedDict):
|
||||||
|
|
||||||
RETURNS (bytes): The serialized table.
|
RETURNS (bytes): The serialized table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.to_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#table.to_bytes
|
||||||
"""
|
"""
|
||||||
data = {
|
data = {
|
||||||
"name": self.name,
|
"name": self.name,
|
||||||
|
@ -150,7 +150,7 @@ class Table(OrderedDict):
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
RETURNS (Table): The loaded table.
|
RETURNS (Table): The loaded table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.from_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#table.from_bytes
|
||||||
"""
|
"""
|
||||||
loaded = srsly.msgpack_loads(bytes_data)
|
loaded = srsly.msgpack_loads(bytes_data)
|
||||||
data = loaded.get("dict", {})
|
data = loaded.get("dict", {})
|
||||||
|
@ -172,7 +172,7 @@ class Lookups:
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize the Lookups object.
|
"""Initialize the Lookups object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#init
|
DOCS: https://nightly.spacy.io/api/lookups#init
|
||||||
"""
|
"""
|
||||||
self._tables = {}
|
self._tables = {}
|
||||||
|
|
||||||
|
@ -201,7 +201,7 @@ class Lookups:
|
||||||
data (dict): Optional data to add to the table.
|
data (dict): Optional data to add to the table.
|
||||||
RETURNS (Table): The newly added table.
|
RETURNS (Table): The newly added table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#add_table
|
DOCS: https://nightly.spacy.io/api/lookups#add_table
|
||||||
"""
|
"""
|
||||||
if name in self.tables:
|
if name in self.tables:
|
||||||
raise ValueError(Errors.E158.format(name=name))
|
raise ValueError(Errors.E158.format(name=name))
|
||||||
|
@ -215,7 +215,7 @@ class Lookups:
|
||||||
name (str): Name of the table to set.
|
name (str): Name of the table to set.
|
||||||
table (Table): The Table to set.
|
table (Table): The Table to set.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#set_table
|
DOCS: https://nightly.spacy.io/api/lookups#set_table
|
||||||
"""
|
"""
|
||||||
self._tables[name] = table
|
self._tables[name] = table
|
||||||
|
|
||||||
|
@ -227,7 +227,7 @@ class Lookups:
|
||||||
default (Any): Optional default value to return if table doesn't exist.
|
default (Any): Optional default value to return if table doesn't exist.
|
||||||
RETURNS (Table): The table.
|
RETURNS (Table): The table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#get_table
|
DOCS: https://nightly.spacy.io/api/lookups#get_table
|
||||||
"""
|
"""
|
||||||
if name not in self._tables:
|
if name not in self._tables:
|
||||||
if default == UNSET:
|
if default == UNSET:
|
||||||
|
@ -241,7 +241,7 @@ class Lookups:
|
||||||
name (str): Name of the table to remove.
|
name (str): Name of the table to remove.
|
||||||
RETURNS (Table): The removed table.
|
RETURNS (Table): The removed table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#remove_table
|
DOCS: https://nightly.spacy.io/api/lookups#remove_table
|
||||||
"""
|
"""
|
||||||
if name not in self._tables:
|
if name not in self._tables:
|
||||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||||
|
@ -253,7 +253,7 @@ class Lookups:
|
||||||
name (str): Name of the table.
|
name (str): Name of the table.
|
||||||
RETURNS (bool): Whether a table of that name exists.
|
RETURNS (bool): Whether a table of that name exists.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#has_table
|
DOCS: https://nightly.spacy.io/api/lookups#has_table
|
||||||
"""
|
"""
|
||||||
return name in self._tables
|
return name in self._tables
|
||||||
|
|
||||||
|
@ -262,7 +262,7 @@ class Lookups:
|
||||||
|
|
||||||
RETURNS (bytes): The serialized Lookups.
|
RETURNS (bytes): The serialized Lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#to_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#to_bytes
|
||||||
"""
|
"""
|
||||||
return srsly.msgpack_dumps(self._tables)
|
return srsly.msgpack_dumps(self._tables)
|
||||||
|
|
||||||
|
@ -272,7 +272,7 @@ class Lookups:
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
RETURNS (Lookups): The loaded Lookups.
|
RETURNS (Lookups): The loaded Lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#from_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#from_bytes
|
||||||
"""
|
"""
|
||||||
self._tables = {}
|
self._tables = {}
|
||||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||||
|
@ -287,7 +287,7 @@ class Lookups:
|
||||||
|
|
||||||
path (str / Path): The file path.
|
path (str / Path): The file path.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#to_disk
|
DOCS: https://nightly.spacy.io/api/lookups#to_disk
|
||||||
"""
|
"""
|
||||||
if len(self._tables):
|
if len(self._tables):
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
|
@ -306,7 +306,7 @@ class Lookups:
|
||||||
path (str / Path): The directory path.
|
path (str / Path): The directory path.
|
||||||
RETURNS (Lookups): The loaded lookups.
|
RETURNS (Lookups): The loaded lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#from_disk
|
DOCS: https://nightly.spacy.io/api/lookups#from_disk
|
||||||
"""
|
"""
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
filepath = path / filename
|
filepath = path / filename
|
||||||
|
|
|
@ -1,16 +1,16 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from cymem.cymem cimport Pool
|
from typing import List
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from libcpp cimport bool
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .matcher cimport Matcher
|
from .matcher cimport Matcher
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from .matcher import unpickle_matcher
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..tokens import Span
|
||||||
|
|
||||||
|
|
||||||
DELIMITER = "||"
|
DELIMITER = "||"
|
||||||
|
@ -22,36 +22,52 @@ cdef class DependencyMatcher:
|
||||||
"""Match dependency parse tree based on pattern rules."""
|
"""Match dependency parse tree based on pattern rules."""
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
cdef readonly Matcher token_matcher
|
cdef readonly Matcher matcher
|
||||||
cdef public object _patterns
|
cdef public object _patterns
|
||||||
|
cdef public object _raw_patterns
|
||||||
cdef public object _keys_to_token
|
cdef public object _keys_to_token
|
||||||
cdef public object _root
|
cdef public object _root
|
||||||
cdef public object _entities
|
|
||||||
cdef public object _callbacks
|
cdef public object _callbacks
|
||||||
cdef public object _nodes
|
cdef public object _nodes
|
||||||
cdef public object _tree
|
cdef public object _tree
|
||||||
|
cdef public object _ops
|
||||||
|
|
||||||
def __init__(self, vocab):
|
def __init__(self, vocab, *, validate=False):
|
||||||
"""Create the DependencyMatcher.
|
"""Create the DependencyMatcher.
|
||||||
|
|
||||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||||
documents the matcher will operate on.
|
documents the matcher will operate on.
|
||||||
|
validate (bool): Whether patterns should be validated, passed to
|
||||||
|
Matcher as `validate`
|
||||||
"""
|
"""
|
||||||
size = 20
|
size = 20
|
||||||
# TODO: make matcher work with validation
|
self.matcher = Matcher(vocab, validate=validate)
|
||||||
self.token_matcher = Matcher(vocab, validate=False)
|
|
||||||
self._keys_to_token = {}
|
self._keys_to_token = {}
|
||||||
self._patterns = {}
|
self._patterns = {}
|
||||||
|
self._raw_patterns = {}
|
||||||
self._root = {}
|
self._root = {}
|
||||||
self._nodes = {}
|
self._nodes = {}
|
||||||
self._tree = {}
|
self._tree = {}
|
||||||
self._entities = {}
|
|
||||||
self._callbacks = {}
|
self._callbacks = {}
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
self._ops = {
|
||||||
|
"<": self.dep,
|
||||||
|
">": self.gov,
|
||||||
|
"<<": self.dep_chain,
|
||||||
|
">>": self.gov_chain,
|
||||||
|
".": self.imm_precede,
|
||||||
|
".*": self.precede,
|
||||||
|
";": self.imm_follow,
|
||||||
|
";*": self.follow,
|
||||||
|
"$+": self.imm_right_sib,
|
||||||
|
"$-": self.imm_left_sib,
|
||||||
|
"$++": self.right_sib,
|
||||||
|
"$--": self.left_sib,
|
||||||
|
}
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
data = (self.vocab, self._patterns,self._tree, self._callbacks)
|
data = (self.vocab, self._raw_patterns, self._callbacks)
|
||||||
return (unpickle_matcher, data, None, None)
|
return (unpickle_matcher, data, None, None)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
@ -74,54 +90,61 @@ cdef class DependencyMatcher:
|
||||||
idx = 0
|
idx = 0
|
||||||
visited_nodes = {}
|
visited_nodes = {}
|
||||||
for relation in pattern:
|
for relation in pattern:
|
||||||
if "PATTERN" not in relation or "SPEC" not in relation:
|
if not isinstance(relation, dict):
|
||||||
|
raise ValueError(Errors.E1008)
|
||||||
|
if "RIGHT_ATTRS" not in relation and "RIGHT_ID" not in relation:
|
||||||
raise ValueError(Errors.E098.format(key=key))
|
raise ValueError(Errors.E098.format(key=key))
|
||||||
if idx == 0:
|
if idx == 0:
|
||||||
if not(
|
if not(
|
||||||
"NODE_NAME" in relation["SPEC"]
|
"RIGHT_ID" in relation
|
||||||
and "NBOR_RELOP" not in relation["SPEC"]
|
and "REL_OP" not in relation
|
||||||
and "NBOR_NAME" not in relation["SPEC"]
|
and "LEFT_ID" not in relation
|
||||||
):
|
):
|
||||||
raise ValueError(Errors.E099.format(key=key))
|
raise ValueError(Errors.E099.format(key=key))
|
||||||
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
|
visited_nodes[relation["RIGHT_ID"]] = True
|
||||||
else:
|
else:
|
||||||
if not(
|
if not(
|
||||||
"NODE_NAME" in relation["SPEC"]
|
"RIGHT_ID" in relation
|
||||||
and "NBOR_RELOP" in relation["SPEC"]
|
and "RIGHT_ATTRS" in relation
|
||||||
and "NBOR_NAME" in relation["SPEC"]
|
and "REL_OP" in relation
|
||||||
|
and "LEFT_ID" in relation
|
||||||
):
|
):
|
||||||
raise ValueError(Errors.E100.format(key=key))
|
raise ValueError(Errors.E100.format(key=key))
|
||||||
if (
|
if (
|
||||||
relation["SPEC"]["NODE_NAME"] in visited_nodes
|
relation["RIGHT_ID"] in visited_nodes
|
||||||
or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
|
or relation["LEFT_ID"] not in visited_nodes
|
||||||
):
|
):
|
||||||
raise ValueError(Errors.E101.format(key=key))
|
raise ValueError(Errors.E101.format(key=key))
|
||||||
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
|
if relation["REL_OP"] not in self._ops:
|
||||||
visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
|
raise ValueError(Errors.E1007.format(op=relation["REL_OP"]))
|
||||||
|
visited_nodes[relation["RIGHT_ID"]] = True
|
||||||
|
visited_nodes[relation["LEFT_ID"]] = True
|
||||||
idx = idx + 1
|
idx = idx + 1
|
||||||
|
|
||||||
def add(self, key, patterns, *_patterns, on_match=None):
|
def add(self, key, patterns, *, on_match=None):
|
||||||
"""Add a new matcher rule to the matcher.
|
"""Add a new matcher rule to the matcher.
|
||||||
|
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
patterns (list): The patterns to add for the given key.
|
patterns (list): The patterns to add for the given key.
|
||||||
on_match (callable): Optional callback executed on match.
|
on_match (callable): Optional callback executed on match.
|
||||||
"""
|
"""
|
||||||
if patterns is None or hasattr(patterns, "__call__"): # old API
|
if on_match is not None and not hasattr(on_match, "__call__"):
|
||||||
on_match = patterns
|
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
|
||||||
patterns = _patterns
|
if patterns is None or not isinstance(patterns, List): # old API
|
||||||
|
raise ValueError(Errors.E948.format(arg_type=type(patterns)))
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
if len(pattern) == 0:
|
if len(pattern) == 0:
|
||||||
raise ValueError(Errors.E012.format(key=key))
|
raise ValueError(Errors.E012.format(key=key))
|
||||||
self.validate_input(pattern, key)
|
self.validate_input(pattern, key)
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
|
self._raw_patterns.setdefault(key, [])
|
||||||
|
self._raw_patterns[key].extend(patterns)
|
||||||
_patterns = []
|
_patterns = []
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
token_patterns = []
|
token_patterns = []
|
||||||
for i in range(len(pattern)):
|
for i in range(len(pattern)):
|
||||||
token_pattern = [pattern[i]["PATTERN"]]
|
token_pattern = [pattern[i]["RIGHT_ATTRS"]]
|
||||||
token_patterns.append(token_pattern)
|
token_patterns.append(token_pattern)
|
||||||
# self.patterns.append(token_patterns)
|
|
||||||
_patterns.append(token_patterns)
|
_patterns.append(token_patterns)
|
||||||
self._patterns.setdefault(key, [])
|
self._patterns.setdefault(key, [])
|
||||||
self._callbacks[key] = on_match
|
self._callbacks[key] = on_match
|
||||||
|
@ -135,7 +158,7 @@ cdef class DependencyMatcher:
|
||||||
# TODO: Better ways to hash edges in pattern?
|
# TODO: Better ways to hash edges in pattern?
|
||||||
for j in range(len(_patterns[i])):
|
for j in range(len(_patterns[i])):
|
||||||
k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j))
|
k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j))
|
||||||
self.token_matcher.add(k, [_patterns[i][j]])
|
self.matcher.add(k, [_patterns[i][j]])
|
||||||
_keys_to_token[k] = j
|
_keys_to_token[k] = j
|
||||||
_keys_to_token_list.append(_keys_to_token)
|
_keys_to_token_list.append(_keys_to_token)
|
||||||
self._keys_to_token.setdefault(key, [])
|
self._keys_to_token.setdefault(key, [])
|
||||||
|
@ -144,7 +167,7 @@ cdef class DependencyMatcher:
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
nodes = {}
|
nodes = {}
|
||||||
for i in range(len(pattern)):
|
for i in range(len(pattern)):
|
||||||
nodes[pattern[i]["SPEC"]["NODE_NAME"]] = i
|
nodes[pattern[i]["RIGHT_ID"]] = i
|
||||||
_nodes_list.append(nodes)
|
_nodes_list.append(nodes)
|
||||||
self._nodes.setdefault(key, [])
|
self._nodes.setdefault(key, [])
|
||||||
self._nodes[key].extend(_nodes_list)
|
self._nodes[key].extend(_nodes_list)
|
||||||
|
@ -161,13 +184,13 @@ cdef class DependencyMatcher:
|
||||||
root = -1
|
root = -1
|
||||||
for j in range(len(patterns[i])):
|
for j in range(len(patterns[i])):
|
||||||
token_pattern = patterns[i][j]
|
token_pattern = patterns[i][j]
|
||||||
if ("NBOR_RELOP" not in token_pattern["SPEC"]):
|
if ("REL_OP" not in token_pattern):
|
||||||
heads[j] = ('root', j)
|
heads[j] = ('root', j)
|
||||||
root = j
|
root = j
|
||||||
else:
|
else:
|
||||||
heads[j] = (
|
heads[j] = (
|
||||||
token_pattern["SPEC"]["NBOR_RELOP"],
|
token_pattern["REL_OP"],
|
||||||
_nodes_list[i][token_pattern["SPEC"]["NBOR_NAME"]]
|
_nodes_list[i][token_pattern["LEFT_ID"]]
|
||||||
)
|
)
|
||||||
_heads_list.append(heads)
|
_heads_list.append(heads)
|
||||||
_root_list.append(root)
|
_root_list.append(root)
|
||||||
|
@ -202,11 +225,21 @@ cdef class DependencyMatcher:
|
||||||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||||
"""
|
"""
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
if key not in self._patterns:
|
if key not in self._raw_patterns:
|
||||||
return default
|
return default
|
||||||
return (self._callbacks[key], self._patterns[key])
|
return (self._callbacks[key], self._raw_patterns[key])
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def remove(self, key):
|
||||||
|
key = self._normalize_key(key)
|
||||||
|
if not key in self._patterns:
|
||||||
|
raise ValueError(Errors.E175.format(key=key))
|
||||||
|
self._patterns.pop(key)
|
||||||
|
self._raw_patterns.pop(key)
|
||||||
|
self._nodes.pop(key)
|
||||||
|
self._tree.pop(key)
|
||||||
|
self._root.pop(key)
|
||||||
|
|
||||||
|
def __call__(self, object doclike):
|
||||||
"""Find all token sequences matching the supplied pattern.
|
"""Find all token sequences matching the supplied pattern.
|
||||||
|
|
||||||
doclike (Doc or Span): The document to match over.
|
doclike (Doc or Span): The document to match over.
|
||||||
|
@ -214,8 +247,14 @@ cdef class DependencyMatcher:
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||||
"""
|
"""
|
||||||
|
if isinstance(doclike, Doc):
|
||||||
|
doc = doclike
|
||||||
|
elif isinstance(doclike, Span):
|
||||||
|
doc = doclike.as_doc()
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
||||||
matched_key_trees = []
|
matched_key_trees = []
|
||||||
matches = self.token_matcher(doc)
|
matches = self.matcher(doc)
|
||||||
for key in list(self._patterns.keys()):
|
for key in list(self._patterns.keys()):
|
||||||
_patterns_list = self._patterns[key]
|
_patterns_list = self._patterns[key]
|
||||||
_keys_to_token_list = self._keys_to_token[key]
|
_keys_to_token_list = self._keys_to_token[key]
|
||||||
|
@ -245,25 +284,25 @@ cdef class DependencyMatcher:
|
||||||
|
|
||||||
matched_trees = []
|
matched_trees = []
|
||||||
self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
|
self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
|
||||||
matched_key_trees.append((key,matched_trees))
|
for matched_tree in matched_trees:
|
||||||
|
matched_key_trees.append((key, matched_tree))
|
||||||
for i, (ent_id, nodes) in enumerate(matched_key_trees):
|
for i, (match_id, nodes) in enumerate(matched_key_trees):
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(match_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
on_match(self, doc, i, matched_key_trees)
|
on_match(self, doc, i, matched_key_trees)
|
||||||
return matched_key_trees
|
return matched_key_trees
|
||||||
|
|
||||||
def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
|
def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
|
||||||
cdef bool isValid;
|
cdef bint isValid;
|
||||||
if(patternLength == len(id_to_position.keys())):
|
if patternLength == len(id_to_position.keys()):
|
||||||
isValid = True
|
isValid = True
|
||||||
for node in range(patternLength):
|
for node in range(patternLength):
|
||||||
if(node in tree):
|
if node in tree:
|
||||||
for idx, (relop,nbor) in enumerate(tree[node]):
|
for idx, (relop,nbor) in enumerate(tree[node]):
|
||||||
computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
|
computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
|
||||||
isNbor = False
|
isNbor = False
|
||||||
for computed_nbor in computed_nbors:
|
for computed_nbor in computed_nbors:
|
||||||
if(computed_nbor.i == visited_nodes[nbor]):
|
if computed_nbor.i == visited_nodes[nbor]:
|
||||||
isNbor = True
|
isNbor = True
|
||||||
isValid = isValid & isNbor
|
isValid = isValid & isNbor
|
||||||
if(isValid):
|
if(isValid):
|
||||||
|
@ -295,24 +334,14 @@ cdef class DependencyMatcher:
|
||||||
_node_operator_map[node] = {}
|
_node_operator_map[node] = {}
|
||||||
for operator in all_operators:
|
for operator in all_operators:
|
||||||
_node_operator_map[node][operator] = []
|
_node_operator_map[node][operator] = []
|
||||||
# Used to invoke methods for each operator
|
|
||||||
switcher = {
|
|
||||||
"<": self.dep,
|
|
||||||
">": self.gov,
|
|
||||||
"<<": self.dep_chain,
|
|
||||||
">>": self.gov_chain,
|
|
||||||
".": self.imm_precede,
|
|
||||||
"$+": self.imm_right_sib,
|
|
||||||
"$-": self.imm_left_sib,
|
|
||||||
"$++": self.right_sib,
|
|
||||||
"$--": self.left_sib
|
|
||||||
}
|
|
||||||
for operator in all_operators:
|
for operator in all_operators:
|
||||||
for node in all_nodes:
|
for node in all_nodes:
|
||||||
_node_operator_map[node][operator] = switcher.get(operator)(doc,node)
|
_node_operator_map[node][operator] = self._ops.get(operator)(doc, node)
|
||||||
return _node_operator_map
|
return _node_operator_map
|
||||||
|
|
||||||
def dep(self, doc, node):
|
def dep(self, doc, node):
|
||||||
|
if doc[node].head == doc[node]:
|
||||||
|
return []
|
||||||
return [doc[node].head]
|
return [doc[node].head]
|
||||||
|
|
||||||
def gov(self,doc,node):
|
def gov(self,doc,node):
|
||||||
|
@ -322,36 +351,51 @@ cdef class DependencyMatcher:
|
||||||
return list(doc[node].ancestors)
|
return list(doc[node].ancestors)
|
||||||
|
|
||||||
def gov_chain(self, doc, node):
|
def gov_chain(self, doc, node):
|
||||||
return list(doc[node].subtree)
|
return [t for t in doc[node].subtree if t != doc[node]]
|
||||||
|
|
||||||
def imm_precede(self, doc, node):
|
def imm_precede(self, doc, node):
|
||||||
if node > 0:
|
sent = self._get_sent(doc[node])
|
||||||
|
if node < len(doc) - 1 and doc[node + 1] in sent:
|
||||||
|
return [doc[node + 1]]
|
||||||
|
return []
|
||||||
|
|
||||||
|
def precede(self, doc, node):
|
||||||
|
sent = self._get_sent(doc[node])
|
||||||
|
return [doc[i] for i in range(node + 1, sent.end)]
|
||||||
|
|
||||||
|
def imm_follow(self, doc, node):
|
||||||
|
sent = self._get_sent(doc[node])
|
||||||
|
if node > 0 and doc[node - 1] in sent:
|
||||||
return [doc[node - 1]]
|
return [doc[node - 1]]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def follow(self, doc, node):
|
||||||
|
sent = self._get_sent(doc[node])
|
||||||
|
return [doc[i] for i in range(sent.start, node)]
|
||||||
|
|
||||||
def imm_right_sib(self, doc, node):
|
def imm_right_sib(self, doc, node):
|
||||||
for child in list(doc[node].head.children):
|
for child in list(doc[node].head.children):
|
||||||
if child.i == node - 1:
|
if child.i == node + 1:
|
||||||
return [doc[child.i]]
|
return [doc[child.i]]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def imm_left_sib(self, doc, node):
|
def imm_left_sib(self, doc, node):
|
||||||
for child in list(doc[node].head.children):
|
for child in list(doc[node].head.children):
|
||||||
if child.i == node + 1:
|
if child.i == node - 1:
|
||||||
return [doc[child.i]]
|
return [doc[child.i]]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def right_sib(self, doc, node):
|
def right_sib(self, doc, node):
|
||||||
candidate_children = []
|
candidate_children = []
|
||||||
for child in list(doc[node].head.children):
|
for child in list(doc[node].head.children):
|
||||||
if child.i < node:
|
if child.i > node:
|
||||||
candidate_children.append(doc[child.i])
|
candidate_children.append(doc[child.i])
|
||||||
return candidate_children
|
return candidate_children
|
||||||
|
|
||||||
def left_sib(self, doc, node):
|
def left_sib(self, doc, node):
|
||||||
candidate_children = []
|
candidate_children = []
|
||||||
for child in list(doc[node].head.children):
|
for child in list(doc[node].head.children):
|
||||||
if child.i > node:
|
if child.i < node:
|
||||||
candidate_children.append(doc[child.i])
|
candidate_children.append(doc[child.i])
|
||||||
return candidate_children
|
return candidate_children
|
||||||
|
|
||||||
|
@ -360,3 +404,15 @@ cdef class DependencyMatcher:
|
||||||
return self.vocab.strings.add(key)
|
return self.vocab.strings.add(key)
|
||||||
else:
|
else:
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
def _get_sent(self, token):
|
||||||
|
root = (list(token.ancestors) or [token])[-1]
|
||||||
|
return token.doc[root.left_edge.i:root.right_edge.i + 1]
|
||||||
|
|
||||||
|
|
||||||
|
def unpickle_matcher(vocab, patterns, callbacks):
|
||||||
|
matcher = DependencyMatcher(vocab)
|
||||||
|
for key, pattern in patterns.items():
|
||||||
|
callback = callbacks.get(key, None)
|
||||||
|
matcher.add(key, pattern, on_match=callback)
|
||||||
|
return matcher
|
||||||
|
|
|
@ -31,8 +31,8 @@ DEF PADDING = 5
|
||||||
cdef class Matcher:
|
cdef class Matcher:
|
||||||
"""Match sequences of tokens, based on pattern rules.
|
"""Match sequences of tokens, based on pattern rules.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/matcher
|
DOCS: https://nightly.spacy.io/api/matcher
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching
|
USAGE: https://nightly.spacy.io/usage/rule-based-matching
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, validate=True):
|
def __init__(self, vocab, validate=True):
|
||||||
|
@ -829,9 +829,11 @@ def _get_extra_predicates(spec, extra_predicates):
|
||||||
attr = "ORTH"
|
attr = "ORTH"
|
||||||
attr = IDS.get(attr.upper())
|
attr = IDS.get(attr.upper())
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
|
processed = False
|
||||||
|
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
|
||||||
for type_, cls in predicate_types.items():
|
for type_, cls in predicate_types.items():
|
||||||
if type_ in value:
|
if type_ in value_with_upper_keys:
|
||||||
predicate = cls(len(extra_predicates), attr, value[type_], type_)
|
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
|
||||||
# Don't create a redundant predicates.
|
# Don't create a redundant predicates.
|
||||||
# This helps with efficiency, as we're caching the results.
|
# This helps with efficiency, as we're caching the results.
|
||||||
if predicate.key in seen_predicates:
|
if predicate.key in seen_predicates:
|
||||||
|
@ -840,6 +842,9 @@ def _get_extra_predicates(spec, extra_predicates):
|
||||||
extra_predicates.append(predicate)
|
extra_predicates.append(predicate)
|
||||||
output.append(predicate.i)
|
output.append(predicate.i)
|
||||||
seen_predicates[predicate.key] = predicate.i
|
seen_predicates[predicate.key] = predicate.i
|
||||||
|
processed = True
|
||||||
|
if not processed:
|
||||||
|
warnings.warn(Warnings.W035.format(pattern=value))
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,8 +19,8 @@ cdef class PhraseMatcher:
|
||||||
sequences based on lists of token descriptions, the `PhraseMatcher` accepts
|
sequences based on lists of token descriptions, the `PhraseMatcher` accepts
|
||||||
match patterns in the form of `Doc` objects.
|
match patterns in the form of `Doc` objects.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher
|
DOCS: https://nightly.spacy.io/api/phrasematcher
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher
|
USAGE: https://nightly.spacy.io/usage/rule-based-matching#phrasematcher
|
||||||
|
|
||||||
Adapted from FlashText: https://github.com/vi3k6i5/flashtext
|
Adapted from FlashText: https://github.com/vi3k6i5/flashtext
|
||||||
MIT License (see `LICENSE`)
|
MIT License (see `LICENSE`)
|
||||||
|
@ -34,7 +34,7 @@ cdef class PhraseMatcher:
|
||||||
attr (int / str): Token attribute to match on.
|
attr (int / str): Token attribute to match on.
|
||||||
validate (bool): Perform additional validation when patterns are added.
|
validate (bool): Perform additional validation when patterns are added.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#init
|
DOCS: https://nightly.spacy.io/api/phrasematcher#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self._callbacks = {}
|
self._callbacks = {}
|
||||||
|
@ -61,7 +61,7 @@ cdef class PhraseMatcher:
|
||||||
|
|
||||||
RETURNS (int): The number of rules.
|
RETURNS (int): The number of rules.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#len
|
DOCS: https://nightly.spacy.io/api/phrasematcher#len
|
||||||
"""
|
"""
|
||||||
return len(self._callbacks)
|
return len(self._callbacks)
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ cdef class PhraseMatcher:
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#contains
|
DOCS: https://nightly.spacy.io/api/phrasematcher#contains
|
||||||
"""
|
"""
|
||||||
return key in self._callbacks
|
return key in self._callbacks
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
|
||||||
|
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#remove
|
DOCS: https://nightly.spacy.io/api/phrasematcher#remove
|
||||||
"""
|
"""
|
||||||
if key not in self._docs:
|
if key not in self._docs:
|
||||||
raise KeyError(key)
|
raise KeyError(key)
|
||||||
|
@ -164,7 +164,7 @@ cdef class PhraseMatcher:
|
||||||
as variable arguments. Will be ignored if a list of patterns is
|
as variable arguments. Will be ignored if a list of patterns is
|
||||||
provided as the second argument.
|
provided as the second argument.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#add
|
DOCS: https://nightly.spacy.io/api/phrasematcher#add
|
||||||
"""
|
"""
|
||||||
if docs is None or hasattr(docs, "__call__"): # old API
|
if docs is None or hasattr(docs, "__call__"): # old API
|
||||||
on_match = docs
|
on_match = docs
|
||||||
|
@ -228,7 +228,7 @@ cdef class PhraseMatcher:
|
||||||
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
||||||
to True, a list of Span objects is returned.
|
to True, a list of Span objects is returned.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#call
|
DOCS: https://nightly.spacy.io/api/phrasematcher#call
|
||||||
"""
|
"""
|
||||||
matches = []
|
matches = []
|
||||||
if doc is None or len(doc) == 0:
|
if doc is None or len(doc) == 0:
|
||||||
|
|
|
@ -24,7 +24,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.KBFromFile.v1")
|
@registry.misc.register("spacy.KBFromFile.v1")
|
||||||
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
|
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
def kb_from_file(vocab):
|
def kb_from_file(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
||||||
|
@ -34,7 +34,7 @@ def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
return kb_from_file
|
return kb_from_file
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.EmptyKB.v1")
|
@registry.misc.register("spacy.EmptyKB.v1")
|
||||||
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
def empty_kb_factory(vocab):
|
def empty_kb_factory(vocab):
|
||||||
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
|
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||||
|
@ -42,6 +42,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
return empty_kb_factory
|
return empty_kb_factory
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.CandidateGenerator.v1")
|
@registry.misc.register("spacy.CandidateGenerator.v1")
|
||||||
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
|
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
|
||||||
return get_candidates
|
return get_candidates
|
||||||
|
|
|
@ -38,7 +38,7 @@ class AttributeRuler(Pipe):
|
||||||
"""Set token-level attributes for tokens matched by Matcher patterns.
|
"""Set token-level attributes for tokens matched by Matcher patterns.
|
||||||
Additionally supports importing patterns from tag maps and morph rules.
|
Additionally supports importing patterns from tag maps and morph rules.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler
|
DOCS: https://nightly.spacy.io/api/attributeruler
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -59,7 +59,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#init
|
DOCS: https://nightly.spacy.io/api/attributeruler#init
|
||||||
"""
|
"""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -77,7 +77,7 @@ class AttributeRuler(Pipe):
|
||||||
doc (Doc): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#call
|
DOCS: https://nightly.spacy.io/api/attributeruler#call
|
||||||
"""
|
"""
|
||||||
matches = sorted(self.matcher(doc))
|
matches = sorted(self.matcher(doc))
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ class AttributeRuler(Pipe):
|
||||||
tag_map (dict): The tag map that maps fine-grained tags to
|
tag_map (dict): The tag map that maps fine-grained tags to
|
||||||
coarse-grained tags and morphological features.
|
coarse-grained tags and morphological features.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules
|
||||||
"""
|
"""
|
||||||
for tag, attrs in tag_map.items():
|
for tag, attrs in tag_map.items():
|
||||||
pattern = [{"TAG": tag}]
|
pattern = [{"TAG": tag}]
|
||||||
|
@ -139,7 +139,7 @@ class AttributeRuler(Pipe):
|
||||||
fine-grained tags to coarse-grained tags, lemmas and morphological
|
fine-grained tags to coarse-grained tags, lemmas and morphological
|
||||||
features.
|
features.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules
|
||||||
"""
|
"""
|
||||||
for tag in morph_rules:
|
for tag in morph_rules:
|
||||||
for word in morph_rules[tag]:
|
for word in morph_rules[tag]:
|
||||||
|
@ -163,7 +163,7 @@ class AttributeRuler(Pipe):
|
||||||
index (int): The index of the token in the matched span to modify. May
|
index (int): The index of the token in the matched span to modify. May
|
||||||
be negative to index from the end of the span. Defaults to 0.
|
be negative to index from the end of the span. Defaults to 0.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#add
|
DOCS: https://nightly.spacy.io/api/attributeruler#add
|
||||||
"""
|
"""
|
||||||
self.matcher.add(len(self.attrs), patterns)
|
self.matcher.add(len(self.attrs), patterns)
|
||||||
self._attrs_unnormed.append(attrs)
|
self._attrs_unnormed.append(attrs)
|
||||||
|
@ -178,7 +178,7 @@ class AttributeRuler(Pipe):
|
||||||
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||||
add as patterns.
|
add as patterns.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#add_patterns
|
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
|
||||||
"""
|
"""
|
||||||
for p in pattern_dicts:
|
for p in pattern_dicts:
|
||||||
self.add(**p)
|
self.add(**p)
|
||||||
|
@ -203,7 +203,7 @@ class AttributeRuler(Pipe):
|
||||||
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
||||||
and "lemma" for the target token attributes.
|
and "lemma" for the target token attributes.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#score
|
DOCS: https://nightly.spacy.io/api/tagger#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "AttributeRuler.score")
|
validate_examples(examples, "AttributeRuler.score")
|
||||||
results = {}
|
results = {}
|
||||||
|
@ -227,7 +227,7 @@ class AttributeRuler(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#to_bytes
|
DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
@ -243,7 +243,7 @@ class AttributeRuler(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
returns (AttributeRuler): The loaded object.
|
returns (AttributeRuler): The loaded object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#from_bytes
|
DOCS: https://nightly.spacy.io/api/attributeruler#from_bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_patterns(b):
|
def load_patterns(b):
|
||||||
|
@ -264,7 +264,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
DOCS: https://spacy.io/api/attributeruler#to_disk
|
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
|
@ -279,7 +279,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
DOCS: https://spacy.io/api/attributeruler#from_disk
|
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_patterns(p):
|
def load_patterns(p):
|
||||||
|
|
|
@ -105,7 +105,7 @@ def make_parser(
|
||||||
cdef class DependencyParser(Parser):
|
cdef class DependencyParser(Parser):
|
||||||
"""Pipeline component for dependency parsing.
|
"""Pipeline component for dependency parsing.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/dependencyparser
|
DOCS: https://nightly.spacy.io/api/dependencyparser
|
||||||
"""
|
"""
|
||||||
TransitionSystem = ArcEager
|
TransitionSystem = ArcEager
|
||||||
|
|
||||||
|
@ -146,7 +146,7 @@ cdef class DependencyParser(Parser):
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||||
and Scorer.score_deps.
|
and Scorer.score_deps.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/dependencyparser#score
|
DOCS: https://nightly.spacy.io/api/dependencyparser#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "DependencyParser.score")
|
validate_examples(examples, "DependencyParser.score")
|
||||||
def dep_getter(token, attr):
|
def dep_getter(token, attr):
|
||||||
|
@ -156,7 +156,7 @@ cdef class DependencyParser(Parser):
|
||||||
results = {}
|
results = {}
|
||||||
results.update(Scorer.score_spans(examples, "sents", **kwargs))
|
results.update(Scorer.score_spans(examples, "sents", **kwargs))
|
||||||
kwargs.setdefault("getter", dep_getter)
|
kwargs.setdefault("getter", dep_getter)
|
||||||
kwargs.setdefault("ignore_label", ("p", "punct"))
|
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||||
del results["sents_per_type"]
|
del results["sents_per_type"]
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -39,12 +39,12 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||||
assigns=["token.ent_kb_id"],
|
assigns=["token.ent_kb_id"],
|
||||||
default_config={
|
default_config={
|
||||||
"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
|
"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
|
||||||
"model": DEFAULT_NEL_MODEL,
|
"model": DEFAULT_NEL_MODEL,
|
||||||
"labels_discard": [],
|
"labels_discard": [],
|
||||||
"incl_prior": True,
|
"incl_prior": True,
|
||||||
"incl_context": True,
|
"incl_context": True,
|
||||||
"get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_entity_linker(
|
def make_entity_linker(
|
||||||
|
@ -83,7 +83,7 @@ def make_entity_linker(
|
||||||
class EntityLinker(Pipe):
|
class EntityLinker(Pipe):
|
||||||
"""Pipeline component for named entity linking.
|
"""Pipeline component for named entity linking.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker
|
DOCS: https://nightly.spacy.io/api/entitylinker
|
||||||
"""
|
"""
|
||||||
|
|
||||||
NIL = "NIL" # string used to refer to a non-existing link
|
NIL = "NIL" # string used to refer to a non-existing link
|
||||||
|
@ -111,7 +111,7 @@ class EntityLinker(Pipe):
|
||||||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||||
incl_context (bool): Whether or not to include the local context in the model.
|
incl_context (bool): Whether or not to include the local context in the model.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#init
|
DOCS: https://nightly.spacy.io/api/entitylinker#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -151,7 +151,7 @@ class EntityLinker(Pipe):
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#begin_training
|
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
|
||||||
"""
|
"""
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
nO = self.kb.entity_vector_length
|
nO = self.kb.entity_vector_length
|
||||||
|
@ -182,7 +182,7 @@ class EntityLinker(Pipe):
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#update
|
DOCS: https://nightly.spacy.io/api/entitylinker#update
|
||||||
"""
|
"""
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
if losses is None:
|
if losses is None:
|
||||||
|
@ -264,7 +264,7 @@ class EntityLinker(Pipe):
|
||||||
doc (Doc): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#call
|
DOCS: https://nightly.spacy.io/api/entitylinker#call
|
||||||
"""
|
"""
|
||||||
kb_ids = self.predict([doc])
|
kb_ids = self.predict([doc])
|
||||||
self.set_annotations([doc], kb_ids)
|
self.set_annotations([doc], kb_ids)
|
||||||
|
@ -279,7 +279,7 @@ class EntityLinker(Pipe):
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#pipe
|
DOCS: https://nightly.spacy.io/api/entitylinker#pipe
|
||||||
"""
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
kb_ids = self.predict(docs)
|
kb_ids = self.predict(docs)
|
||||||
|
@ -294,7 +294,7 @@ class EntityLinker(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to predict.
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
RETURNS (List[int]): The models prediction for each document.
|
RETURNS (List[int]): The models prediction for each document.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#predict
|
DOCS: https://nightly.spacy.io/api/entitylinker#predict
|
||||||
"""
|
"""
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
entity_count = 0
|
entity_count = 0
|
||||||
|
@ -391,7 +391,7 @@ class EntityLinker(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
DOCS: https://nightly.spacy.io/api/entitylinker#set_annotations
|
||||||
"""
|
"""
|
||||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||||
if count_ents != len(kb_ids):
|
if count_ents != len(kb_ids):
|
||||||
|
@ -412,7 +412,7 @@ class EntityLinker(Pipe):
|
||||||
path (str / Path): Path to a directory.
|
path (str / Path): Path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#to_disk
|
DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
|
@ -430,7 +430,7 @@ class EntityLinker(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (EntityLinker): The modified EntityLinker object.
|
RETURNS (EntityLinker): The modified EntityLinker object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#from_disk
|
DOCS: https://nightly.spacy.io/api/entitylinker#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
|
|
|
@ -53,8 +53,8 @@ class EntityRuler:
|
||||||
purely rule-based entity recognition system. After initialization, the
|
purely rule-based entity recognition system. After initialization, the
|
||||||
component is typically added to the pipeline using `nlp.add_pipe`.
|
component is typically added to the pipeline using `nlp.add_pipe`.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler
|
DOCS: https://nightly.spacy.io/api/entityruler
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
|
USAGE: https://nightly.spacy.io/usage/rule-based-matching#entityruler
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -88,7 +88,7 @@ class EntityRuler:
|
||||||
added by the model, overwrite them by matches if necessary.
|
added by the model, overwrite them by matches if necessary.
|
||||||
ent_id_sep (str): Separator used internally for entity IDs.
|
ent_id_sep (str): Separator used internally for entity IDs.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#init
|
DOCS: https://nightly.spacy.io/api/entityruler#init
|
||||||
"""
|
"""
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -127,13 +127,13 @@ class EntityRuler:
|
||||||
doc (Doc): The Doc object in the pipeline.
|
doc (Doc): The Doc object in the pipeline.
|
||||||
RETURNS (Doc): The Doc with added entities, if available.
|
RETURNS (Doc): The Doc with added entities, if available.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#call
|
DOCS: https://nightly.spacy.io/api/entityruler#call
|
||||||
"""
|
"""
|
||||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||||
matches = set(
|
matches = set(
|
||||||
[(m_id, start, end) for m_id, start, end in matches if start != end]
|
[(m_id, start, end) for m_id, start, end in matches if start != end]
|
||||||
)
|
)
|
||||||
get_sort_key = lambda m: (m[2] - m[1], m[1])
|
get_sort_key = lambda m: (m[2] - m[1], -m[1])
|
||||||
matches = sorted(matches, key=get_sort_key, reverse=True)
|
matches = sorted(matches, key=get_sort_key, reverse=True)
|
||||||
entities = list(doc.ents)
|
entities = list(doc.ents)
|
||||||
new_entities = []
|
new_entities = []
|
||||||
|
@ -165,7 +165,7 @@ class EntityRuler:
|
||||||
|
|
||||||
RETURNS (set): The string labels.
|
RETURNS (set): The string labels.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#labels
|
DOCS: https://nightly.spacy.io/api/entityruler#labels
|
||||||
"""
|
"""
|
||||||
keys = set(self.token_patterns.keys())
|
keys = set(self.token_patterns.keys())
|
||||||
keys.update(self.phrase_patterns.keys())
|
keys.update(self.phrase_patterns.keys())
|
||||||
|
@ -185,7 +185,7 @@ class EntityRuler:
|
||||||
|
|
||||||
RETURNS (set): The string entity ids.
|
RETURNS (set): The string entity ids.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#ent_ids
|
DOCS: https://nightly.spacy.io/api/entityruler#ent_ids
|
||||||
"""
|
"""
|
||||||
keys = set(self.token_patterns.keys())
|
keys = set(self.token_patterns.keys())
|
||||||
keys.update(self.phrase_patterns.keys())
|
keys.update(self.phrase_patterns.keys())
|
||||||
|
@ -203,7 +203,7 @@ class EntityRuler:
|
||||||
|
|
||||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#patterns
|
DOCS: https://nightly.spacy.io/api/entityruler#patterns
|
||||||
"""
|
"""
|
||||||
all_patterns = []
|
all_patterns = []
|
||||||
for label, patterns in self.token_patterns.items():
|
for label, patterns in self.token_patterns.items():
|
||||||
|
@ -230,7 +230,7 @@ class EntityRuler:
|
||||||
|
|
||||||
patterns (list): The patterns to add.
|
patterns (list): The patterns to add.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
DOCS: https://nightly.spacy.io/api/entityruler#add_patterns
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
||||||
|
@ -324,7 +324,7 @@ class EntityRuler:
|
||||||
patterns_bytes (bytes): The bytestring to load.
|
patterns_bytes (bytes): The bytestring to load.
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
DOCS: https://nightly.spacy.io/api/entityruler#from_bytes
|
||||||
"""
|
"""
|
||||||
cfg = srsly.msgpack_loads(patterns_bytes)
|
cfg = srsly.msgpack_loads(patterns_bytes)
|
||||||
self.clear()
|
self.clear()
|
||||||
|
@ -346,7 +346,7 @@ class EntityRuler:
|
||||||
|
|
||||||
RETURNS (bytes): The serialized patterns.
|
RETURNS (bytes): The serialized patterns.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#to_bytes
|
DOCS: https://nightly.spacy.io/api/entityruler#to_bytes
|
||||||
"""
|
"""
|
||||||
serial = {
|
serial = {
|
||||||
"overwrite": self.overwrite,
|
"overwrite": self.overwrite,
|
||||||
|
@ -365,7 +365,7 @@ class EntityRuler:
|
||||||
path (str / Path): The JSONL file to load.
|
path (str / Path): The JSONL file to load.
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
DOCS: https://nightly.spacy.io/api/entityruler#from_disk
|
||||||
"""
|
"""
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
self.clear()
|
self.clear()
|
||||||
|
@ -401,7 +401,7 @@ class EntityRuler:
|
||||||
|
|
||||||
path (str / Path): The JSONL file to save.
|
path (str / Path): The JSONL file to save.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#to_disk
|
DOCS: https://nightly.spacy.io/api/entityruler#to_disk
|
||||||
"""
|
"""
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
cfg = {
|
cfg = {
|
||||||
|
|
|
@ -15,7 +15,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
|
||||||
doc (Doc): The Doc object.
|
doc (Doc): The Doc object.
|
||||||
RETURNS (Doc): The Doc object with merged noun chunks.
|
RETURNS (Doc): The Doc object with merged noun chunks.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks
|
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
|
||||||
"""
|
"""
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
return doc
|
return doc
|
||||||
|
@ -37,7 +37,7 @@ def merge_entities(doc: Doc):
|
||||||
doc (Doc): The Doc object.
|
doc (Doc): The Doc object.
|
||||||
RETURNS (Doc): The Doc object with merged entities.
|
RETURNS (Doc): The Doc object with merged entities.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipeline-functions#merge_entities
|
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_entities
|
||||||
"""
|
"""
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
|
@ -54,7 +54,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
|
||||||
label (str): The subtoken dependency label.
|
label (str): The subtoken dependency label.
|
||||||
RETURNS (Doc): The Doc object with merged subtokens.
|
RETURNS (Doc): The Doc object with merged subtokens.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
|
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_subtokens
|
||||||
"""
|
"""
|
||||||
# TODO: make stateful component with "label" config
|
# TODO: make stateful component with "label" config
|
||||||
merger = Matcher(doc.vocab)
|
merger = Matcher(doc.vocab)
|
||||||
|
|
|
@ -43,7 +43,7 @@ class Lemmatizer(Pipe):
|
||||||
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
|
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
|
||||||
lookup tables.
|
lookup tables.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer
|
DOCS: https://nightly.spacy.io/api/lemmatizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -54,7 +54,7 @@ class Lemmatizer(Pipe):
|
||||||
mode (str): The lemmatizer mode.
|
mode (str): The lemmatizer mode.
|
||||||
RETURNS (dict): The lookups configuration settings for this mode.
|
RETURNS (dict): The lookups configuration settings for this mode.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
|
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||||
"""
|
"""
|
||||||
if mode == "lookup":
|
if mode == "lookup":
|
||||||
return {
|
return {
|
||||||
|
@ -80,7 +80,7 @@ class Lemmatizer(Pipe):
|
||||||
lookups should be loaded.
|
lookups should be loaded.
|
||||||
RETURNS (Lookups): The Lookups object.
|
RETURNS (Lookups): The Lookups object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
|
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||||
"""
|
"""
|
||||||
config = cls.get_lookups_config(mode)
|
config = cls.get_lookups_config(mode)
|
||||||
required_tables = config.get("required_tables", [])
|
required_tables = config.get("required_tables", [])
|
||||||
|
@ -123,7 +123,7 @@ class Lemmatizer(Pipe):
|
||||||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||||
`False`.
|
`False`.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#init
|
DOCS: https://nightly.spacy.io/api/lemmatizer#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -152,7 +152,7 @@ class Lemmatizer(Pipe):
|
||||||
doc (Doc): The Doc to process.
|
doc (Doc): The Doc to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#call
|
DOCS: https://nightly.spacy.io/api/lemmatizer#call
|
||||||
"""
|
"""
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if self.overwrite or token.lemma == 0:
|
if self.overwrite or token.lemma == 0:
|
||||||
|
@ -168,7 +168,7 @@ class Lemmatizer(Pipe):
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#pipe
|
DOCS: https://nightly.spacy.io/api/lemmatizer#pipe
|
||||||
"""
|
"""
|
||||||
for doc in stream:
|
for doc in stream:
|
||||||
doc = self(doc)
|
doc = self(doc)
|
||||||
|
@ -180,7 +180,7 @@ class Lemmatizer(Pipe):
|
||||||
token (Token): The token to lemmatize.
|
token (Token): The token to lemmatize.
|
||||||
RETURNS (list): The available lemmas for the string.
|
RETURNS (list): The available lemmas for the string.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
|
DOCS: https://nightly.spacy.io/api/lemmatizer#lookup_lemmatize
|
||||||
"""
|
"""
|
||||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||||
result = lookup_table.get(token.text, token.text)
|
result = lookup_table.get(token.text, token.text)
|
||||||
|
@ -194,7 +194,7 @@ class Lemmatizer(Pipe):
|
||||||
token (Token): The token to lemmatize.
|
token (Token): The token to lemmatize.
|
||||||
RETURNS (list): The available lemmas for the string.
|
RETURNS (list): The available lemmas for the string.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
|
DOCS: https://nightly.spacy.io/api/lemmatizer#rule_lemmatize
|
||||||
"""
|
"""
|
||||||
cache_key = (token.orth, token.pos, token.morph)
|
cache_key = (token.orth, token.pos, token.morph)
|
||||||
if cache_key in self.cache:
|
if cache_key in self.cache:
|
||||||
|
@ -260,7 +260,7 @@ class Lemmatizer(Pipe):
|
||||||
token (Token): The token.
|
token (Token): The token.
|
||||||
RETURNS (bool): Whether the token is a base form.
|
RETURNS (bool): Whether the token is a base form.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#is_base_form
|
DOCS: https://nightly.spacy.io/api/lemmatizer#is_base_form
|
||||||
"""
|
"""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -270,7 +270,7 @@ class Lemmatizer(Pipe):
|
||||||
examples (Iterable[Example]): The examples to score.
|
examples (Iterable[Example]): The examples to score.
|
||||||
RETURNS (Dict[str, Any]): The scores.
|
RETURNS (Dict[str, Any]): The scores.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#score
|
DOCS: https://nightly.spacy.io/api/lemmatizer#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Lemmatizer.score")
|
validate_examples(examples, "Lemmatizer.score")
|
||||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||||
|
@ -282,7 +282,7 @@ class Lemmatizer(Pipe):
|
||||||
it doesn't exist.
|
it doesn't exist.
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#to_disk
|
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
|
@ -297,7 +297,7 @@ class Lemmatizer(Pipe):
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Vocab): The modified `Vocab` object.
|
RETURNS (Vocab): The modified `Vocab` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#to_disk
|
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||||
"""
|
"""
|
||||||
deserialize = {}
|
deserialize = {}
|
||||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||||
|
@ -310,7 +310,7 @@ class Lemmatizer(Pipe):
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#to_bytes
|
DOCS: https://nightly.spacy.io/api/vocab#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
@ -324,7 +324,7 @@ class Lemmatizer(Pipe):
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Vocab): The `Vocab` object.
|
RETURNS (Vocab): The `Vocab` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#from_bytes
|
DOCS: https://nightly.spacy.io/api/vocab#from_bytes
|
||||||
"""
|
"""
|
||||||
deserialize = {}
|
deserialize = {}
|
||||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||||
|
|
|
@ -79,7 +79,7 @@ class Morphologizer(Tagger):
|
||||||
labels_morph (dict): Mapping of morph + POS tags to morph labels.
|
labels_morph (dict): Mapping of morph + POS tags to morph labels.
|
||||||
labels_pos (dict): Mapping of morph + POS tags to POS tags.
|
labels_pos (dict): Mapping of morph + POS tags to POS tags.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#init
|
DOCS: https://nightly.spacy.io/api/morphologizer#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -106,7 +106,7 @@ class Morphologizer(Tagger):
|
||||||
label (str): The label to add.
|
label (str): The label to add.
|
||||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#add_label
|
DOCS: https://nightly.spacy.io/api/morphologizer#add_label
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
|
@ -139,7 +139,7 @@ class Morphologizer(Tagger):
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#begin_training
|
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
|
||||||
"""
|
"""
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
|
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
|
||||||
|
@ -169,7 +169,7 @@ class Morphologizer(Tagger):
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
|
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#set_annotations
|
DOCS: https://nightly.spacy.io/api/morphologizer#set_annotations
|
||||||
"""
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
@ -194,7 +194,7 @@ class Morphologizer(Tagger):
|
||||||
scores: Scores representing the model's predictions.
|
scores: Scores representing the model's predictions.
|
||||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#get_loss
|
DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Morphologizer.get_loss")
|
validate_examples(examples, "Morphologizer.get_loss")
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
|
@ -231,7 +231,7 @@ class Morphologizer(Tagger):
|
||||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#score
|
DOCS: https://nightly.spacy.io/api/morphologizer#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Morphologizer.score")
|
validate_examples(examples, "Morphologizer.score")
|
||||||
results = {}
|
results = {}
|
||||||
|
@ -247,7 +247,7 @@ class Morphologizer(Tagger):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#to_bytes
|
DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
|
@ -262,7 +262,7 @@ class Morphologizer(Tagger):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Morphologizer): The loaded Morphologizer.
|
RETURNS (Morphologizer): The loaded Morphologizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#from_bytes
|
DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
|
||||||
"""
|
"""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
|
@ -284,7 +284,7 @@ class Morphologizer(Tagger):
|
||||||
path (str / Path): Path to a directory.
|
path (str / Path): Path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#to_disk
|
DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
|
@ -300,7 +300,7 @@ class Morphologizer(Tagger):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Morphologizer): The modified Morphologizer object.
|
RETURNS (Morphologizer): The modified Morphologizer object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#from_disk
|
DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
|
||||||
"""
|
"""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
|
|
|
@ -88,7 +88,7 @@ def make_ner(
|
||||||
cdef class EntityRecognizer(Parser):
|
cdef class EntityRecognizer(Parser):
|
||||||
"""Pipeline component for named entity recognition.
|
"""Pipeline component for named entity recognition.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityrecognizer
|
DOCS: https://nightly.spacy.io/api/entityrecognizer
|
||||||
"""
|
"""
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
|
@ -119,7 +119,7 @@ cdef class EntityRecognizer(Parser):
|
||||||
examples (Iterable[Example]): The examples to score.
|
examples (Iterable[Example]): The examples to score.
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityrecognizer#score
|
DOCS: https://nightly.spacy.io/api/entityrecognizer#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "EntityRecognizer.score")
|
validate_examples(examples, "EntityRecognizer.score")
|
||||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||||
|
|
|
@ -15,7 +15,7 @@ cdef class Pipe:
|
||||||
from it and it defines the interface that components should follow to
|
from it and it defines the interface that components should follow to
|
||||||
function as trainable components in a spaCy pipeline.
|
function as trainable components in a spaCy pipeline.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe
|
DOCS: https://nightly.spacy.io/api/pipe
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name, **cfg):
|
def __init__(self, vocab, model, name, **cfg):
|
||||||
"""Initialize a pipeline component.
|
"""Initialize a pipeline component.
|
||||||
|
@ -25,7 +25,7 @@ cdef class Pipe:
|
||||||
name (str): The component instance name.
|
name (str): The component instance name.
|
||||||
**cfg: Additional settings and config parameters.
|
**cfg: Additional settings and config parameters.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#init
|
DOCS: https://nightly.spacy.io/api/pipe#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -40,7 +40,7 @@ cdef class Pipe:
|
||||||
docs (Doc): The Doc to process.
|
docs (Doc): The Doc to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#call
|
DOCS: https://nightly.spacy.io/api/pipe#call
|
||||||
"""
|
"""
|
||||||
scores = self.predict([doc])
|
scores = self.predict([doc])
|
||||||
self.set_annotations([doc], scores)
|
self.set_annotations([doc], scores)
|
||||||
|
@ -55,7 +55,7 @@ cdef class Pipe:
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#pipe
|
DOCS: https://nightly.spacy.io/api/pipe#pipe
|
||||||
"""
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
scores = self.predict(docs)
|
scores = self.predict(docs)
|
||||||
|
@ -69,7 +69,7 @@ cdef class Pipe:
|
||||||
docs (Iterable[Doc]): The documents to predict.
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
RETURNS: Vector representations for each token in the documents.
|
RETURNS: Vector representations for each token in the documents.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#predict
|
DOCS: https://nightly.spacy.io/api/pipe#predict
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
|
raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
|
||||||
|
|
||||||
|
@ -79,7 +79,7 @@ cdef class Pipe:
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
scores: The scores to assign.
|
scores: The scores to assign.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#set_annotations
|
DOCS: https://nightly.spacy.io/api/pipe#set_annotations
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
|
raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ cdef class Pipe:
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#update
|
DOCS: https://nightly.spacy.io/api/pipe#update
|
||||||
"""
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -132,7 +132,7 @@ cdef class Pipe:
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#rehearse
|
DOCS: https://nightly.spacy.io/api/pipe#rehearse
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ cdef class Pipe:
|
||||||
scores: Scores representing the model's predictions.
|
scores: Scores representing the model's predictions.
|
||||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#get_loss
|
DOCS: https://nightly.spacy.io/api/pipe#get_loss
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
|
raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
|
||||||
|
|
||||||
|
@ -156,7 +156,7 @@ cdef class Pipe:
|
||||||
label (str): The label to add.
|
label (str): The label to add.
|
||||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#add_label
|
DOCS: https://nightly.spacy.io/api/pipe#add_label
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||||
|
|
||||||
|
@ -165,7 +165,7 @@ cdef class Pipe:
|
||||||
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#create_optimizer
|
DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
|
||||||
"""
|
"""
|
||||||
return util.create_default_optimizer()
|
return util.create_default_optimizer()
|
||||||
|
|
||||||
|
@ -181,7 +181,7 @@ cdef class Pipe:
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#begin_training
|
DOCS: https://nightly.spacy.io/api/pipe#begin_training
|
||||||
"""
|
"""
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
|
@ -200,7 +200,7 @@ cdef class Pipe:
|
||||||
|
|
||||||
params (dict): The parameter values to use in the model.
|
params (dict): The parameter values to use in the model.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#use_params
|
DOCS: https://nightly.spacy.io/api/pipe#use_params
|
||||||
"""
|
"""
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
@ -211,7 +211,7 @@ cdef class Pipe:
|
||||||
examples (Iterable[Example]): The examples to score.
|
examples (Iterable[Example]): The examples to score.
|
||||||
RETURNS (Dict[str, Any]): The scores.
|
RETURNS (Dict[str, Any]): The scores.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#score
|
DOCS: https://nightly.spacy.io/api/pipe#score
|
||||||
"""
|
"""
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -221,7 +221,7 @@ cdef class Pipe:
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#to_bytes
|
DOCS: https://nightly.spacy.io/api/pipe#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
|
@ -236,7 +236,7 @@ cdef class Pipe:
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Pipe): The loaded object.
|
RETURNS (Pipe): The loaded object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#from_bytes
|
DOCS: https://nightly.spacy.io/api/pipe#from_bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
|
@ -259,7 +259,7 @@ cdef class Pipe:
|
||||||
path (str / Path): Path to a directory.
|
path (str / Path): Path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#to_disk
|
DOCS: https://nightly.spacy.io/api/pipe#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
|
@ -274,7 +274,7 @@ cdef class Pipe:
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Pipe): The loaded object.
|
RETURNS (Pipe): The loaded object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#from_disk
|
DOCS: https://nightly.spacy.io/api/pipe#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
|
|
|
@ -29,7 +29,7 @@ def make_sentencizer(
|
||||||
class Sentencizer(Pipe):
|
class Sentencizer(Pipe):
|
||||||
"""Segment the Doc into sentences using a rule-based strategy.
|
"""Segment the Doc into sentences using a rule-based strategy.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer
|
DOCS: https://nightly.spacy.io/api/sentencizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||||
|
@ -51,7 +51,7 @@ class Sentencizer(Pipe):
|
||||||
serialized with the nlp object.
|
serialized with the nlp object.
|
||||||
RETURNS (Sentencizer): The sentencizer component.
|
RETURNS (Sentencizer): The sentencizer component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#init
|
DOCS: https://nightly.spacy.io/api/sentencizer#init
|
||||||
"""
|
"""
|
||||||
self.name = name
|
self.name = name
|
||||||
if punct_chars:
|
if punct_chars:
|
||||||
|
@ -68,7 +68,7 @@ class Sentencizer(Pipe):
|
||||||
doc (Doc): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#call
|
DOCS: https://nightly.spacy.io/api/sentencizer#call
|
||||||
"""
|
"""
|
||||||
start = 0
|
start = 0
|
||||||
seen_period = False
|
seen_period = False
|
||||||
|
@ -94,7 +94,7 @@ class Sentencizer(Pipe):
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#pipe
|
DOCS: https://nightly.spacy.io/api/sentencizer#pipe
|
||||||
"""
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
predictions = self.predict(docs)
|
predictions = self.predict(docs)
|
||||||
|
@ -157,7 +157,7 @@ class Sentencizer(Pipe):
|
||||||
examples (Iterable[Example]): The examples to score.
|
examples (Iterable[Example]): The examples to score.
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#score
|
DOCS: https://nightly.spacy.io/api/sentencizer#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Sentencizer.score")
|
validate_examples(examples, "Sentencizer.score")
|
||||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||||
|
@ -169,7 +169,7 @@ class Sentencizer(Pipe):
|
||||||
|
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#to_bytes
|
DOCS: https://nightly.spacy.io/api/sentencizer#to_bytes
|
||||||
"""
|
"""
|
||||||
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
|
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
|
||||||
|
|
||||||
|
@ -179,7 +179,7 @@ class Sentencizer(Pipe):
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
RETURNS (Sentencizer): The loaded object.
|
RETURNS (Sentencizer): The loaded object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#from_bytes
|
DOCS: https://nightly.spacy.io/api/sentencizer#from_bytes
|
||||||
"""
|
"""
|
||||||
cfg = srsly.msgpack_loads(bytes_data)
|
cfg = srsly.msgpack_loads(bytes_data)
|
||||||
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
||||||
|
@ -188,7 +188,7 @@ class Sentencizer(Pipe):
|
||||||
def to_disk(self, path, *, exclude=tuple()):
|
def to_disk(self, path, *, exclude=tuple()):
|
||||||
"""Serialize the sentencizer to disk.
|
"""Serialize the sentencizer to disk.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#to_disk
|
DOCS: https://nightly.spacy.io/api/sentencizer#to_disk
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
path = path.with_suffix(".json")
|
path = path.with_suffix(".json")
|
||||||
|
@ -198,7 +198,7 @@ class Sentencizer(Pipe):
|
||||||
def from_disk(self, path, *, exclude=tuple()):
|
def from_disk(self, path, *, exclude=tuple()):
|
||||||
"""Load the sentencizer from disk.
|
"""Load the sentencizer from disk.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#from_disk
|
DOCS: https://nightly.spacy.io/api/sentencizer#from_disk
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
path = path.with_suffix(".json")
|
path = path.with_suffix(".json")
|
||||||
|
|
|
@ -44,7 +44,7 @@ def make_senter(nlp: Language, name: str, model: Model):
|
||||||
class SentenceRecognizer(Tagger):
|
class SentenceRecognizer(Tagger):
|
||||||
"""Pipeline component for sentence segmentation.
|
"""Pipeline component for sentence segmentation.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="senter"):
|
def __init__(self, vocab, model, name="senter"):
|
||||||
"""Initialize a sentence recognizer.
|
"""Initialize a sentence recognizer.
|
||||||
|
@ -54,7 +54,7 @@ class SentenceRecognizer(Tagger):
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#init
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -76,7 +76,7 @@ class SentenceRecognizer(Tagger):
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
|
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#set_annotations
|
||||||
"""
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
@ -101,7 +101,7 @@ class SentenceRecognizer(Tagger):
|
||||||
scores: Scores representing the model's predictions.
|
scores: Scores representing the model's predictions.
|
||||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "SentenceRecognizer.get_loss")
|
validate_examples(examples, "SentenceRecognizer.get_loss")
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
|
@ -135,7 +135,7 @@ class SentenceRecognizer(Tagger):
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#begin_training
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
|
||||||
"""
|
"""
|
||||||
self.set_output(len(self.labels))
|
self.set_output(len(self.labels))
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
|
@ -151,7 +151,7 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
examples (Iterable[Example]): The examples to score.
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#score
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "SentenceRecognizer.score")
|
validate_examples(examples, "SentenceRecognizer.score")
|
||||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||||
|
@ -164,7 +164,7 @@ class SentenceRecognizer(Tagger):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#to_bytes
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
|
@ -179,7 +179,7 @@ class SentenceRecognizer(Tagger):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Tagger): The loaded SentenceRecognizer.
|
RETURNS (Tagger): The loaded SentenceRecognizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#from_bytes
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
|
||||||
"""
|
"""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
|
@ -201,7 +201,7 @@ class SentenceRecognizer(Tagger):
|
||||||
path (str / Path): Path to a directory.
|
path (str / Path): Path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#to_disk
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
|
@ -217,7 +217,7 @@ class SentenceRecognizer(Tagger):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Tagger): The modified SentenceRecognizer object.
|
RETURNS (Tagger): The modified SentenceRecognizer object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#from_disk
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
|
||||||
"""
|
"""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
|
|
|
@ -78,7 +78,7 @@ class SimpleNER(Pipe):
|
||||||
def add_label(self, label: str) -> None:
|
def add_label(self, label: str) -> None:
|
||||||
"""Add a new label to the pipe.
|
"""Add a new label to the pipe.
|
||||||
label (str): The label to add.
|
label (str): The label to add.
|
||||||
DOCS: https://spacy.io/api/simplener#add_label
|
DOCS: https://nightly.spacy.io/api/simplener#add_label
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
|
|
|
@ -58,7 +58,7 @@ def make_tagger(nlp: Language, name: str, model: Model):
|
||||||
class Tagger(Pipe):
|
class Tagger(Pipe):
|
||||||
"""Pipeline component for part-of-speech tagging.
|
"""Pipeline component for part-of-speech tagging.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger
|
DOCS: https://nightly.spacy.io/api/tagger
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="tagger", *, labels=None):
|
def __init__(self, vocab, model, name="tagger", *, labels=None):
|
||||||
"""Initialize a part-of-speech tagger.
|
"""Initialize a part-of-speech tagger.
|
||||||
|
@ -69,7 +69,7 @@ class Tagger(Pipe):
|
||||||
losses during training.
|
losses during training.
|
||||||
labels (List): The set of labels. Defaults to None.
|
labels (List): The set of labels. Defaults to None.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#init
|
DOCS: https://nightly.spacy.io/api/tagger#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -86,7 +86,7 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
RETURNS (Tuple[str]): The labels.
|
RETURNS (Tuple[str]): The labels.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#labels
|
DOCS: https://nightly.spacy.io/api/tagger#labels
|
||||||
"""
|
"""
|
||||||
return tuple(self.cfg["labels"])
|
return tuple(self.cfg["labels"])
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ class Tagger(Pipe):
|
||||||
doc (Doc): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#call
|
DOCS: https://nightly.spacy.io/api/tagger#call
|
||||||
"""
|
"""
|
||||||
tags = self.predict([doc])
|
tags = self.predict([doc])
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
|
@ -111,7 +111,7 @@ class Tagger(Pipe):
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#pipe
|
DOCS: https://nightly.spacy.io/api/tagger#pipe
|
||||||
"""
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
tag_ids = self.predict(docs)
|
tag_ids = self.predict(docs)
|
||||||
|
@ -124,7 +124,7 @@ class Tagger(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to predict.
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
RETURNS: The model's prediction for each document.
|
RETURNS: The model's prediction for each document.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#predict
|
DOCS: https://nightly.spacy.io/api/tagger#predict
|
||||||
"""
|
"""
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
@ -153,7 +153,7 @@ class Tagger(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#set_annotations
|
DOCS: https://nightly.spacy.io/api/tagger#set_annotations
|
||||||
"""
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
@ -182,7 +182,7 @@ class Tagger(Pipe):
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#update
|
DOCS: https://nightly.spacy.io/api/tagger#update
|
||||||
"""
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -220,7 +220,7 @@ class Tagger(Pipe):
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#rehearse
|
DOCS: https://nightly.spacy.io/api/tagger#rehearse
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Tagger.rehearse")
|
validate_examples(examples, "Tagger.rehearse")
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
|
@ -247,7 +247,7 @@ class Tagger(Pipe):
|
||||||
scores: Scores representing the model's predictions.
|
scores: Scores representing the model's predictions.
|
||||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#get_loss
|
DOCS: https://nightly.spacy.io/api/tagger#get_loss
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Tagger.get_loss")
|
validate_examples(examples, "Tagger.get_loss")
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
|
@ -269,7 +269,7 @@ class Tagger(Pipe):
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#begin_training
|
DOCS: https://nightly.spacy.io/api/tagger#begin_training
|
||||||
"""
|
"""
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
|
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
|
||||||
|
@ -307,7 +307,7 @@ class Tagger(Pipe):
|
||||||
label (str): The label to add.
|
label (str): The label to add.
|
||||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#add_label
|
DOCS: https://nightly.spacy.io/api/tagger#add_label
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
|
@ -324,7 +324,7 @@ class Tagger(Pipe):
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by
|
RETURNS (Dict[str, Any]): The scores, produced by
|
||||||
Scorer.score_token_attr for the attribute "tag".
|
Scorer.score_token_attr for the attribute "tag".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#score
|
DOCS: https://nightly.spacy.io/api/tagger#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Tagger.score")
|
validate_examples(examples, "Tagger.score")
|
||||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||||
|
@ -335,7 +335,7 @@ class Tagger(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#to_bytes
|
DOCS: https://nightly.spacy.io/api/tagger#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
|
@ -350,7 +350,7 @@ class Tagger(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Tagger): The loaded Tagger.
|
RETURNS (Tagger): The loaded Tagger.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#from_bytes
|
DOCS: https://nightly.spacy.io/api/tagger#from_bytes
|
||||||
"""
|
"""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
|
@ -372,7 +372,7 @@ class Tagger(Pipe):
|
||||||
path (str / Path): Path to a directory.
|
path (str / Path): Path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#to_disk
|
DOCS: https://nightly.spacy.io/api/tagger#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
|
@ -388,7 +388,7 @@ class Tagger(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (Tagger): The modified Tagger object.
|
RETURNS (Tagger): The modified Tagger object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#from_disk
|
DOCS: https://nightly.spacy.io/api/tagger#from_disk
|
||||||
"""
|
"""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
|
|
|
@ -92,7 +92,7 @@ def make_textcat(
|
||||||
class TextCategorizer(Pipe):
|
class TextCategorizer(Pipe):
|
||||||
"""Pipeline component for text classification.
|
"""Pipeline component for text classification.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer
|
DOCS: https://nightly.spacy.io/api/textcategorizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -111,7 +111,7 @@ class TextCategorizer(Pipe):
|
||||||
losses during training.
|
losses during training.
|
||||||
labels (Iterable[str]): The labels to use.
|
labels (Iterable[str]): The labels to use.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#init
|
DOCS: https://nightly.spacy.io/api/textcategorizer#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -124,7 +124,7 @@ class TextCategorizer(Pipe):
|
||||||
def labels(self) -> Tuple[str]:
|
def labels(self) -> Tuple[str]:
|
||||||
"""RETURNS (Tuple[str]): The labels currently added to the component.
|
"""RETURNS (Tuple[str]): The labels currently added to the component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#labels
|
DOCS: https://nightly.spacy.io/api/textcategorizer#labels
|
||||||
"""
|
"""
|
||||||
return tuple(self.cfg.setdefault("labels", []))
|
return tuple(self.cfg.setdefault("labels", []))
|
||||||
|
|
||||||
|
@ -146,7 +146,7 @@ class TextCategorizer(Pipe):
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#pipe
|
DOCS: https://nightly.spacy.io/api/textcategorizer#pipe
|
||||||
"""
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
scores = self.predict(docs)
|
scores = self.predict(docs)
|
||||||
|
@ -159,7 +159,7 @@ class TextCategorizer(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to predict.
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
RETURNS: The model's prediction for each document.
|
RETURNS: The model's prediction for each document.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#predict
|
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
|
||||||
"""
|
"""
|
||||||
tensors = [doc.tensor for doc in docs]
|
tensors = [doc.tensor for doc in docs]
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
|
@ -177,7 +177,7 @@ class TextCategorizer(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
scores: The scores to set, produced by TextCategorizer.predict.
|
scores: The scores to set, produced by TextCategorizer.predict.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#set_annotations
|
DOCS: https://nightly.spacy.io/api/textcategorizer#set_annotations
|
||||||
"""
|
"""
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
for j, label in enumerate(self.labels):
|
for j, label in enumerate(self.labels):
|
||||||
|
@ -204,7 +204,7 @@ class TextCategorizer(Pipe):
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#update
|
DOCS: https://nightly.spacy.io/api/textcategorizer#update
|
||||||
"""
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -245,7 +245,7 @@ class TextCategorizer(Pipe):
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#rehearse
|
DOCS: https://nightly.spacy.io/api/textcategorizer#rehearse
|
||||||
"""
|
"""
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
|
@ -289,7 +289,7 @@ class TextCategorizer(Pipe):
|
||||||
scores: Scores representing the model's predictions.
|
scores: Scores representing the model's predictions.
|
||||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#get_loss
|
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "TextCategorizer.get_loss")
|
validate_examples(examples, "TextCategorizer.get_loss")
|
||||||
truths, not_missing = self._examples_to_truth(examples)
|
truths, not_missing = self._examples_to_truth(examples)
|
||||||
|
@ -305,7 +305,7 @@ class TextCategorizer(Pipe):
|
||||||
label (str): The label to add.
|
label (str): The label to add.
|
||||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#add_label
|
DOCS: https://nightly.spacy.io/api/textcategorizer#add_label
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
|
@ -343,7 +343,7 @@ class TextCategorizer(Pipe):
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#begin_training
|
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
|
||||||
"""
|
"""
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
|
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
|
||||||
|
@ -378,7 +378,7 @@ class TextCategorizer(Pipe):
|
||||||
positive_label (str): Optional positive label.
|
positive_label (str): Optional positive label.
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#score
|
DOCS: https://nightly.spacy.io/api/textcategorizer#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "TextCategorizer.score")
|
validate_examples(examples, "TextCategorizer.score")
|
||||||
return Scorer.score_cats(
|
return Scorer.score_cats(
|
||||||
|
|
|
@ -56,7 +56,7 @@ class Tok2Vec(Pipe):
|
||||||
a list of Doc objects as input, and output a list of 2d float arrays.
|
a list of Doc objects as input, and output a list of 2d float arrays.
|
||||||
name (str): The component instance name.
|
name (str): The component instance name.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#init
|
DOCS: https://nightly.spacy.io/api/tok2vec#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -91,7 +91,7 @@ class Tok2Vec(Pipe):
|
||||||
doc (Doc): The Doc to process.
|
doc (Doc): The Doc to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#call
|
DOCS: https://nightly.spacy.io/api/tok2vec#call
|
||||||
"""
|
"""
|
||||||
tokvecses = self.predict([doc])
|
tokvecses = self.predict([doc])
|
||||||
self.set_annotations([doc], tokvecses)
|
self.set_annotations([doc], tokvecses)
|
||||||
|
@ -106,7 +106,7 @@ class Tok2Vec(Pipe):
|
||||||
batch_size (int): The number of documents to buffer.
|
batch_size (int): The number of documents to buffer.
|
||||||
YIELDS (Doc): Processed documents in order.
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#pipe
|
DOCS: https://nightly.spacy.io/api/tok2vec#pipe
|
||||||
"""
|
"""
|
||||||
for docs in minibatch(stream, batch_size):
|
for docs in minibatch(stream, batch_size):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
|
@ -121,7 +121,7 @@ class Tok2Vec(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to predict.
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
RETURNS: Vector representations for each token in the documents.
|
RETURNS: Vector representations for each token in the documents.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#predict
|
DOCS: https://nightly.spacy.io/api/tok2vec#predict
|
||||||
"""
|
"""
|
||||||
tokvecs = self.model.predict(docs)
|
tokvecs = self.model.predict(docs)
|
||||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||||
|
@ -135,7 +135,7 @@ class Tok2Vec(Pipe):
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
tokvecses: The tensors to set, produced by Tok2Vec.predict.
|
tokvecses: The tensors to set, produced by Tok2Vec.predict.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#set_annotations
|
DOCS: https://nightly.spacy.io/api/tok2vec#set_annotations
|
||||||
"""
|
"""
|
||||||
for doc, tokvecs in zip(docs, tokvecses):
|
for doc, tokvecs in zip(docs, tokvecses):
|
||||||
assert tokvecs.shape[0] == len(doc)
|
assert tokvecs.shape[0] == len(doc)
|
||||||
|
@ -162,7 +162,7 @@ class Tok2Vec(Pipe):
|
||||||
Updated using the component name as the key.
|
Updated using the component name as the key.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#update
|
DOCS: https://nightly.spacy.io/api/tok2vec#update
|
||||||
"""
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -220,7 +220,7 @@ class Tok2Vec(Pipe):
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#begin_training
|
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
|
||||||
"""
|
"""
|
||||||
docs = [Doc(self.vocab, words=["hello"])]
|
docs = [Doc(self.vocab, words=["hello"])]
|
||||||
self.model.initialize(X=docs)
|
self.model.initialize(X=docs)
|
||||||
|
|
|
@ -57,12 +57,13 @@ def validate_token_pattern(obj: list) -> List[str]:
|
||||||
|
|
||||||
|
|
||||||
class TokenPatternString(BaseModel):
|
class TokenPatternString(BaseModel):
|
||||||
REGEX: Optional[StrictStr]
|
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||||
IN: Optional[List[StrictStr]]
|
IN: Optional[List[StrictStr]] = Field(None, alias="in")
|
||||||
NOT_IN: Optional[List[StrictStr]]
|
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
allow_population_by_field_name = True # allow alias and field name
|
||||||
|
|
||||||
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
||||||
def raise_for_none(cls, v):
|
def raise_for_none(cls, v):
|
||||||
|
@ -72,9 +73,9 @@ class TokenPatternString(BaseModel):
|
||||||
|
|
||||||
|
|
||||||
class TokenPatternNumber(BaseModel):
|
class TokenPatternNumber(BaseModel):
|
||||||
REGEX: Optional[StrictStr] = None
|
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||||
IN: Optional[List[StrictInt]] = None
|
IN: Optional[List[StrictInt]] = Field(None, alias="in")
|
||||||
NOT_IN: Optional[List[StrictInt]] = None
|
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
|
||||||
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
||||||
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
||||||
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
||||||
|
@ -84,6 +85,7 @@ class TokenPatternNumber(BaseModel):
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
allow_population_by_field_name = True # allow alias and field name
|
||||||
|
|
||||||
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
||||||
def raise_for_none(cls, v):
|
def raise_for_none(cls, v):
|
||||||
|
|
|
@ -85,7 +85,7 @@ class Scorer:
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the Scorer.
|
"""Initialize the Scorer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#init
|
DOCS: https://nightly.spacy.io/api/scorer#init
|
||||||
"""
|
"""
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
|
@ -101,7 +101,7 @@ class Scorer:
|
||||||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||||
RETURNS (Dict): A dictionary of scores.
|
RETURNS (Dict): A dictionary of scores.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score
|
DOCS: https://nightly.spacy.io/api/scorer#score
|
||||||
"""
|
"""
|
||||||
scores = {}
|
scores = {}
|
||||||
if hasattr(self.nlp.tokenizer, "score"):
|
if hasattr(self.nlp.tokenizer, "score"):
|
||||||
|
@ -121,7 +121,7 @@ class Scorer:
|
||||||
RETURNS (Dict[str, float]): A dictionary containing the scores
|
RETURNS (Dict[str, float]): A dictionary containing the scores
|
||||||
token_acc/p/r/f.
|
token_acc/p/r/f.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score_tokenization
|
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
|
||||||
"""
|
"""
|
||||||
acc_score = PRFScore()
|
acc_score = PRFScore()
|
||||||
prf_score = PRFScore()
|
prf_score = PRFScore()
|
||||||
|
@ -169,7 +169,7 @@ class Scorer:
|
||||||
RETURNS (Dict[str, float]): A dictionary containing the accuracy score
|
RETURNS (Dict[str, float]): A dictionary containing the accuracy score
|
||||||
under the key attr_acc.
|
under the key attr_acc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score_token_attr
|
DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
|
||||||
"""
|
"""
|
||||||
tag_score = PRFScore()
|
tag_score = PRFScore()
|
||||||
for example in examples:
|
for example in examples:
|
||||||
|
@ -263,7 +263,7 @@ class Scorer:
|
||||||
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
|
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
|
||||||
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
|
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score_spans
|
DOCS: https://nightly.spacy.io/api/scorer#score_spans
|
||||||
"""
|
"""
|
||||||
score = PRFScore()
|
score = PRFScore()
|
||||||
score_per_type = dict()
|
score_per_type = dict()
|
||||||
|
@ -350,7 +350,7 @@ class Scorer:
|
||||||
attr_f_per_type,
|
attr_f_per_type,
|
||||||
attr_auc_per_type
|
attr_auc_per_type
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score_cats
|
DOCS: https://nightly.spacy.io/api/scorer#score_cats
|
||||||
"""
|
"""
|
||||||
if threshold is None:
|
if threshold is None:
|
||||||
threshold = 0.5 if multi_label else 0.0
|
threshold = 0.5 if multi_label else 0.0
|
||||||
|
@ -467,7 +467,7 @@ class Scorer:
|
||||||
RETURNS (Dict[str, Any]): A dictionary containing the scores:
|
RETURNS (Dict[str, Any]): A dictionary containing the scores:
|
||||||
attr_uas, attr_las, and attr_las_per_type.
|
attr_uas, attr_las, and attr_las_per_type.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score_deps
|
DOCS: https://nightly.spacy.io/api/scorer#score_deps
|
||||||
"""
|
"""
|
||||||
unlabelled = PRFScore()
|
unlabelled = PRFScore()
|
||||||
labelled = PRFScore()
|
labelled = PRFScore()
|
||||||
|
|
|
@ -91,7 +91,7 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
"""Look up strings by 64-bit hashes.
|
"""Look up strings by 64-bit hashes.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/stringstore
|
DOCS: https://nightly.spacy.io/api/stringstore
|
||||||
"""
|
"""
|
||||||
def __init__(self, strings=None, freeze=False):
|
def __init__(self, strings=None, freeze=False):
|
||||||
"""Create the StringStore.
|
"""Create the StringStore.
|
||||||
|
|
|
@ -44,6 +44,11 @@ def ca_tokenizer():
|
||||||
return get_lang_class("ca")().tokenizer
|
return get_lang_class("ca")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def cs_tokenizer():
|
||||||
|
return get_lang_class("cs")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def da_tokenizer():
|
def da_tokenizer():
|
||||||
return get_lang_class("da")().tokenizer
|
return get_lang_class("da")().tokenizer
|
||||||
|
@ -204,6 +209,11 @@ def ru_lemmatizer():
|
||||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def sa_tokenizer():
|
||||||
|
return get_lang_class("sa")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def sr_tokenizer():
|
def sr_tokenizer():
|
||||||
return get_lang_class("sr")().tokenizer
|
return get_lang_class("sr")().tokenizer
|
||||||
|
|
|
@ -317,7 +317,8 @@ def test_doc_from_array_morph(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
|
||||||
|
en_texts_without_empty = [t for t in en_texts if len(t)]
|
||||||
de_text = "Wie war die Frage?"
|
de_text = "Wie war die Frage?"
|
||||||
en_docs = [en_tokenizer(text) for text in en_texts]
|
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||||
docs_idx = en_texts[0].index("docs")
|
docs_idx = en_texts[0].index("docs")
|
||||||
|
@ -338,14 +339,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
Doc.from_docs(en_docs + [de_doc])
|
Doc.from_docs(en_docs + [de_doc])
|
||||||
|
|
||||||
m_doc = Doc.from_docs(en_docs)
|
m_doc = Doc.from_docs(en_docs)
|
||||||
assert len(en_docs) == len(list(m_doc.sents))
|
assert len(en_texts_without_empty) == len(list(m_doc.sents))
|
||||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||||
assert str(m_doc) == " ".join(en_texts)
|
assert str(m_doc) == " ".join(en_texts_without_empty)
|
||||||
p_token = m_doc[len(en_docs[0]) - 1]
|
p_token = m_doc[len(en_docs[0]) - 1]
|
||||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||||
assert len(m_doc) == len(en_docs_tokens)
|
assert len(m_doc) == len(en_docs_tokens)
|
||||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
|
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
|
||||||
assert m_doc[9].idx == think_idx
|
assert m_doc[9].idx == think_idx
|
||||||
with pytest.raises(AttributeError):
|
with pytest.raises(AttributeError):
|
||||||
# not callable, because it was not set via set_extension
|
# not callable, because it was not set via set_extension
|
||||||
|
@ -353,14 +354,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
|
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
|
||||||
|
|
||||||
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
|
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
|
||||||
assert len(en_docs) == len(list(m_doc.sents))
|
assert len(en_texts_without_empty) == len(list(m_doc.sents))
|
||||||
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
|
assert len(str(m_doc)) == sum(len(t) for t in en_texts)
|
||||||
assert str(m_doc) == "".join(en_texts)
|
assert str(m_doc) == "".join(en_texts)
|
||||||
p_token = m_doc[len(en_docs[0]) - 1]
|
p_token = m_doc[len(en_docs[0]) - 1]
|
||||||
assert p_token.text == "." and not bool(p_token.whitespace_)
|
assert p_token.text == "." and not bool(p_token.whitespace_)
|
||||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||||
assert len(m_doc) == len(en_docs_tokens)
|
assert len(m_doc) == len(en_docs_tokens)
|
||||||
think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
|
think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
|
||||||
assert m_doc[9].idx == think_idx
|
assert m_doc[9].idx == think_idx
|
||||||
|
|
||||||
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
|
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
|
||||||
|
@ -369,12 +370,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
assert list(m_doc.sents)
|
assert list(m_doc.sents)
|
||||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||||
# space delimiter considered, although spacy attribute was missing
|
# space delimiter considered, although spacy attribute was missing
|
||||||
assert str(m_doc) == " ".join(en_texts)
|
assert str(m_doc) == " ".join(en_texts_without_empty)
|
||||||
p_token = m_doc[len(en_docs[0]) - 1]
|
p_token = m_doc[len(en_docs[0]) - 1]
|
||||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||||
assert len(m_doc) == len(en_docs_tokens)
|
assert len(m_doc) == len(en_docs_tokens)
|
||||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
|
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
|
||||||
assert m_doc[9].idx == think_idx
|
assert m_doc[9].idx == think_idx
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -162,11 +162,36 @@ def test_spans_are_hashable(en_tokenizer):
|
||||||
|
|
||||||
def test_spans_by_character(doc):
|
def test_spans_by_character(doc):
|
||||||
span1 = doc[1:-2]
|
span1 = doc[1:-2]
|
||||||
|
|
||||||
|
# default and specified alignment mode "strict"
|
||||||
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
|
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
|
||||||
assert span1.start_char == span2.start_char
|
assert span1.start_char == span2.start_char
|
||||||
assert span1.end_char == span2.end_char
|
assert span1.end_char == span2.end_char
|
||||||
assert span2.label_ == "GPE"
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
|
span2 = doc.char_span(
|
||||||
|
span1.start_char, span1.end_char, label="GPE", alignment_mode="strict"
|
||||||
|
)
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
|
# alignment mode "contract"
|
||||||
|
span2 = doc.char_span(
|
||||||
|
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||||
|
)
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
|
# alignment mode "expand"
|
||||||
|
span2 = doc.char_span(
|
||||||
|
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand"
|
||||||
|
)
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
|
|
||||||
def test_span_to_array(doc):
|
def test_span_to_array(doc):
|
||||||
span = doc[1:-2]
|
span = doc[1:-2]
|
||||||
|
|
0
spacy/tests/lang/cs/__init__.py
Normal file
0
spacy/tests/lang/cs/__init__.py
Normal file
23
spacy/tests/lang/cs/test_text.py
Normal file
23
spacy/tests/lang/cs/test_text.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,match",
|
||||||
|
[
|
||||||
|
("10", True),
|
||||||
|
("1", True),
|
||||||
|
("10.000", True),
|
||||||
|
("1000", True),
|
||||||
|
("999,0", True),
|
||||||
|
("devatenáct", True),
|
||||||
|
("osmdesát", True),
|
||||||
|
("kvadrilion", True),
|
||||||
|
("Pes", False),
|
||||||
|
(",", False),
|
||||||
|
("1/2", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_lex_attrs_like_number(cs_tokenizer, text, match):
|
||||||
|
tokens = cs_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
assert tokens[0].like_num == match
|
|
@ -56,6 +56,11 @@ def test_lex_attrs_like_number(en_tokenizer, text, match):
|
||||||
assert tokens[0].like_num == match
|
assert tokens[0].like_num == match
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["third", "Millionth", "100th", "Hundredth"])
|
||||||
|
def test_en_lex_attrs_like_number_for_ordinal(word):
|
||||||
|
assert like_num(word)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("word", ["eleven"])
|
@pytest.mark.parametrize("word", ["eleven"])
|
||||||
def test_en_lex_attrs_capitals(word):
|
def test_en_lex_attrs_capitals(word):
|
||||||
assert like_num(word)
|
assert like_num(word)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
from spacy.lang.he.lex_attrs import like_num
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
@ -39,3 +40,30 @@ def test_he_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens):
|
||||||
def test_he_tokenizer_handles_punct(he_tokenizer, text, expected_tokens):
|
def test_he_tokenizer_handles_punct(he_tokenizer, text, expected_tokens):
|
||||||
tokens = he_tokenizer(text)
|
tokens = he_tokenizer(text)
|
||||||
assert expected_tokens == [token.text for token in tokens]
|
assert expected_tokens == [token.text for token in tokens]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,match",
|
||||||
|
[
|
||||||
|
("10", True),
|
||||||
|
("1", True),
|
||||||
|
("10,000", True),
|
||||||
|
("10,00", True),
|
||||||
|
("999.0", True),
|
||||||
|
("אחד", True),
|
||||||
|
("שתיים", True),
|
||||||
|
("מליון", True),
|
||||||
|
("כלב", False),
|
||||||
|
(",", False),
|
||||||
|
("1/2", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_lex_attrs_like_number(he_tokenizer, text, match):
|
||||||
|
tokens = he_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
assert tokens[0].like_num == match
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["שלישי", "מליון", "עשירי", "מאה", "עשר", "אחד עשר"])
|
||||||
|
def test_he_lex_attrs_like_number_for_ordinal(word):
|
||||||
|
assert like_num(word)
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user