mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge remote-tracking branch 'upstream/master' into bugfix/revert-token-match
This commit is contained in:
commit
792c8af8cf
106
.github/contributors/Baciccin.md
vendored
Normal file
106
.github/contributors/Baciccin.md
vendored
Normal file
|
@@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Giovanni Battista Parodi |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-19 |
|
||||||
|
| GitHub username | Baciccin |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/MiniLau.md
vendored
Normal file
106
.github/contributors/MiniLau.md
vendored
Normal file
|
@@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Desausoi Laurent |
|
||||||
|
| Company name (if applicable) | / |
|
||||||
|
| Title or role (if applicable) | / |
|
||||||
|
| Date | 22 November 2019 |
|
||||||
|
| GitHub username | MiniLau |
|
||||||
|
| Website (optional) | / |
|
106
.github/contributors/Mlawrence95.md
vendored
Normal file
106
.github/contributors/Mlawrence95.md
vendored
Normal file
|
@@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Mike Lawrence |
|
||||||
|
| Company name (if applicable) | NA |
|
||||||
|
| Title or role (if applicable) | NA |
|
||||||
|
| Date | April 17, 2020 |
|
||||||
|
| GitHub username | Mlawrence95 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/Tiljander.md
vendored
Normal file
106
.github/contributors/Tiljander.md
vendored
Normal file
|
@@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Henrik Tiljander |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 24/3/2020 |
|
||||||
|
| GitHub username | Tiljander |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/YohannesDatasci.md
vendored
Normal file
106
.github/contributors/YohannesDatasci.md
vendored
Normal file
|
@@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Yohannes |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-02 |
|
||||||
|
| GitHub username | YohannesDatasci |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/chopeen.md
vendored
Normal file
106
.github/contributors/chopeen.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Marek Grzenkowicz |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020.04.10 |
|
||||||
|
| GitHub username | chopeen |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/elben10
vendored
Normal file
106
.github/contributors/elben10
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Jakob Jul Elben |
|
||||||
|
| Company name (if applicable) | N/A |
|
||||||
|
| Title or role (if applicable) | N/A |
|
||||||
|
| Date | April 16th, 2020 |
|
||||||
|
| GitHub username | elben10 |
|
||||||
|
| Website (optional) | N/A |
|
106
.github/contributors/guerda.md
vendored
Normal file
106
.github/contributors/guerda.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Philip Gillißen |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-24 |
|
||||||
|
| GitHub username | guerda |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/jacse.md
vendored
Normal file
106
.github/contributors/jacse.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Jacob Lauritzen |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-30 |
|
||||||
|
| GitHub username | jacse |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/koaning.md
vendored
Normal file
106
.github/contributors/koaning.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Vincent D. Warmerdam |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | Data Person |
|
||||||
|
| Date | 2020-03-01 |
|
||||||
|
| GitHub username | koaning |
|
||||||
|
| Website (optional) | https://koaning.io |
|
106
.github/contributors/laszabine.md
vendored
Normal file
106
.github/contributors/laszabine.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Sabine Laszakovits |
|
||||||
|
| Company name (if applicable) | Austrian Academy of Sciences |
|
||||||
|
| Title or role (if applicable) | Data analyst |
|
||||||
|
| Date | 2020-04-16 |
|
||||||
|
| GitHub username | laszabine |
|
||||||
|
| Website (optional) | https://sabine.laszakovits.net |
|
106
.github/contributors/leicmi.md
vendored
Normal file
106
.github/contributors/leicmi.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Michael Leichtfried |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 30.03.2020 |
|
||||||
|
| GitHub username | leicmi |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/louisguitton.md
vendored
Normal file
106
.github/contributors/louisguitton.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Louis Guitton |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-25 |
|
||||||
|
| GitHub username | louisguitton |
|
||||||
|
| Website (optional) | https://guitton.co/ |
|
106
.github/contributors/merrcury.md
vendored
Normal file
106
.github/contributors/merrcury.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Himanshu Garg |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-10 |
|
||||||
|
| GitHub username | merrcury |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/michael-k.md
vendored
Normal file
106
.github/contributors/michael-k.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Michael Käufl |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-23 |
|
||||||
|
| GitHub username | michael-k |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/nikhilsaldanha.md
vendored
Normal file
106
.github/contributors/nikhilsaldanha.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Nikhil Saldanha |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-17 |
|
||||||
|
| GitHub username | nikhilsaldanha |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/paoloq.md
vendored
Normal file
106
.github/contributors/paoloq.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Paolo Arduin |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 9 April 2020 |
|
||||||
|
| GitHub username | paoloq |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/pinealan.md
vendored
Normal file
106
.github/contributors/pinealan.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Alan Chan |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-15 |
|
||||||
|
| GitHub username | pinealan |
|
||||||
|
| Website (optional) | http://pinealan.xyz |
|
107
.github/contributors/punitvara.md
vendored
Normal file
107
.github/contributors/punitvara.md
vendored
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Punit Vara |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-26 |
|
||||||
|
| GitHub username | punitvara |
|
||||||
|
| Website (optional) | https://punitvara.com |
|
||||||
|
|
106
.github/contributors/sabiqueqb.md
vendored
Normal file
106
.github/contributors/sabiqueqb.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Sabique Ahammed Lava |
|
||||||
|
| Company name (if applicable) | QBurst |
|
||||||
|
| Title or role (if applicable) | Senior Engineer |
|
||||||
|
| Date | 24 Apr 2020 |
|
||||||
|
| GitHub username | sabiqueqb |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/sebastienharinck.md
vendored
Normal file
106
.github/contributors/sebastienharinck.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------------------------------- |
|
||||||
|
| Name | Sébastien Harinck |
|
||||||
|
| Company name (if applicable) | Odaxiom |
|
||||||
|
| Title or role (if applicable) | ML Engineer |
|
||||||
|
| Date | 2020-04-15 |
|
||||||
|
| GitHub username | sebastienharinck |
|
||||||
|
| Website (optional) | [https://odaxiom.com](https://odaxiom.com) |
|
106
.github/contributors/sloev.md
vendored
Normal file
106
.github/contributors/sloev.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Johannes Valbjørn |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-13 |
|
||||||
|
| GitHub username | sloev |
|
||||||
|
| Website (optional) | https://sloev.github.io |
|
106
.github/contributors/thomasthiebaud.md
vendored
Normal file
106
.github/contributors/thomasthiebaud.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
- Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
- to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
- each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
| ----------------------------- | --------------- |
|
||||||
|
| Name | Thomas Thiebaud |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-07 |
|
||||||
|
| GitHub username | thomasthiebaud |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/tommilligan.md
vendored
Normal file
106
.github/contributors/tommilligan.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
- Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
- to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
- each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
| ----------------------------- | ------------ |
|
||||||
|
| Name | Tom Milligan |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-24 |
|
||||||
|
| GitHub username | tommilligan |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/umarbutler.md
vendored
Normal file
106
.github/contributors/umarbutler.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Umar Butler |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-09 |
|
||||||
|
| GitHub username | umarbutler |
|
||||||
|
| Website (optional) | https://umarbutler.com |
|
106
.github/contributors/vondersam.md
vendored
Normal file
106
.github/contributors/vondersam.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------|
|
||||||
|
| Name | Samuel Rodríguez Medina |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | Computational linguist |
|
||||||
|
| Date | 28 April 2020 |
|
||||||
|
| GitHub username | vondersam |
|
||||||
|
| Website (optional) | |
|
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -5,6 +5,11 @@ corpora/
|
||||||
keys/
|
keys/
|
||||||
*.json.gz
|
*.json.gz
|
||||||
|
|
||||||
|
# Tests
|
||||||
|
spacy/tests/package/setup.cfg
|
||||||
|
spacy/tests/package/pyproject.toml
|
||||||
|
spacy/tests/package/requirements.txt
|
||||||
|
|
||||||
# Website
|
# Website
|
||||||
website/.cache/
|
website/.cache/
|
||||||
website/public/
|
website/public/
|
||||||
|
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
||||||
The MIT License (MIT)
|
The MIT License (MIT)
|
||||||
|
|
||||||
Copyright (C) 2016-2019 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
|
Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
@ -48,7 +48,7 @@ jobs:
|
||||||
imageName: 'vs2017-win2016'
|
imageName: 'vs2017-win2016'
|
||||||
python.version: '3.6'
|
python.version: '3.6'
|
||||||
Python36Mac:
|
Python36Mac:
|
||||||
imageName: 'macos-10.13'
|
imageName: 'macos-10.14'
|
||||||
python.version: '3.6'
|
python.version: '3.6'
|
||||||
# Don't test on 3.7 for now to speed up builds
|
# Don't test on 3.7 for now to speed up builds
|
||||||
# Python37Linux:
|
# Python37Linux:
|
||||||
|
@ -67,7 +67,7 @@ jobs:
|
||||||
imageName: 'vs2017-win2016'
|
imageName: 'vs2017-win2016'
|
||||||
python.version: '3.8'
|
python.version: '3.8'
|
||||||
Python38Mac:
|
Python38Mac:
|
||||||
imageName: 'macos-10.13'
|
imageName: 'macos-10.14'
|
||||||
python.version: '3.8'
|
python.version: '3.8'
|
||||||
maxParallel: 4
|
maxParallel: 4
|
||||||
pool:
|
pool:
|
||||||
|
|
|
@ -1,37 +0,0 @@
|
||||||
## Entity Linking with Wikipedia and Wikidata
|
|
||||||
|
|
||||||
### Step 1: Create a Knowledge Base (KB) and training data
|
|
||||||
|
|
||||||
Run `wikidata_pretrain_kb.py`
|
|
||||||
* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
|
|
||||||
* WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
|
|
||||||
* Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)
|
|
||||||
* You can set the filtering parameters for KB construction:
|
|
||||||
* `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym
|
|
||||||
* `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB
|
|
||||||
* `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB
|
|
||||||
* Further parameters to set:
|
|
||||||
* `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`)
|
|
||||||
* `entity_vector_length` (`-v`): length of the pre-trained entity description vectors
|
|
||||||
* `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages)
|
|
||||||
|
|
||||||
Quick testing and rerunning:
|
|
||||||
* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything.
|
|
||||||
* e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1`
|
|
||||||
* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
|
|
||||||
|
|
||||||
|
|
||||||
### Step 2: Train an Entity Linking model
|
|
||||||
|
|
||||||
Run `wikidata_train_entity_linker.py`
|
|
||||||
* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model**
|
|
||||||
* Specify the output directory (`-o`) in which the final, trained model will be saved
|
|
||||||
* You can set the learning parameters for the EL training:
|
|
||||||
* `epochs` (`-e`): number of training iterations
|
|
||||||
* `dropout` (`-p`): dropout rate
|
|
||||||
* `lr` (`-n`): learning rate
|
|
||||||
* `l2` (`-r`): L2 regularization
|
|
||||||
* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively
|
|
||||||
* If not specified, the full dataset will be processed - this may take a LONG time!
|
|
||||||
* Further parameters to set:
|
|
||||||
* `labels_discard` (`-l`): NER label types to discard during training
|
|
|
@ -1,12 +0,0 @@
|
||||||
TRAINING_DATA_FILE = "gold_entities.jsonl"
|
|
||||||
KB_FILE = "kb"
|
|
||||||
KB_MODEL_DIR = "nlp_kb"
|
|
||||||
OUTPUT_MODEL_DIR = "nlp"
|
|
||||||
|
|
||||||
PRIOR_PROB_PATH = "prior_prob.csv"
|
|
||||||
ENTITY_DEFS_PATH = "entity_defs.csv"
|
|
||||||
ENTITY_FREQ_PATH = "entity_freq.csv"
|
|
||||||
ENTITY_ALIAS_PATH = "entity_alias.csv"
|
|
||||||
ENTITY_DESCR_PATH = "entity_descriptions.csv"
|
|
||||||
|
|
||||||
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
|
|
|
@ -1,204 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import random
|
|
||||||
from tqdm import tqdm
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class Metrics(object):
    """Running true-positive / false-positive / false-negative counters.

    One instance accumulates the outcome of many (gold entity, predicted
    candidate) comparisons and derives precision, recall and F-score from them.
    """

    # Class-level zeros are the starting value of every instance's counters:
    # the first `+=` on an instance creates the instance attribute.
    true_pos = 0
    false_pos = 0
    false_neg = 0

    def update_results(self, true_entity, candidate):
        """Record one prediction.

        true_entity: the gold-standard entity ID.
        candidate: the predicted entity ID; a falsy value, "" or "NIL" means
            that no entity was predicted.
        """
        candidate_is_correct = true_entity == candidate

        # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL")
        # Therefore, if candidate_is_correct then we have a true positive and never a true negative.
        self.true_pos += int(candidate_is_correct)
        self.false_neg += int(not candidate_is_correct)
        if candidate and candidate not in {"", "NIL"}:
            # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN.
            self.false_pos += int(not candidate_is_correct)

    def calculate_precision(self):
        """Return TP / (TP + FP) as a float, or 0.0 when there are no true positives."""
        if self.true_pos == 0:
            return 0.0
        # float() keeps this a true division even without `from __future__ import division`
        return float(self.true_pos) / (self.true_pos + self.false_pos)

    def calculate_recall(self):
        """Return TP / (TP + FN) as a float, or 0.0 when there are no true positives."""
        if self.true_pos == 0:
            return 0.0
        # float() keeps this a true division even without `from __future__ import division`
        return float(self.true_pos) / (self.true_pos + self.false_neg)

    def calculate_fscore(self):
        """Return the harmonic mean of precision and recall, or 0.0 when both are 0."""
        p = self.calculate_precision()
        r = self.calculate_recall()
        if p + r == 0:
            return 0.0
        return 2 * p * r / (p + r)
|
|
||||||
|
|
||||||
|
|
||||||
class EvaluationResults(object):
    """Metrics for one model: an overall tally plus one tally per entity label."""

    def __init__(self):
        self.metrics = Metrics()
        # per-label tallies are created lazily on first use of a label
        self.metrics_by_label = defaultdict(Metrics)

    def update_metrics(self, ent_label, true_entity, candidate):
        """Record one prediction in the overall tally and in the tally of `ent_label`."""
        for tally in (self.metrics, self.metrics_by_label[ent_label]):
            tally.update_results(true_entity, candidate)

    def report_metrics(self, model_name):
        """Return a one-line summary string, prefixed with the title-cased `model_name`."""
        overall = self.metrics
        recall = overall.calculate_recall()
        precision = overall.calculate_precision()
        fscore = overall.calculate_fscore()

        fscore_by_label = {
            label: tally.calculate_fscore()
            for label, tally in sorted(self.metrics_by_label.items())
        }

        pieces = [
            "{}: ".format(model_name.title()),
            "F-score = {} | ".format(round(fscore, 3)),
            "Recall = {} | ".format(round(recall, 3)),
            "Precision = {} | ".format(round(precision, 3)),
            "F-score by label = {}".format(fscore_by_label),
        ]
        return "".join(pieces)
|
|
||||||
|
|
||||||
|
|
||||||
class BaselineResults(object):
    """Holds the evaluation results of the three baselines: random, prior and oracle."""

    def __init__(self):
        self.random = EvaluationResults()
        self.prior = EvaluationResults()
        self.oracle = EvaluationResults()

    def report_performance(self, model):
        """Return the summary line of the baseline stored under attribute name `model`."""
        return getattr(self, model).report_metrics(model)

    def update_baselines(
        self,
        true_entity,
        ent_label,
        random_candidate,
        prior_candidate,
        oracle_candidate,
    ):
        """Record one gold entity against the candidate chosen by each baseline."""
        # updated in the order oracle, prior, random
        updates = (
            (self.oracle, oracle_candidate),
            (self.prior, prior_candidate),
            (self.random, random_candidate),
        )
        for results, candidate in updates:
            results.update_metrics(ent_label, true_entity, candidate)
|
|
||||||
|
|
||||||
|
|
||||||
def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None):
    """Evaluate entity linking on `dev_data` and log the results.

    dev_data: iterable of (doc, gold) pairs; `gold.links` maps (start, end)
        entity offsets to a dict of {KB ID: value}.
    kb: knowledge base, used for the baseline measurements.
    el_pipe: entity linking pipe; its `cfg` entries "incl_context" and
        "incl_prior" are overwritten during evaluation.
    baseline: whether to measure and log the random / prior / oracle baselines.
    context: whether to measure and log the "context only" and
        "context and prior" settings of `el_pipe`.
    dev_limit: passed to tqdm as the expected total of the progress bar.
    """
    counts = dict()
    baseline_results = BaselineResults()
    context_results = EvaluationResults()
    combo_results = EvaluationResults()

    progress = tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data')
    for doc, gold in progress:
        if not len(doc) > 0:
            continue

        # collect the gold KB ID per entity offset
        correct_ents = dict()
        for entity, kb_dict in gold.links.items():
            start, end = entity
            for gold_kb, value in kb_dict.items():
                if not value:
                    # only evaluating on positive examples
                    continue
                correct_ents[_offset(start, end)] = gold_kb

        if baseline:
            _add_baseline(baseline_results, counts, doc, correct_ents, kb)

        if context:
            # using only context
            el_pipe.cfg["incl_context"] = True
            el_pipe.cfg["incl_prior"] = False
            _add_eval_result(context_results, doc, correct_ents, el_pipe)

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["incl_context"] = True
            el_pipe.cfg["incl_prior"] = True
            _add_eval_result(combo_results, doc, correct_ents, el_pipe)

    if baseline:
        sorted_counts = {label: count for label, count in sorted(counts.items())}
        logger.info("Counts: {}".format(sorted_counts))
        for baseline_name in ("random", "prior", "oracle"):
            logger.info(baseline_results.report_performance(baseline_name))

    if context:
        logger.info(context_results.report_metrics("context only"))
        logger.info(combo_results.report_metrics("context and prior"))
|
|
||||||
|
|
||||||
|
|
||||||
def _add_eval_result(results, doc, correct_ents, el_pipe):
    """
    Evaluate the ent.kb_id_ annotations against the gold standard.
    Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.

    results: object whose `update_metrics(label, gold, predicted)` is called per evaluated entity.
    doc: the document to annotate; it is passed through `el_pipe` first.
    correct_ents: dict mapping an offset key (see `_offset`) to the gold KB ID.
    el_pipe: callable that takes the doc and returns the doc with linked entities.
    """
    try:
        doc = el_pipe(doc)
        for ent in doc.ents:
            ent_label = ent.label_
            start = ent.start_char
            end = ent.end_char
            offset = _offset(start, end)
            gold_entity = correct_ents.get(offset, None)
            # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
            if gold_entity is not None:
                pred_entity = ent.kb_id_
                results.update_metrics(ent_label, gold_entity, pred_entity)

    except Exception as e:
        # Best-effort evaluation: a failure on one doc is reported and skipped.
        # Log through the module logger (not the root logger) so the record
        # carries this module's name like every other message in this file.
        logger.error("Error assessing accuracy " + str(e))
|
|
||||||
|
|
||||||
|
|
||||||
def _add_baseline(baseline_results, counts, doc, correct_ents, kb):
    """
    Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound.
    Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.

    `counts` is updated in place with the number of evaluated entities per label.
    """
    for ent in doc.ents:
        ent_label = ent.label_
        key = _offset(ent.start_char, ent.end_char)
        gold_entity = correct_ents.get(key, None)

        # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
        if gold_entity is None:
            continue

        candidates = kb.get_candidates(ent.text)
        # an empty string means "no candidate chosen" for that baseline
        oracle_candidate = prior_candidate = random_candidate = ""
        if candidates:
            priors = []
            for cand in candidates:
                priors.append(cand.prior_prob)
                if cand.entity_ == gold_entity:
                    oracle_candidate = cand.entity_

            # first candidate with the highest prior probability
            prior_candidate = candidates[priors.index(max(priors))].entity_
            random_candidate = random.choice(candidates).entity_

        counts[ent_label] = counts.get(ent_label, 0) + 1

        baseline_results.update_baselines(
            gold_entity,
            ent_label,
            random_candidate,
            prior_candidate,
            oracle_candidate,
        )
|
|
||||||
|
|
||||||
|
|
||||||
def _offset(start, end):
|
|
||||||
return "{}_{}".format(start, end)
|
|
|
@ -1,161 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
|
|
||||||
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
|
|
||||||
from bin.wiki_entity_linking import wiki_io as io
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_path,
    entity_descr_path,
    entity_alias_path,
    entity_freq_path,
    prior_prob_path,
    entity_vector_length,
):
    """Create the knowledge base from Wikidata entries and return it.

    Entities are defined first (see `_define_entities`), then aliases are
    added for the entities that were kept (see `_define_aliases`).
    """
    knowledge_base = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    entity_list, filtered_title_to_id = _define_entities(
        nlp,
        knowledge_base,
        entity_def_path,
        entity_descr_path,
        min_entity_freq,
        entity_freq_path,
        entity_vector_length,
    )
    _define_aliases(
        knowledge_base,
        entity_alias_path,
        entity_list,
        filtered_title_to_id,
        max_entities_per_alias,
        min_occ,
        prior_prob_path,
    )
    return knowledge_base
|
|
||||||
|
|
||||||
|
|
||||||
def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length):
    """Filter the entities by frequency, encode their descriptions and store them in `kb`.

    Returns the list of kept entity IDs and the title -> ID mapping restricted
    to the kept entities.
    Raises ValueError when `nlp` has no pretrained word vectors.
    """
    # load the mappings from file
    title_to_id = io.read_title_to_id(entity_def_path)
    id_to_descr = io.read_id_to_descr(entity_descr_path)

    # pretrained vectors are required: their length is the encoder's input width
    has_vectors = "vectors" in nlp.meta and nlp.vocab.vectors.size
    if not has_vectors:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )
    input_dim = nlp.vocab.vectors_length
    logger.info("Loaded pretrained vectors of size %s" % input_dim)

    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
    entity_frequencies = io.read_entity_to_count(entity_freq_path)
    # keep only sufficiently frequent entities in the KB, because there's just too much data (8M entities) otherwise
    filtered = get_filtered_entities(
        title_to_id,
        id_to_descr,
        entity_frequencies,
        min_entity_freq
    )
    filtered_title_to_id, entity_list, description_list, frequency_list = filtered
    kept_message = "Kept {} entities from the set of {}".format(
        len(description_list), len(title_to_id.keys())
    )
    logger.info(kept_message)

    logger.info("Training entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Getting entity embeddings")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )
    return entity_list, filtered_title_to_id
|
|
||||||
|
|
||||||
|
|
||||||
def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
    """Add aliases to `kb` by delegating to `_add_aliases`.

    `entity_alias_path` is accepted but not used by this function.
    """
    logger.info("Adding aliases from Wikipedia and Wikidata")
    alias_options = dict(
        entity_list=entity_list,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_path=prior_prob_path,
    )
    _add_aliases(kb, **alias_options)
|
|
||||||
|
|
||||||
|
|
||||||
def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
                          min_entity_freq: int = 10):
    """Select the entities that have a description and occur often enough.

    title_to_id: dict mapping a title to an entity ID.
    id_to_descr: dict mapping an entity ID to its description.
    entity_frequencies: dict mapping a title to its number of occurrences;
        titles missing from it count as 0 occurrences.
    min_entity_freq: minimum number of occurrences needed to keep an entity.
        The threshold is inclusive: entities with fewer than this many
        occurrences are dropped, entities with exactly this many are kept.

    Returns a 4-tuple (filtered_title_to_id, entity_list, description_list,
    frequency_list); the three lists are aligned by index.
    """
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        # entities without a (non-empty) description are skipped as well
        if desc and freq >= min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    return filtered_title_to_id, entity_list, description_list, frequency_list
|
|
||||||
|
|
||||||
|
|
||||||
def _write_alias_group(kb, alias, counts, entities, total_count, title_to_id):
    """Add one alias with its known candidate entities and prior probabilities to `kb`."""
    selected_entities = []
    prior_probs = []
    for ent_count, ent_string in zip(counts, entities):
        # only entities that survived filtering have an ID in title_to_id
        if ent_string in title_to_id:
            selected_entities.append(title_to_id[ent_string])
            # p(entity | alias), relative to all occurrences read for this alias
            prior_probs.append(ent_count / total_count)

    if selected_entities:
        try:
            kb.add_alias(
                alias=alias,
                entities=selected_entities,
                probabilities=prior_probs,
            )
        except ValueError as e:
            logger.error(e)


def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
    """Read the prior probability file and add every alias to `kb`.

    kb: object whose `add_alias(alias=..., entities=..., probabilities=...)` is called per alias.
    entity_list: not used by this function.
    title_to_id: dict mapping an entity title to its ID; other entities are ignored.
    max_entities_per_alias: maximum number of candidate entities collected per alias.
    min_occ: minimum count of an alias+entity line for the entity to be collected.
    prior_prob_path: path object (with `.open`) of a "|"-separated file with a
        header line followed by `alias|count|entity` lines.
    """
    # adding aliases with prior probabilities
    # we can read this file sequentially, it's sorted by alias, and then by count
    logger.info("Adding WP aliases")
    with prior_prob_path.open("r", encoding="utf8") as prior_file:
        # skip header
        prior_file.readline()
        previous_alias = None
        total_count = 0
        counts = []
        entities = []
        for line in prior_file:
            splits = line.replace("\n", "").split(sep="|")
            new_alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            if new_alias != previous_alias and previous_alias:
                # done reading the previous alias --> output
                _write_alias_group(kb, previous_alias, counts, entities, total_count, title_to_id)
                total_count = 0
                counts = []
                entities = []

            # every line counts towards the total, also those not collected below
            total_count += count

            if len(entities) < max_entities_per_alias and count >= min_occ:
                counts.append(count)
                entities.append(entity)
            previous_alias = new_alias

        # the last alias in the file has no following alias to trigger its
        # output, so write it here
        if previous_alias:
            _write_alias_group(kb, previous_alias, counts, entities, total_count, title_to_id)
|
|
||||||
|
|
||||||
|
|
||||||
def read_kb(nlp, kb_file):
    """Create a KnowledgeBase on `nlp.vocab`, call `load_bulk(kb_file)` on it and return it."""
    knowledge_base = KnowledgeBase(vocab=nlp.vocab)
    knowledge_base.load_bulk(kb_file)
    return knowledge_base
|
|
|
@ -1,152 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from random import shuffle
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from spacy._ml import zero_init, create_default_optimizer
|
|
||||||
from spacy.cli.pretrain import get_cossim_loss
|
|
||||||
|
|
||||||
from thinc.v2v import Model
|
|
||||||
from thinc.api import chain
|
|
||||||
from thinc.neural._classes.affine import Affine
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class EntityEncoder:
    """
    Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
    This entity vector will be stored in the KB, for further downstream use in the entity model.
    """

    DROP = 0
    BATCH_SIZE = 1000

    # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy
    MIN_LOSS = 0.01

    # Reasonable default to stop training when things are not improving
    MAX_NO_IMPROVEMENT = 20

    def __init__(self, nlp, input_dim, desc_width, epochs=5):
        """Store the configuration; the network itself is built by `train`.

        nlp: pipeline used to turn descriptions into docs.
        input_dim: width of the input (document) vectors.
        desc_width: width of the encoded entity vectors.
        epochs: number of passes over the descriptions during training.
        """
        self.nlp = nlp
        self.input_dim = input_dim
        self.desc_width = desc_width
        self.epochs = epochs
        # Set by _build_network(); initialised here so that apply_encoder()
        # can detect an untrained encoder instead of hitting an AttributeError.
        self.encoder = None
        self.model = None
        self.sgd = None

    def apply_encoder(self, description_list):
        """Encode the descriptions and return the encodings as a list of lists.

        Raises ValueError when called before the encoder was trained.
        """
        if self.encoder is None:
            raise ValueError("Can not apply encoder before training it")

        batch_size = 100000

        start = 0
        stop = min(batch_size, len(description_list))
        encodings = []

        while start < len(description_list):
            docs = list(self.nlp.pipe(description_list[start:stop]))
            doc_embeddings = [self._get_doc_embedding(doc) for doc in docs]
            enc = self.encoder(np.asarray(doc_embeddings))
            encodings.extend(enc.tolist())

            start = start + batch_size
            stop = min(stop + batch_size, len(description_list))
            # report what has been encoded so far, not the end of the next batch
            logger.info("Encoded: {} entities".format(len(encodings)))

        return encodings

    def train(self, description_list, to_print=False):
        """Train the encoder on the descriptions; log a summary when `to_print` is set."""
        processed, loss = self._train_model(description_list)
        if to_print:
            logger.info(
                "Trained entity descriptions on {} ".format(processed) +
                "(non-unique) descriptions across {} ".format(self.epochs) +
                "epochs"
            )
            logger.info("Final loss: {}".format(loss))

    def _train_model(self, description_list):
        """Build the network and train it in batches of BATCH_SIZE.

        Returns (number of descriptions processed, loss of the last batch).
        Training stops early once the loss drops to MIN_LOSS or has not
        improved for more than MAX_NO_IMPROVEMENT batches.
        """
        best_loss = 1.0
        iter_since_best = 0
        self._build_network(self.input_dim, self.desc_width)

        processed = 0
        loss = 1
        # copy this list so that shuffling does not affect other functions
        descriptions = description_list.copy()
        to_continue = True

        for i in range(self.epochs):
            shuffle(descriptions)

            batch_nr = 0
            start = 0
            stop = min(self.BATCH_SIZE, len(descriptions))

            # once to_continue is False it stays False, so later epochs are skipped
            while to_continue and start < len(descriptions):
                batch = []
                for descr in descriptions[start:stop]:
                    doc = self.nlp(descr)
                    doc_vector = self._get_doc_embedding(doc)
                    batch.append(doc_vector)

                loss = self._update(batch)
                if batch_nr % 25 == 0:
                    logger.info("loss: {} ".format(loss))
                processed += len(batch)

                # in general, continue training if we haven't reached our ideal min yet
                to_continue = loss > self.MIN_LOSS

                # store the best loss and track how long it's been
                if loss < best_loss:
                    best_loss = loss
                    iter_since_best = 0
                else:
                    iter_since_best += 1

                # stop learning if we haven't seen improvement since the last few iterations
                if iter_since_best > self.MAX_NO_IMPROVEMENT:
                    to_continue = False

                batch_nr += 1
                start = start + self.BATCH_SIZE
                stop = min(stop + self.BATCH_SIZE, len(descriptions))

        return processed, loss

    @staticmethod
    def _get_doc_embedding(doc):
        """Return the mean of the vector-table rows of the doc's words.

        Words without an entry in `key2row` use row 0 of the table.
        """
        # zero-initialised, so words without a vector keep row index 0
        indices = np.zeros((len(doc),), dtype="i")
        for i, word in enumerate(doc):
            if word.orth in doc.vocab.vectors.key2row:
                indices[i] = doc.vocab.vectors.key2row[word.orth]
        word_vectors = doc.vocab.vectors.data[indices]
        doc_vector = np.mean(word_vectors, axis=0)
        return doc_vector

    def _build_network(self, orig_width, hidden_with):
        """Create the encoder, the full encoder-decoder model and the optimizer."""
        with Model.define_operators({">>": chain}):
            # very simple encoder-decoder model
            self.encoder = Affine(hidden_with, orig_width)
            self.model = self.encoder >> zero_init(
                Affine(orig_width, hidden_with, drop_factor=0.0)
            )
        self.sgd = create_default_optimizer(self.model.ops)

    def _update(self, vectors):
        """Run one training step on a batch and return the loss divided by the batch size."""
        predictions, bp_model = self.model.begin_update(
            np.asarray(vectors), drop=self.DROP
        )
        # the model is trained to reproduce its own input
        loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
        bp_model(d_scores, sgd=self.sgd)
        return loss / len(vectors)

    @staticmethod
    def _get_loss(golds, scores):
        """Return (loss, gradients) as computed by `get_cossim_loss(scores, golds)`."""
        loss, gradients = get_cossim_loss(scores, golds)
        return loss, gradients
|
|
|
@ -1,127 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import csv
|
|
||||||
|
|
||||||
# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
|
|
||||||
csv.field_size_limit(min(sys.maxsize, 2147483646))
|
|
||||||
|
|
||||||
""" This class provides reading/writing methods for temp files """
|
|
||||||
|
|
||||||
|
|
||||||
# Entity definition: WP title -> WD ID #
|
|
||||||
def write_title_to_id(entity_def_output, title_to_id):
    """Write the Wikipedia-title -> Wikidata-ID mapping to a "|"-delimited file.

    entity_def_output: path-like object with an `open` method (e.g. `pathlib.Path`).
    title_to_id: dict mapping titles to IDs; IDs are written as `str(qid)`.

    Rows are emitted through `csv.writer` so that a title which starts with
    a double quote or contains the delimiter is quoted and can be read back
    unchanged by `csv.reader`. Ordinary values produce the same bytes as
    plain "title|id" concatenation.
    """
    with entity_def_output.open("w", encoding="utf8") as id_file:
        writer = csv.writer(id_file, delimiter="|", lineterminator="\n")
        writer.writerow(["WP_title", "WD_id"])
        for title, qid in title_to_id.items():
            writer.writerow([title, str(qid)])
|
|
||||||
|
|
||||||
|
|
||||||
def read_title_to_id(entity_def_output):
    """Load the Wikipedia-title -> Wikidata-ID mapping from a "|"-delimited file.

    The first row is treated as a header and discarded; for every other
    row the first column becomes the key and the second column the value.
    """
    with entity_def_output.open("r", encoding="utf8") as id_file:
        rows = csv.reader(id_file, delimiter="|")
        next(rows)  # skip header
        return {row[0]: row[1] for row in rows}
|
|
||||||
|
|
||||||
|
|
||||||
# Entity aliases from WD: WD ID -> WD alias #
|
|
||||||
def write_id_to_alias(entity_alias_path, id_to_alias):
    """Write one "WD_id|alias" row per (ID, alias) pair.

    entity_alias_path: path-like object with an `open` method.
    id_to_alias: dict mapping an ID to an iterable of alias strings.

    Rows go through `csv.writer` so that aliases containing the delimiter
    or starting with a double quote are quoted and survive a round trip
    through `csv.reader`. Ordinary values produce the same bytes as plain
    concatenation.
    """
    with entity_alias_path.open("w", encoding="utf8") as alias_file:
        writer = csv.writer(alias_file, delimiter="|", lineterminator="\n")
        writer.writerow(["WD_id", "alias"])
        for qid, alias_list in id_to_alias.items():
            for alias in alias_list:
                writer.writerow([str(qid), alias])
|
|
||||||
|
|
||||||
|
|
||||||
def read_id_to_alias(entity_alias_path):
    """Load the ID -> list-of-aliases mapping from a "|"-delimited file.

    The header row is skipped. Aliases of the same ID are collected in
    file order.
    """
    aliases_by_id = dict()
    with entity_alias_path.open("r", encoding="utf8") as alias_file:
        rows = csv.reader(alias_file, delimiter="|")
        next(rows)  # skip header
        for row in rows:
            wd_id, name = row[0], row[1]
            aliases_by_id.setdefault(wd_id, []).append(name)
    return aliases_by_id
|
|
||||||
|
|
||||||
|
|
||||||
def read_alias_to_id_generator(entity_alias_path):
    """ Read (aliases, qid) tuples """
    # Lazily yields one (alias, qid) pair per data row; the header row is skipped.
    with entity_alias_path.open("r", encoding="utf8") as alias_file:
        rows = csv.reader(alias_file, delimiter="|")
        next(rows)  # skip header
        for row in rows:
            wd_id, name = row[0], row[1]
            yield name, wd_id
|
|
||||||
|
|
||||||
|
|
||||||
# Entity descriptions from WD: WD ID -> WD description #
|
|
||||||
def write_id_to_descr(entity_descr_output, id_to_descr):
    """Write the ID -> description mapping to a "|"-delimited file.

    entity_descr_output: path-like object with an `open` method.
    id_to_descr: dict mapping an ID to its description string.

    Rows go through `csv.writer`: free-text descriptions can contain the
    delimiter or start with a double quote, and `csv.reader` would then
    split or mangle a row written by plain concatenation. Ordinary values
    produce the same bytes as before.
    """
    with entity_descr_output.open("w", encoding="utf8") as descr_file:
        writer = csv.writer(descr_file, delimiter="|", lineterminator="\n")
        writer.writerow(["WD_id", "description"])
        for qid, descr in id_to_descr.items():
            writer.writerow([str(qid), descr])
|
|
||||||
|
|
||||||
|
|
||||||
def read_id_to_descr(entity_desc_path):
    """Load the ID -> description mapping from a "|"-delimited file.

    The header row is skipped; column 0 is the key and column 1 the value.
    """
    with entity_desc_path.open("r", encoding="utf8") as descr_file:
        rows = csv.reader(descr_file, delimiter="|")
        next(rows)  # skip header
        return {row[0]: row[1] for row in rows}
|
|
||||||
|
|
||||||
|
|
||||||
# Entity counts from WP: WP title -> count #
|
|
||||||
def write_entity_to_count(prior_prob_input, count_output):
    """Aggregate per-entity counts from the prior-probability file and write them out.

    prior_prob_input: path-like object; after a header line every row is
        read as "alias|count|entity".
    count_output: path-like object receiving "entity|count" rows after a
        header line.

    Blank lines in the input are skipped. The output is written with
    `csv.writer` so that entity names starting with a double quote are
    quoted and parse correctly with `csv.reader`.
    """
    # Write entity counts for quick access later
    entity_to_count = dict()

    with prior_prob_input.open("r", encoding="utf8") as prior_file:
        # skip header (tolerates a completely empty file)
        next(prior_file, None)
        for line in prior_file:
            line = line.replace("\n", "")
            if not line:
                continue
            splits = line.split(sep="|")
            # splits[0] is the alias, which is not needed here
            count = int(splits[1])
            entity = splits[2]
            entity_to_count[entity] = entity_to_count.get(entity, 0) + count

    with count_output.open("w", encoding="utf8") as entity_file:
        writer = csv.writer(entity_file, delimiter="|", lineterminator="\n")
        writer.writerow(["entity", "count"])
        for entity, count in entity_to_count.items():
            writer.writerow([entity, str(count)])
|
|
||||||
|
|
||||||
|
|
||||||
def read_entity_to_count(count_input):
    """Load the entity -> count mapping from a "|"-delimited file.

    The header row is skipped; the second column is converted to `int`.
    """
    with count_input.open("r", encoding="utf8") as csvfile:
        rows = csv.reader(csvfile, delimiter="|")
        next(rows)  # skip header
        return {row[0]: int(row[1]) for row in rows}
|
|
|
@ -1,128 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# List of meta pages in Wikidata, should be kept out of the Knowledge base
|
|
||||||
WD_META_ITEMS = [
|
|
||||||
"Q163875",
|
|
||||||
"Q191780",
|
|
||||||
"Q224414",
|
|
||||||
"Q4167836",
|
|
||||||
"Q4167410",
|
|
||||||
"Q4663903",
|
|
||||||
"Q11266439",
|
|
||||||
"Q13406463",
|
|
||||||
"Q15407973",
|
|
||||||
"Q18616576",
|
|
||||||
"Q19887878",
|
|
||||||
"Q22808320",
|
|
||||||
"Q23894233",
|
|
||||||
"Q33120876",
|
|
||||||
"Q42104522",
|
|
||||||
"Q47460393",
|
|
||||||
"Q64875536",
|
|
||||||
"Q66480449",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: add more cases from non-English WP's
|
|
||||||
|
|
||||||
# List of prefixes that refer to Wikipedia "file" pages
|
|
||||||
WP_FILE_NAMESPACE = ["Bestand", "File"]
|
|
||||||
|
|
||||||
# List of prefixes that refer to Wikipedia "category" pages
|
|
||||||
WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
|
|
||||||
|
|
||||||
# List of prefixes that refer to Wikipedia "meta" pages
|
|
||||||
# these will/should be matched ignoring case
|
|
||||||
WP_META_NAMESPACE = (
|
|
||||||
WP_FILE_NAMESPACE
|
|
||||||
+ WP_CATEGORY_NAMESPACE
|
|
||||||
+ [
|
|
||||||
"b",
|
|
||||||
"betawikiversity",
|
|
||||||
"Book",
|
|
||||||
"c",
|
|
||||||
"Commons",
|
|
||||||
"d",
|
|
||||||
"dbdump",
|
|
||||||
"download",
|
|
||||||
"Draft",
|
|
||||||
"Education",
|
|
||||||
"Foundation",
|
|
||||||
"Gadget",
|
|
||||||
"Gadget definition",
|
|
||||||
"Gebruiker",
|
|
||||||
"gerrit",
|
|
||||||
"Help",
|
|
||||||
"Image",
|
|
||||||
"Incubator",
|
|
||||||
"m",
|
|
||||||
"mail",
|
|
||||||
"mailarchive",
|
|
||||||
"media",
|
|
||||||
"MediaWiki",
|
|
||||||
"MediaWiki talk",
|
|
||||||
"Mediawikiwiki",
|
|
||||||
"MediaZilla",
|
|
||||||
"Meta",
|
|
||||||
"Metawikipedia",
|
|
||||||
"Module",
|
|
||||||
"mw",
|
|
||||||
"n",
|
|
||||||
"nost",
|
|
||||||
"oldwikisource",
|
|
||||||
"otrs",
|
|
||||||
"OTRSwiki",
|
|
||||||
"Overleg gebruiker",
|
|
||||||
"outreach",
|
|
||||||
"outreachwiki",
|
|
||||||
"Portal",
|
|
||||||
"phab",
|
|
||||||
"Phabricator",
|
|
||||||
"Project",
|
|
||||||
"q",
|
|
||||||
"quality",
|
|
||||||
"rev",
|
|
||||||
"s",
|
|
||||||
"spcom",
|
|
||||||
"Special",
|
|
||||||
"species",
|
|
||||||
"Strategy",
|
|
||||||
"sulutil",
|
|
||||||
"svn",
|
|
||||||
"Talk",
|
|
||||||
"Template",
|
|
||||||
"Template talk",
|
|
||||||
"Testwiki",
|
|
||||||
"ticket",
|
|
||||||
"TimedText",
|
|
||||||
"Toollabs",
|
|
||||||
"tools",
|
|
||||||
"tswiki",
|
|
||||||
"User",
|
|
||||||
"User talk",
|
|
||||||
"v",
|
|
||||||
"voy",
|
|
||||||
"w",
|
|
||||||
"Wikibooks",
|
|
||||||
"Wikidata",
|
|
||||||
"wikiHow",
|
|
||||||
"Wikinvest",
|
|
||||||
"wikilivres",
|
|
||||||
"Wikimedia",
|
|
||||||
"Wikinews",
|
|
||||||
"Wikipedia",
|
|
||||||
"Wikipedia talk",
|
|
||||||
"Wikiquote",
|
|
||||||
"Wikisource",
|
|
||||||
"Wikispecies",
|
|
||||||
"Wikitech",
|
|
||||||
"Wikiversity",
|
|
||||||
"Wikivoyage",
|
|
||||||
"wikt",
|
|
||||||
"wiktionary",
|
|
||||||
"wmf",
|
|
||||||
"wmania",
|
|
||||||
"WP",
|
|
||||||
]
|
|
||||||
)
|
|
|
@ -1,179 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB)
|
|
||||||
with specific parameters. Intermediate files are written to disk.
|
|
||||||
|
|
||||||
Running the full pipeline on a standard laptop, may take up to 13 hours of processing.
|
|
||||||
Use the -p, -d and -s options to speed up processing using the intermediate files
|
|
||||||
from a previous run.
|
|
||||||
|
|
||||||
For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
|
|
||||||
For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
|
|
||||||
from https://dumps.wikimedia.org/enwiki/latest/
|
|
||||||
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
import plac
|
|
||||||
|
|
||||||
from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd
|
|
||||||
from bin.wiki_entity_linking import wiki_io as io
|
|
||||||
from bin.wiki_entity_linking import kb_creator
|
|
||||||
from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT
|
|
||||||
from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH
|
|
||||||
import spacy
|
|
||||||
from bin.wiki_entity_linking.kb_creator import read_kb
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path),
    wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path),
    output_dir=("Output directory", "positional", None, Path),
    model=("Model name or path, should include pretrained vectors.", "positional", None, str),
    max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int),
    min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int),
    min_pair=("Min. count of entity-alias pairs (default 5)", "option", "c", int),
    entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int),
    loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
    loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
    loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
    descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"),
    limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
    limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
    limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
    lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str),
)
def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_alias=None,
    loc_entity_desc=None,
    descr_from_wp=False,
    limit_prior=None,
    limit_train=None,
    limit_wd=None,
    lang="en",
):
    # Pipeline driver: every step below produces one intermediate file and is
    # skipped (only a "Reading ..." message is logged) when that file already
    # exists, so an interrupted run can be resumed.
    # (Deliberately a comment, not a docstring, so the command-line help text
    # of this script is unaffected.)

    # Explicitly supplied locations win over the defaults inside output_dir.
    entity_defs_path = loc_entity_defs or output_dir / ENTITY_DEFS_PATH
    entity_alias_path = loc_entity_alias or output_dir / ENTITY_ALIAS_PATH
    entity_descr_path = loc_entity_desc or output_dir / ENTITY_DESCR_PATH
    entity_freq_path = output_dir / ENTITY_FREQ_PATH
    prior_prob_path = loc_prior_prob or output_dir / PRIOR_PROB_PATH
    training_entities_path = output_dir / TRAINING_DATA_FILE
    kb_path = output_dir / KB_FILE

    logger.info("Creating KB with Wikipedia and WikiData")

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # STEP 1: Load the NLP object
    logger.info("STEP 1: Loading NLP model {}".format(model))
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if not ("vectors" in nlp.meta and nlp.vocab.vectors.size):
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # STEP 2: create prior probabilities from WP
    if prior_prob_path.exists():
        logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))
    else:
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
        if limit_prior is not None:
            logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
        wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)

    # STEP 3: calculate entity frequencies
    if entity_freq_path.exists():
        logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))
    else:
        logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
        io.write_entity_to_count(prior_prob_path, entity_freq_path)

    # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
    if entity_defs_path.exists() and (descr_from_wp or entity_descr_path.exists()):
        logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
        logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
        if not descr_from_wp:
            logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))
    else:
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
        if limit_wd is not None:
            logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
        title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
            wd_json,
            limit_wd,
            to_print=False,
            lang=lang,
            parse_descr=(not descr_from_wp),
        )
        io.write_title_to_id(entity_defs_path, title_to_id)

        logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
        io.write_id_to_alias(entity_alias_path, id_to_alias)

        if not descr_from_wp:
            logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
            io.write_id_to_descr(entity_descr_path, id_to_descr)

    # STEP 5: Getting gold entities from Wikipedia
    if training_entities_path.exists() and (not descr_from_wp or entity_descr_path.exists()):
        logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
        if descr_from_wp:
            logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))
    else:
        logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
        if limit_train is not None:
            logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
        wp.create_training_and_desc(
            wp_xml,
            entity_defs_path,
            entity_descr_path,
            training_entities_path,
            descr_from_wp,
            limit_train,
        )
        if descr_from_wp:
            logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))

    # STEP 6: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    if kb_path.exists():
        logger.info("STEP 6: KB already exists at {}".format(kb_path))
    else:
        logger.info("STEP 6: Creating the KB at {}".format(kb_path))
        kb = kb_creator.create_kb(
            nlp=nlp,
            max_entities_per_alias=max_per_alias,
            min_entity_freq=min_freq,
            min_occ=min_pair,
            entity_def_path=entity_defs_path,
            entity_descr_path=entity_descr_path,
            entity_alias_path=entity_alias_path,
            entity_freq_path=entity_freq_path,
            prior_prob_path=prior_prob_path,
            entity_vector_length=entity_vector_length,
        )
        kb.dump(kb_path)
        logger.info("kb entities: {}".format(kb.get_size_entities()))
        logger.info("kb aliases: {}".format(kb.get_size_aliases()))
        # the nlp object is stored next to the KB
        nlp.to_disk(output_dir / KB_MODEL_DIR)

    logger.info("Done!")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
    plac.call(main)
|
|
|
@ -1,154 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import bz2
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True):
    """Parse a bz2-compressed Wikidata JSON dump, reading one JSON object per line.

    wikidata_file: path of the dump, opened with `bz2.open`
        (latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/).
    limit: stop after this many lines; a falsy value means no limit.
    to_print: print every parsed field to stdout (debugging aid).
    lang: language code used for the "<lang>wiki" sitelink and for the
        descriptions and aliases that are extracted.
    parse_descr: whether to collect descriptions.

    Returns a `(title_to_id, id_to_descr, id_to_alias)` tuple of dicts:
    sitelink title -> item ID, item ID -> description (only for items that
    have a sitelink), and item ID -> list of aliases.

    Items of type "item" are dropped when a non-deprecated P31/P279 claim
    points to one of the excluded meta, punctuation or letter items.
    Takes about 7-10h to parse 55M lines.
    """
    site_filter = '{}wiki'.format(lang)

    # filter: currently defined as OR: one hit suffices to be removed from further processing
    # Built as a fresh set on every call: extending WD_META_ITEMS in place
    # would make the shared module-level list grow with each invocation.
    exclude_ids = set(WD_META_ITEMS)
    # punctuation
    exclude_ids.update(["Q1383557", "Q10617810"])
    # letters etc
    exclude_ids.update(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"])

    neg_prop_filter = {
        'P31': exclude_ids,  # instance of
        'P279': exclude_ids  # subclass
    }

    title_to_id = dict()
    id_to_descr = dict()
    id_to_alias = dict()

    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_aliases = True
    parse_claims = True

    # Tracked separately from the loop index so the final log message is
    # correct for a full read and well-defined for an empty file.
    lines_read = 0

    with bz2.open(wikidata_file, mode='rb') as file:
        for cnt, line in enumerate(file):
            if limit and cnt >= limit:
                break
            lines_read = cnt + 1
            if cnt % 500000 == 0 and cnt > 0:
                logger.info("processed {} lines of WikiData JSON dump".format(cnt))

            # entries are separated by a trailing comma; the lines holding
            # only the enclosing brackets are a single character long
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) <= 1:
                continue

            obj = json.loads(clean_line)
            entry_type = obj["type"]
            if entry_type != "item":
                continue

            claims = obj["claims"]
            if parse_claims and _has_excluded_claim(claims, neg_prop_filter):
                continue

            unique_id = obj["id"]

            if to_print:
                print("ID:", unique_id)
                print("type:", entry_type)

            # parsing all properties that refer to other entities
            if parse_properties:
                for prop, claim_property in claims.items():
                    cp_dicts = [
                        cp["mainsnak"]["datavalue"].get("value")
                        for cp in claim_property
                        if cp["mainsnak"].get("datavalue")
                    ]
                    cp_values = [
                        cp_dict.get("id")
                        for cp_dict in cp_dicts
                        if isinstance(cp_dict, dict)
                        if cp_dict.get("id") is not None
                    ]
                    if cp_values:
                        if to_print:
                            print("prop:", prop, cp_values)

            found_link = False
            if parse_sitelinks:
                site_value = obj["sitelinks"].get(site_filter, None)
                if site_value:
                    site = site_value["title"]
                    if to_print:
                        print(site_filter, ":", site)
                    title_to_id[site] = unique_id
                    found_link = True

            if parse_labels:
                labels = obj["labels"]
                if labels:
                    lang_label = labels.get(lang, None)
                    if lang_label:
                        if to_print:
                            print(
                                "label (" + lang + "):", lang_label["value"]
                            )

            # descriptions are only kept for items that have a sitelink
            if found_link and parse_descr:
                descriptions = obj["descriptions"]
                if descriptions:
                    lang_descr = descriptions.get(lang, None)
                    if lang_descr:
                        if to_print:
                            print(
                                "description (" + lang + "):",
                                lang_descr["value"],
                            )
                        id_to_descr[unique_id] = lang_descr["value"]

            if parse_aliases:
                aliases = obj["aliases"]
                if aliases:
                    lang_aliases = aliases.get(lang, None)
                    if lang_aliases:
                        for item in lang_aliases:
                            if to_print:
                                print(
                                    "alias (" + lang + "):", item["value"]
                                )
                            id_to_alias.setdefault(unique_id, []).append(item["value"])

            if to_print:
                print()

    # log final number of lines processed
    logger.info("Finished. Processed {} lines of WikiData JSON dump".format(lines_read))
    return title_to_id, id_to_descr, id_to_alias


def _has_excluded_claim(claims, neg_prop_filter):
    """Return True if any non-deprecated claim for a filtered property targets an excluded ID."""
    for prop, value_set in neg_prop_filter.items():
        for cp in claims.get(prop, None) or []:
            cp_id = (
                cp["mainsnak"]
                .get("datavalue", {})
                .get("value", {})
                .get("id")
            )
            if cp["rank"] != "deprecated" and cp_id in value_set:
                return True
    return False
|
|
||||||
|
|
||||||
|
|
|
@ -1,172 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
"""Script that takes a previously created Knowledge Base and trains an entity linking
|
|
||||||
pipeline. The provided KB directory should hold the kb, the original nlp object and
|
|
||||||
its vocab used to create the KB, and a few auxiliary files such as the entity definitions,
|
|
||||||
as created by the script `wikidata_create_kb`.
|
|
||||||
|
|
||||||
For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
|
|
||||||
from https://dumps.wikimedia.org/enwiki/latest/
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import random
|
|
||||||
import logging
|
|
||||||
import spacy
|
|
||||||
from pathlib import Path
|
|
||||||
import plac
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from bin.wiki_entity_linking import wikipedia_processor
|
|
||||||
from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR
|
|
||||||
from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
|
|
||||||
from bin.wiki_entity_linking.kb_creator import read_kb
|
|
||||||
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    dir_kb=("Directory with KB, NLP and related files", "positional", None, Path),
    output_dir=("Output directory", "option", "o", Path),
    loc_training=("Location to training data", "option", "k", Path),
    epochs=("Number of training iterations (default 10)", "option", "e", int),
    dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float),
    lr=("Learning rate (default 0.005)", "option", "n", float),
    l2=("L2 regularization", "option", "r", float),
    train_articles=("# training articles (default 90% of all)", "option", "t", int),
    dev_articles=("# dev test articles (default 10% of all)", "option", "d", int),
    labels_discard=("NER labels to discard (default None)", "option", "l", str),
)
def main(
    dir_kb,
    output_dir=None,
    loc_training=None,
    epochs=10,
    dropout=0.5,
    lr=0.005,
    l2=1e-6,
    train_articles=None,
    dev_articles=None,
    labels_discard=None
):
    # Trains an entity linking pipe on top of the NLP model and KB found in
    # dir_kb, evaluates it on the dev articles after every epoch and writes
    # the resulting pipeline to disk.
    # (Deliberately a comment, not a docstring, so the command-line help text
    # of this script is unaffected.)
    if not output_dir:
        # NOTE(review): output_dir falls back to dir_kb a few lines below, so
        # the trained pipeline IS written (into dir_kb) despite this message -
        # confirm which of the two behaviours is intended.
        logger.warning("No output dir specified so no results will be written, are you sure about this ?")

    logger.info("Creating Entity Linker with Wikipedia and WikiData")

    output_dir = Path(output_dir) if output_dir else dir_kb
    training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
    nlp_dir = dir_kb / KB_MODEL_DIR
    kb_path = dir_kb / KB_FILE
    nlp_output_dir = output_dir / OUTPUT_MODEL_DIR

    # STEP 0: set up IO
    if not output_dir.exists():
        # parents=True: a nested output directory must not abort the run
        output_dir.mkdir(parents=True)

    # STEP 1 : load the NLP object
    logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
    nlp = spacy.load(nlp_dir)
    logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError("The `nlp` object should have a pretrained `ner` component.")

    logger.info("STEP 1b: Loading KB from {}".format(kb_path))
    kb = read_kb(nlp, kb_path)

    # STEP 2: read the training dataset previously created from WP
    logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path))
    train_indices, dev_indices = wikipedia_processor.read_training_indices(training_path)
    logger.info("Training set has {} articles, limit set to roughly {} articles per epoch"
                .format(len(train_indices), train_articles if train_articles else "all"))
    logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation"
                .format(len(dev_indices), dev_articles if dev_articles else "all"))
    if dev_articles:
        dev_indices = dev_indices[0:dev_articles]

    # STEP 3: create and train an entity linking pipe
    logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs))
    if labels_discard:
        labels_discard = [x.strip() for x in labels_discard.split(",")]
        logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))
    else:
        labels_discard = []

    el_pipe = nlp.create_pipe(
        name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
                                      "labels_discard": labels_discard}
    )
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    logger.info("Dev Baseline Accuracies:")
    dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
                                                      dev=True, line_ids=dev_indices,
                                                      kb=kb, labels_discard=labels_discard)

    measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices))

    for itn in range(epochs):
        random.shuffle(train_indices)
        losses = {}
        batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001))
        batchnr = 0
        articles_processed = 0

        # we either process the whole training file, or just a part each epoch
        bar_total = len(train_indices)
        if train_articles:
            bar_total = train_articles

        with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar:
            for batch in batches:
                if train_articles and articles_processed >= train_articles:
                    # the per-epoch article limit is reached: the remaining
                    # batches would all be skipped anyway
                    break
                with nlp.disable_pipes("entity_linker"):
                    # materialised inside the context manager so the docs are
                    # created while the entity linker is disabled
                    train_batch = list(
                        wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
                                                               dev=False, line_ids=batch,
                                                               kb=kb, labels_discard=labels_discard)
                    )
                if not train_batch:
                    # nothing usable in this batch; unpacking zip() of an
                    # empty sequence would raise a ValueError
                    continue
                docs, golds = zip(*train_batch)
                try:
                    with nlp.disable_pipes(*other_pipes):
                        nlp.update(
                            docs=docs,
                            golds=golds,
                            sgd=optimizer,
                            drop=dropout,
                            losses=losses,
                        )
                        batchnr += 1
                        articles_processed += len(docs)
                        pbar.update(len(docs))
                except Exception as e:
                    # best effort: a failing batch is reported and skipped
                    logger.error("Error updating batch:" + str(e))
        if batchnr > 0:
            # use the module logger (not the root `logging` module) like
            # every other message in this script
            logger.info("Epoch {} trained on {} articles, train loss {}"
                        .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)))
            # re-read the dev_data (data is returned as a generator)
            dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
                                                              dev=True, line_ids=dev_indices,
                                                              kb=kb, labels_discard=labels_discard)
            measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices))

    if output_dir:
        # STEP 4: write the NLP pipeline (now including an EL model) to file
        logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
        logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
        nlp.to_disk(nlp_output_dir)

    logger.info("Done!")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
    plac.call(main)
|
|
|
@ -1,565 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import re
|
|
||||||
import bz2
|
|
||||||
import logging
|
|
||||||
import random
|
|
||||||
import json
|
|
||||||
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from bin.wiki_entity_linking import wiki_io as io
|
|
||||||
from bin.wiki_entity_linking.wiki_namespaces import (
|
|
||||||
WP_META_NAMESPACE,
|
|
||||||
WP_FILE_NAMESPACE,
|
|
||||||
WP_CATEGORY_NAMESPACE,
|
|
||||||
)
|
|
||||||
|
|
||||||
"""
|
|
||||||
Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
|
|
||||||
Write these results to file for downstream KB and training data generation.
|
|
||||||
|
|
||||||
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
|
|
||||||
"""
|
|
||||||
|
|
||||||
ENTITY_FILE = "gold_entities.csv"
|
|
||||||
|
|
||||||
map_alias_to_link = dict()
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Patterns for pulling fields out of single lines of the Wikipedia XML dump.
title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
# innermost {...} template (no nested "{" inside), matched non-greedily
info_regex = re.compile(r"{[^{]*?}")

# NOTE(review): these three are applied by _get_clean_wp_text *before* it decodes
# XML entities, so the markup is presumably still escaped ("&lt;ref&gt;") at that
# point - confirm against a real dump. Both the escaped and literal forms match.
html_regex = re.compile(r"(?:<|&lt;)!--[^-]*--(?:>|&gt;)")
ref_regex = re.compile(r"(?:<|&lt;)ref.*?(?:>|&gt;)")  # non-greedy
ref_2_regex = re.compile(r"(?:<|&lt;)/ref.*?(?:>|&gt;)")  # non-greedy

# find the links
link_regex = re.compile(r"\[\[[^\[\]]*\]\]")

# match on interwiki links, e.g. `en:` or `:fr:`,
# or on a meta Namespace: optionally preceded by a :
ns_pattern = "|".join(
    [r":?[a-z][a-z]:"] + [":?" + ns + ":" for ns in WP_META_NAMESPACE]
)
ns_regex = re.compile(ns_pattern, re.IGNORECASE)

# [[File:...]]-style statements, one alternative per file namespace name
files = "|".join(r"\[\[" + f + r":[^[\]]+]]" for f in WP_FILE_NAMESPACE)
file_regex = re.compile(files)

# [[Category:...]]-style statements, one alternative per category namespace name
cats = "|".join(r"\[\[" + c + r":[^\[]*]]" for c in WP_CATEGORY_NAMESPACE)
category_regex = re.compile(cats)
|
|
||||||
|
|
||||||
|
|
||||||
def read_prior_probs(wikipedia_input, prior_prob_output, limit=None):
    """
    Scan the bz2-compressed Wikipedia XML dump line by line, count how often each
    alias links to each entity, and write the counts to `prior_prob_output`.

    Links are collected per line, regardless of article boundaries, but lines that
    belong to dev articles (see `is_dev`) are skipped so the baseline is not
    artificially strong. Counts accumulate in the module-level `map_alias_to_link`.

    wikipedia_input: path of the bz2 dump.
    prior_prob_output: path-like object with an `open` method; receives
        `alias|count|entity` rows under a header row.
    limit: optional maximum number of dump lines to read.
    """
    cnt = 0
    in_page_header = False
    current_article_id = None
    with bz2.open(wikipedia_input, mode="rb") as dump:
        for raw_line in dump:
            if limit and cnt >= limit:
                break
            if cnt > 0 and cnt % 25000000 == 0:
                logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
            clean_line = raw_line.strip().decode("utf-8")

            # only an <id> seen between <page> and the first <revision> or
            # <contributor> is the article's own ID
            if "<revision>" in clean_line or "<contributor>" in clean_line:
                in_page_header = False
            if "<page>" in clean_line:
                in_page_header = True

            if in_page_header:
                id_match = id_regex.search(clean_line)
                if id_match:
                    current_article_id = id_match[0]

            # prior probabilities come from training (non-dev) articles only
            if not is_dev(current_article_id):
                for alias, entity, norm in zip(*get_wp_links(clean_line)):
                    _store_alias(
                        alias, entity, normalize_alias=norm, normalize_entity=True
                    )

            cnt += 1
        logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
    logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))

    # one row per (alias, entity) pair: aliases alphabetically, entities by
    # descending count
    with prior_prob_output.open("w", encoding="utf8") as outputfile:
        outputfile.write("alias|count|entity\n")
        for alias in sorted(map_alias_to_link):
            ranked = sorted(
                map_alias_to_link[alias].items(), key=lambda pair: pair[1], reverse=True
            )
            for entity, count in ranked:
                outputfile.write("{}|{}|{}\n".format(alias, count, entity))
|
|
||||||
|
|
||||||
|
|
||||||
def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
    """
    Count one occurrence of `alias` linking to `entity` in `map_alias_to_link`.

    Both strings are stripped first. With `normalize_entity`, anything from the
    first "#" on (a reference to a paragraph, not part of the title) is dropped
    and the first letter is capitalized. With `normalize_alias`, the "#" suffix
    is dropped from the alias too. Nothing is stored if either ends up empty.
    """
    mention = alias.strip()
    target = entity.strip()

    if normalize_entity:
        # wikipedia titles are always capitalized
        target = _capitalize_first(target.split("#")[0])
    if normalize_alias:
        mention = mention.split("#")[0]

    if not mention or not target:
        return

    counts = map_alias_to_link.setdefault(mention, {})
    counts[target] = counts.get(target, 0) + 1
|
|
||||||
|
|
||||||
|
|
||||||
def get_wp_links(text):
    """
    Extract the [[...]] links from a piece of wiki markup.

    Returns three parallel lists: the aliases (surface forms), the entities
    (link targets), and a flag per link telling whether the alias should be
    normalized. The flag is True only for plain [[link]] patterns, where alias
    and entity are the same string. Links pointing to a "meta" page
    (`ns_regex`) are ignored.
    """
    aliases, entities, normalizations = [], [], []

    for raw_link in link_regex.findall(text):
        # drop the surrounding brackets
        link = raw_link[2:][:-2].replace("_", " ").strip()

        if ns_regex.match(link):
            continue

        if "|" not in link:
            # simple [[link]]: the alias is the same as the mention
            alias = entity = link
            normalize = True
        else:
            # in wiki format, the link is written as [[entity|alias]]
            parts = link.split("|")
            entity = parts[0].strip()
            alias = parts[1].strip()
            # specific wiki format [[alias (specification)|]]
            if not alias and "(" in entity:
                alias = entity.split("(")[0]
            normalize = False

        aliases.append(alias)
        entities.append(entity)
        normalizations.append(normalize)

    return aliases, entities, normalizations
|
|
||||||
|
|
||||||
|
|
||||||
def _capitalize_first(text):
|
|
||||||
if not text:
|
|
||||||
return None
|
|
||||||
result = text[0].capitalize()
|
|
||||||
if len(result) > 0:
|
|
||||||
result += text[1:]
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def create_training_and_desc(
    wp_input, def_input, desc_output, training_output, parse_desc, limit=None
):
    """
    Load the title-to-ID mapping from `def_input` and hand it, together with the
    remaining arguments, to `_process_wikipedia_texts`.
    """
    title_to_id = io.read_title_to_id(def_input)
    _process_wikipedia_texts(
        wikipedia_input=wp_input,
        wp_to_id=title_to_id,
        output=desc_output,
        training_output=training_output,
        parse_descriptions=parse_desc,
        limit=limit,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def _process_wikipedia_texts(
    wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
):
    """
    Read the XML wikipedia data to parse out training data:
    raw text data + positive instances

    wikipedia_input: path of the bz2-compressed XML dump.
    wp_to_id: mapping from article title to entity ID.
    output: path-like object, opened in append mode, that receives the
        `ID|description` rows (only written when `parse_descriptions` is set).
    training_output: path-like object, opened in write mode, that receives one
        JSON line per processed article.
    parse_descriptions: whether to also write article descriptions.
    limit: optional maximum number of articles to write.
    """

    read_ids = set()

    with output.open("a", encoding="utf8") as descr_file, training_output.open(
        "w", encoding="utf8"
    ) as entity_file:
        if parse_descriptions:
            _write_training_description(descr_file, "WD_id", "description")
        with bz2.open(wikipedia_input, mode="rb") as file:
            article_count = 0
            article_text = ""
            article_title = None
            article_id = None
            reading_text = False
            reading_revision = False

            for line in file:
                clean_line = line.strip().decode("utf-8")

                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False

                # Start reading new page
                if clean_line == "<page>":
                    article_text = ""
                    article_title = None
                    article_id = None
                # finished reading this page
                elif clean_line == "</page>":
                    if article_id:
                        clean_text, entities = _process_wp_text(
                            article_title, article_text, wp_to_id
                        )
                        if clean_text is not None and entities is not None:
                            _write_training_entities(
                                entity_file, article_id, clean_text, entities
                            )

                            if article_title in wp_to_id and parse_descriptions:
                                # first 1000 characters, minus the last
                                # (possibly cut-off) word
                                description = " ".join(
                                    clean_text[:1000].split(" ")[:-1]
                                )
                                _write_training_description(
                                    descr_file, wp_to_id[article_title], description
                                )
                            article_count += 1
                            if article_count % 10000 == 0 and article_count > 0:
                                logger.info(
                                    "Processed {} articles".format(article_count)
                                )
                            if limit and article_count >= limit:
                                break
                    article_text = ""
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False

                # start reading text within a page
                if "<text" in clean_line:
                    reading_text = True

                if reading_text:
                    article_text += " " + clean_line

                # stop reading text within a page (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False

                # the ID and title of this article are read outside the revision
                # portion of the document only
                if not reading_revision:
                    ids = id_regex.search(clean_line)
                    if ids:
                        article_id = ids[0]
                        if article_id in read_ids:
                            # This should never happen ...
                            # The values are logging arguments, so the message
                            # needs a placeholder for each of them.
                            logger.info(
                                "Found duplicate article ID %s: %s",
                                article_id,
                                clean_line,
                            )
                        read_ids.add(article_id)

                    titles = title_regex.search(clean_line)
                    if titles:
                        article_title = titles[0].strip()
    logger.info("Finished. Processed {} articles".format(article_count))
|
|
||||||
|
|
||||||
|
|
||||||
def _process_wp_text(article_title, article_text, wp_to_id):
    """
    Turn the raw XML text of one article into clean text plus entity annotations.

    Returns `(clean_text, entities)`, or `(None, None)` for meta pages, pages
    without a <text> element and redirect pages.
    """
    skipped = (None, None)

    # ignore meta Wikipedia pages
    if ns_regex.match(article_title):
        return skipped

    # remove the text tags
    body_match = text_regex.search(article_text)
    if body_match is None:
        return skipped
    body = body_match.group(0)

    # stop processing if this is a redirect page
    if body.startswith("#REDIRECT"):
        return skipped

    # get the raw text without markup etc, keeping only interwiki links
    return _remove_links(_get_clean_wp_text(body), wp_to_id)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_clean_wp_text(article_text):
    """
    Strip wiki markup from an article's text, keeping the [[...]] links.

    Removes bold/italic quotes, {...} templates, HTML comments, Category and
    File statements, heading "=" runs, <ref> and <blockquote> tags, then decodes
    the XML entities and collapses runs of spaces. Returns the stripped result.
    """
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace("'''", "")
    clean_text = clean_text.replace("''", "")

    # remove nested {{info}} statements by removing the inner/smallest ones first
    # and iterating until a pass no longer shortens the text
    previous_length = None
    while previous_length != len(clean_text):
        previous_length = len(clean_text)
        # non-greedy match excluding a nested {
        clean_text = info_regex.sub("", clean_text)

    # remove HTML comments
    clean_text = html_regex.sub("", clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub("", clean_text)
    clean_text = file_regex.sub("", clean_text)

    # remove multiple =
    while "==" in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub("", clean_text)
    clean_text = ref_2_regex.sub("", clean_text)

    # remove additional wikiformatting; entities are still encoded at this
    # point, so accept both the escaped and the literal form of the tag
    clean_text = re.sub(r"(?:<|&lt;)/?blockquote(?:>|&gt;)", "", clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace("&lt;", "<")
    clean_text = clean_text.replace("&gt;", ">")
    clean_text = clean_text.replace("&quot;", '"')
    # "&amp;" is decoded before "&nbsp;" so that a doubly-escaped "&amp;nbsp;"
    # also ends up as a plain space
    clean_text = clean_text.replace("&amp;", "&")
    clean_text = clean_text.replace("&nbsp;", " ")

    # remove multiple spaces
    while "  " in clean_text:
        clean_text = clean_text.replace("  ", " ")

    return clean_text.strip()
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_links(clean_text, wp_to_id):
|
|
||||||
# read the text char by char to get the right offsets for the interwiki links
|
|
||||||
entities = []
|
|
||||||
final_text = ""
|
|
||||||
open_read = 0
|
|
||||||
reading_text = True
|
|
||||||
reading_entity = False
|
|
||||||
reading_mention = False
|
|
||||||
reading_special_case = False
|
|
||||||
entity_buffer = ""
|
|
||||||
mention_buffer = ""
|
|
||||||
for index, letter in enumerate(clean_text):
|
|
||||||
if letter == "[":
|
|
||||||
open_read += 1
|
|
||||||
elif letter == "]":
|
|
||||||
open_read -= 1
|
|
||||||
elif letter == "|":
|
|
||||||
if reading_text:
|
|
||||||
final_text += letter
|
|
||||||
# switch from reading entity to mention in the [[entity|mention]] pattern
|
|
||||||
elif reading_entity:
|
|
||||||
reading_text = False
|
|
||||||
reading_entity = False
|
|
||||||
reading_mention = True
|
|
||||||
else:
|
|
||||||
reading_special_case = True
|
|
||||||
else:
|
|
||||||
if reading_entity:
|
|
||||||
entity_buffer += letter
|
|
||||||
elif reading_mention:
|
|
||||||
mention_buffer += letter
|
|
||||||
elif reading_text:
|
|
||||||
final_text += letter
|
|
||||||
else:
|
|
||||||
raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
|
|
||||||
|
|
||||||
if open_read > 2:
|
|
||||||
reading_special_case = True
|
|
||||||
|
|
||||||
if open_read == 2 and reading_text:
|
|
||||||
reading_text = False
|
|
||||||
reading_entity = True
|
|
||||||
reading_mention = False
|
|
||||||
|
|
||||||
# we just finished reading an entity
|
|
||||||
if open_read == 0 and not reading_text:
|
|
||||||
if "#" in entity_buffer or entity_buffer.startswith(":"):
|
|
||||||
reading_special_case = True
|
|
||||||
# Ignore cases with nested structures like File: handles etc
|
|
||||||
if not reading_special_case:
|
|
||||||
if not mention_buffer:
|
|
||||||
mention_buffer = entity_buffer
|
|
||||||
start = len(final_text)
|
|
||||||
end = start + len(mention_buffer)
|
|
||||||
qid = wp_to_id.get(entity_buffer, None)
|
|
||||||
if qid:
|
|
||||||
entities.append((mention_buffer, qid, start, end))
|
|
||||||
final_text += mention_buffer
|
|
||||||
|
|
||||||
entity_buffer = ""
|
|
||||||
mention_buffer = ""
|
|
||||||
|
|
||||||
reading_text = True
|
|
||||||
reading_entity = False
|
|
||||||
reading_mention = False
|
|
||||||
reading_special_case = False
|
|
||||||
return final_text, entities
|
|
||||||
|
|
||||||
|
|
||||||
def _write_training_description(outputfile, qid, description):
|
|
||||||
if description is not None:
|
|
||||||
line = str(qid) + "|" + description + "\n"
|
|
||||||
outputfile.write(line)
|
|
||||||
|
|
||||||
|
|
||||||
def _write_training_entities(outputfile, article_id, clean_text, entities):
|
|
||||||
entities_data = [
|
|
||||||
{"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
|
|
||||||
for ent in entities
|
|
||||||
]
|
|
||||||
line = (
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"article_id": article_id,
|
|
||||||
"clean_text": clean_text,
|
|
||||||
"entities": entities_data,
|
|
||||||
},
|
|
||||||
ensure_ascii=False,
|
|
||||||
)
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
outputfile.write(line)
|
|
||||||
|
|
||||||
|
|
||||||
def read_training_indices(entity_file_path):
    """ Split the line numbers of the training file (one JSON example per line)
    into two lists: one for the training examples and one for the dev examples.
    Examples whose text fails `is_valid_article` end up in neither list."""
    train_indices = []
    dev_indices = []

    with entity_file_path.open("r", encoding="utf8") as file:
        for line_nr, line in enumerate(file):
            example = json.loads(line)
            article_id = example["article_id"]
            clean_text = example["clean_text"]

            if not is_valid_article(clean_text):
                continue

            bucket = dev_indices if is_dev(article_id) else train_indices
            bucket.append(line_nr)

    return train_indices, dev_indices
|
|
||||||
|
|
||||||
|
|
||||||
def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None):
    """ Yield (doc, gold) pairs for the requested lines of the training file,
    keeping only the entity annotations that coincide with entities found by the
    nlp object.

    With dev=False and a kb given, negative examples are added from the kb's
    candidate generator. With dev=True, only the positive examples are included.
    Examples without any resulting link are not yielded.

    nlp: pipeline used to parse the texts (via `nlp.pipe`).
    entity_file_path: path-like object of the file with one JSON example per line.
    dev: whether to read dev (True) or training (False) articles.
    line_ids: iterable of the line numbers to read.
    kb: knowledge base used for candidate generation.
    labels_discard: optional entity labels to ignore.
    """
    if not labels_discard:
        labels_discard = []

    # membership is tested for every line of the file, so build a set once
    # instead of scanning the caller's sequence each time
    wanted_lines = set(line_ids)

    texts = []
    entities_list = []

    with entity_file_path.open("r", encoding="utf8") as file:
        for i, line in enumerate(file):
            if i not in wanted_lines:
                continue
            example = json.loads(line)
            article_id = example["article_id"]
            clean_text = example["clean_text"]
            entities = example["entities"]

            if dev != is_dev(article_id) or not is_valid_article(clean_text):
                continue

            texts.append(clean_text)
            entities_list.append(entities)

    docs = nlp.pipe(texts, batch_size=50)

    for doc, entities in zip(docs, entities_list):
        gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
        if gold and len(gold.links) > 0:
            yield doc, gold
|
|
||||||
|
|
||||||
|
|
||||||
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
    """
    Build a GoldParse whose `links` map (start, end) character offsets to a dict
    of entity ID -> value.

    An annotation is kept only if `doc` has an entity at exactly the same offsets
    (with a label not in `labels_discard`) and its sentence passes
    `is_valid_sentence`. When not in dev mode, the gold ID must also be among the
    kb's candidates for the alias; the other candidates are added with 0.0.
    """
    ents_by_offsets = {}
    for ent in doc.ents:
        if ent.label_ not in labels_discard:
            ents_by_offsets[(ent.start_char, ent.end_char)] = ent

    gold_entities = {}
    for annotation in entities:
        entity_id = annotation["entity"]
        alias = annotation["alias"]
        offsets = (annotation["start"], annotation["end"])

        candidate_ids = []
        if kb and not dev:
            candidate_ids = [cand.entity_ for cand in kb.get_candidates(alias)]

        tagged_ent = ents_by_offsets.get(offsets, None)
        if not tagged_ent:
            continue

        # TODO: check that alias == doc.text[start:end]
        if not (dev or entity_id in candidate_ids):
            continue
        if not is_valid_sentence(tagged_ent.sent.text):
            continue

        value_by_id = {entity_id: 1.0}
        if not dev:
            random.shuffle(candidate_ids)
            for kb_id in candidate_ids:
                if kb_id != entity_id:
                    value_by_id[kb_id] = 0.0
        gold_entities[offsets] = value_by_id

    return GoldParse(doc, links=gold_entities)
|
|
||||||
|
|
||||||
|
|
||||||
def is_dev(article_id):
    """Return True if the article belongs to the dev set: its ID ends in "3".

    An empty or missing ID counts as not dev.
    """
    return bool(article_id) and article_id.endswith("3")
|
|
||||||
|
|
||||||
|
|
||||||
def is_valid_article(doc_text):
    """Return True if the text length is strictly between 10 and 30000 characters."""
    # custom length cut-off
    length = len(doc_text)
    return length > 10 and length < 30000
|
|
||||||
|
|
||||||
|
|
||||||
def is_valid_sentence(sent_text):
    """
    Return True for sentences worth training on: strictly between 10 and 3000
    characters long and, once stripped, not starting with "*" or "#".
    """
    # custom length cut-off
    if len(sent_text) <= 10 or len(sent_text) >= 3000:
        return False

    # remove 'enumeration' sentences (occurs often on Wikipedia)
    return not sent_text.strip().startswith(("*", "#"))
|
|
|
@ -88,8 +88,8 @@ def read_text(bz2_loc, n=10000):
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def get_matches(tokenizer, phrases, texts, max_length=6):
|
def get_matches(tokenizer, phrases, texts):
|
||||||
matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
|
matcher = PhraseMatcher(tokenizer.vocab)
|
||||||
matcher.add("Phrase", None, *phrases)
|
matcher.add("Phrase", None, *phrases)
|
||||||
for text in texts:
|
for text in texts:
|
||||||
doc = tokenizer(text)
|
doc = tokenizer(text)
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
|
||||||
"""Example of defining and (pre)training spaCy's knowledge base,
|
"""Example of defining a knowledge base in spaCy,
|
||||||
which is needed to implement entity linking functionality.
|
which is needed to implement entity linking functionality.
|
||||||
|
|
||||||
For more details, see the documentation:
|
For more details, see the documentation:
|
||||||
* Knowledge base: https://spacy.io/api/kb
|
* Knowledge base: https://spacy.io/api/kb
|
||||||
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
||||||
|
|
||||||
Compatible with: spaCy v2.2.3
|
Compatible with: spaCy v2.2.4
|
||||||
Last tested with: v2.2.3
|
Last tested with: v2.2.4
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
@ -20,24 +20,18 @@ from spacy.vocab import Vocab
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
|
|
||||||
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
|
|
||||||
|
|
||||||
|
|
||||||
# Q2146908 (Russ Cochran): American golfer
|
# Q2146908 (Russ Cochran): American golfer
|
||||||
# Q7381115 (Russ Cochran): publisher
|
# Q7381115 (Russ Cochran): publisher
|
||||||
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
|
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
|
||||||
|
|
||||||
INPUT_DIM = 300 # dimension of pretrained input vectors
|
|
||||||
DESC_WIDTH = 64 # dimension of output entity vectors
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("Model name, should have pretrained word embeddings", "positional", None, str),
|
model=("Model name, should have pretrained word embeddings", "positional", None, str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int),
|
|
||||||
)
|
)
|
||||||
def main(model=None, output_dir=None, n_iter=50):
|
def main(model=None, output_dir=None):
|
||||||
"""Load the model, create the KB and pretrain the entity encodings.
|
"""Load the model and create the KB with pre-defined entity encodings.
|
||||||
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
||||||
The updated vocab will also be written to a directory in the output_dir."""
|
The updated vocab will also be written to a directory in the output_dir."""
|
||||||
|
|
||||||
|
@ -51,33 +45,23 @@ def main(model=None, output_dir=None, n_iter=50):
|
||||||
" cf. https://spacy.io/usage/models#languages."
|
" cf. https://spacy.io/usage/models#languages."
|
||||||
)
|
)
|
||||||
|
|
||||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
# You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
|
||||||
|
# For simplicity, we'll just use the original vector dimension here instead.
|
||||||
|
vectors_dim = nlp.vocab.vectors.shape[1]
|
||||||
|
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)
|
||||||
|
|
||||||
# set up the data
|
# set up the data
|
||||||
entity_ids = []
|
entity_ids = []
|
||||||
descriptions = []
|
descr_embeddings = []
|
||||||
freqs = []
|
freqs = []
|
||||||
for key, value in ENTITIES.items():
|
for key, value in ENTITIES.items():
|
||||||
desc, freq = value
|
desc, freq = value
|
||||||
entity_ids.append(key)
|
entity_ids.append(key)
|
||||||
descriptions.append(desc)
|
descr_embeddings.append(nlp(desc).vector)
|
||||||
freqs.append(freq)
|
freqs.append(freq)
|
||||||
|
|
||||||
# training entity description encodings
|
|
||||||
# this part can easily be replaced with a custom entity encoder
|
|
||||||
encoder = EntityEncoder(
|
|
||||||
nlp=nlp,
|
|
||||||
input_dim=INPUT_DIM,
|
|
||||||
desc_width=DESC_WIDTH,
|
|
||||||
epochs=n_iter,
|
|
||||||
)
|
|
||||||
encoder.train(description_list=descriptions, to_print=True)
|
|
||||||
|
|
||||||
# get the pretrained entity vectors
|
|
||||||
embeddings = encoder.apply_encoder(descriptions)
|
|
||||||
|
|
||||||
# set the entities, can also be done by calling `kb.add_entity` for each entity
|
# set the entities, can also be done by calling `kb.add_entity` for each entity
|
||||||
kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)
|
kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)
|
||||||
|
|
||||||
# adding aliases, the entities need to be defined in the KB beforehand
|
# adding aliases, the entities need to be defined in the KB beforehand
|
||||||
kb.add_alias(
|
kb.add_alias(
|
||||||
|
@ -113,8 +97,8 @@ def main(model=None, output_dir=None, n_iter=50):
|
||||||
vocab2 = Vocab().from_disk(vocab_path)
|
vocab2 = Vocab().from_disk(vocab_path)
|
||||||
kb2 = KnowledgeBase(vocab=vocab2)
|
kb2 = KnowledgeBase(vocab=vocab2)
|
||||||
kb2.load_bulk(kb_path)
|
kb2.load_bulk(kb_path)
|
||||||
_print_kb(kb2)
|
|
||||||
print()
|
print()
|
||||||
|
_print_kb(kb2)
|
||||||
|
|
||||||
|
|
||||||
def _print_kb(kb):
|
def _print_kb(kb):
|
||||||
|
@ -126,6 +110,5 @@ if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
||||||
# Expected output:
|
# Expected output:
|
||||||
|
|
||||||
# 2 kb entities: ['Q2146908', 'Q7381115']
|
# 2 kb entities: ['Q2146908', 'Q7381115']
|
||||||
# 1 kb aliases: ['Russ Cochran']
|
# 1 kb aliases: ['Russ Cochran']
|
|
@ -1,15 +1,15 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
|
||||||
"""Example of training spaCy's entity linker, starting off with an
|
"""Example of training spaCy's entity linker, starting off with a predefined
|
||||||
existing model and a pre-defined knowledge base.
|
knowledge base and corresponding vocab, and a blank English model.
|
||||||
|
|
||||||
For more details, see the documentation:
|
For more details, see the documentation:
|
||||||
* Training: https://spacy.io/usage/training
|
* Training: https://spacy.io/usage/training
|
||||||
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
||||||
|
|
||||||
Compatible with: spaCy v2.2.3
|
Compatible with: spaCy v2.2.4
|
||||||
Last tested with: v2.2.3
|
Last tested with: v2.2.4
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
@ -17,13 +17,11 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from spacy.symbols import PERSON
|
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
from spacy.pipeline import EntityRuler
|
from spacy.pipeline import EntityRuler
|
||||||
from spacy.tokens import Span
|
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
|
|
62
netlify.toml
62
netlify.toml
|
@ -7,42 +7,42 @@ redirects = [
|
||||||
{from = "https://alpha.spacy.io/*", to = "https://spacy.io", force = true},
|
{from = "https://alpha.spacy.io/*", to = "https://spacy.io", force = true},
|
||||||
{from = "http://alpha.spacy.io/*", to = "https://spacy.io", force = true},
|
{from = "http://alpha.spacy.io/*", to = "https://spacy.io", force = true},
|
||||||
# Old demos
|
# Old demos
|
||||||
{from = "/demos/*", to = "https://explosion.ai/demos/:splat"},
|
{from = "/demos/*", to = "https://explosion.ai/demos/:splat", force = true},
|
||||||
# Old blog
|
# Old blog
|
||||||
{from = "/blog/*", to = "https://explosion.ai/blog/:splat"},
|
{from = "/blog/*", to = "https://explosion.ai/blog/:splat", force = true},
|
||||||
{from = "/feed", to = "https://explosion.ai/feed"},
|
{from = "/feed", to = "https://explosion.ai/feed", force = true},
|
||||||
{from = "/feed.xml", to = "https://explosion.ai/feed"},
|
{from = "/feed.xml", to = "https://explosion.ai/feed", force = true},
|
||||||
# Old documentation pages (1.x)
|
# Old documentation pages (1.x)
|
||||||
{from = "/docs/usage/processing-text", to = "/usage/linguistic-features"},
|
{from = "/docs/usage/processing-text", to = "/usage/linguistic-features", force = true},
|
||||||
{from = "/docs/usage/deep-learning", to = "/usage/training"},
|
{from = "/docs/usage/deep-learning", to = "/usage/training", force = true},
|
||||||
{from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging"},
|
{from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging", force = true},
|
||||||
{from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse"},
|
{from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse", force = true},
|
||||||
{from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities"},
|
{from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities", force = true},
|
||||||
{from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity"},
|
{from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity", force = true},
|
||||||
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization"},
|
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
|
||||||
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines"},
|
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||||
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines"},
|
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||||
{from = "/docs/usage/training-ner", to = "/usage/training#ner"},
|
{from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
|
||||||
{from = "/docs/usage/tutorials", to = "/usage/examples"},
|
{from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
|
||||||
{from = "/docs/usage/data-model", to = "/api"},
|
{from = "/docs/usage/data-model", to = "/api", force = true},
|
||||||
{from = "/docs/usage/cli", to = "/api/cli"},
|
{from = "/docs/usage/cli", to = "/api/cli", force = true},
|
||||||
{from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
|
{from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true},
|
||||||
{from = "/docs/api/language-models", to = "/usage/models#languages"},
|
{from = "/docs/api/language-models", to = "/usage/models#languages", force = true},
|
||||||
{from = "/docs/api/spacy", to = "/docs/api/top-level"},
|
{from = "/docs/api/spacy", to = "/docs/api/top-level", force = true},
|
||||||
{from = "/docs/api/displacy", to = "/api/top-level#displacy"},
|
{from = "/docs/api/displacy", to = "/api/top-level#displacy", force = true},
|
||||||
{from = "/docs/api/util", to = "/api/top-level#util"},
|
{from = "/docs/api/util", to = "/api/top-level#util", force = true},
|
||||||
{from = "/docs/api/features", to = "/models/#architecture"},
|
{from = "/docs/api/features", to = "/models/#architecture", force = true},
|
||||||
{from = "/docs/api/philosophy", to = "/usage/spacy-101"},
|
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
|
||||||
{from = "/docs/usage/showcase", to = "/universe"},
|
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
||||||
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom"},
|
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
|
||||||
{from = "/tutorials", to = "/usage/examples"},
|
{from = "/tutorials", to = "/usage/examples", force = true},
|
||||||
# Rewrite all other docs pages to /
|
# Rewrite all other docs pages to /
|
||||||
{from = "/docs/*", to = "/:splat"},
|
{from = "/docs/*", to = "/:splat"},
|
||||||
# Updated documentation pages
|
# Updated documentation pages
|
||||||
{from = "/usage/resources", to = "/universe"},
|
{from = "/usage/resources", to = "/universe", force = true},
|
||||||
{from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
|
{from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true},
|
||||||
{from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"},
|
{from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching", force = true},
|
||||||
{from = "/models/comparison", to = "/models"},
|
{from = "/models/comparison", to = "/models", force = true},
|
||||||
{from = "/api/#section-cython", to = "/api/cython", force = true},
|
{from = "/api/#section-cython", to = "/api/cython", force = true},
|
||||||
{from = "/api/#cython", to = "/api/cython", force = true},
|
{from = "/api/#cython", to = "/api/cython", force = true},
|
||||||
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
|
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
|
||||||
|
|
18
setup.cfg
18
setup.cfg
|
@ -30,7 +30,7 @@ zip_safe = false
|
||||||
include_package_data = true
|
include_package_data = true
|
||||||
scripts =
|
scripts =
|
||||||
bin/spacy
|
bin/spacy
|
||||||
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
|
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
|
||||||
setup_requires =
|
setup_requires =
|
||||||
wheel
|
wheel
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
|
@ -61,17 +61,21 @@ install_requires =
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=0.0.5,<0.2.0
|
spacy_lookups_data>=0.0.5,<0.2.0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4
|
cupy>=5.0.0b4,<9.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
cupy-cuda80>=5.0.0b4
|
cupy-cuda80>=5.0.0b4,<9.0.0
|
||||||
cuda90 =
|
cuda90 =
|
||||||
cupy-cuda90>=5.0.0b4
|
cupy-cuda90>=5.0.0b4,<9.0.0
|
||||||
cuda91 =
|
cuda91 =
|
||||||
cupy-cuda91>=5.0.0b4
|
cupy-cuda91>=5.0.0b4,<9.0.0
|
||||||
cuda92 =
|
cuda92 =
|
||||||
cupy-cuda92>=5.0.0b4
|
cupy-cuda92>=5.0.0b4,<9.0.0
|
||||||
cuda100 =
|
cuda100 =
|
||||||
cupy-cuda100>=5.0.0b4
|
cupy-cuda100>=5.0.0b4,<9.0.0
|
||||||
|
cuda101 =
|
||||||
|
cupy-cuda101>=5.0.0b4,<9.0.0
|
||||||
|
cuda102 =
|
||||||
|
cupy-cuda102>=5.0.0b4,<9.0.0
|
||||||
# Language tokenizers with external dependencies
|
# Language tokenizers with external dependencies
|
||||||
ja =
|
ja =
|
||||||
fugashi>=0.1.3
|
fugashi>=0.1.3
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -31,7 +31,6 @@ PACKAGES = find_packages()
|
||||||
|
|
||||||
|
|
||||||
MOD_NAMES = [
|
MOD_NAMES = [
|
||||||
"spacy._align",
|
|
||||||
"spacy.parts_of_speech",
|
"spacy.parts_of_speech",
|
||||||
"spacy.strings",
|
"spacy.strings",
|
||||||
"spacy.lexeme",
|
"spacy.lexeme",
|
||||||
|
|
|
@ -13,7 +13,7 @@ from . import pipeline
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info as cli_info
|
||||||
from .glossary import explain
|
from .glossary import explain
|
||||||
from .about import __version__
|
from .about import __version__
|
||||||
from .errors import Errors, Warnings, deprecation_warning
|
from .errors import Errors, Warnings
|
||||||
from . import util
|
from . import util
|
||||||
from .util import registry
|
from .util import registry
|
||||||
from .language import component
|
from .language import component
|
||||||
|
@ -26,7 +26,7 @@ if sys.maxunicode == 65535:
|
||||||
def load(name, **overrides):
|
def load(name, **overrides):
|
||||||
depr_path = overrides.get("path")
|
depr_path = overrides.get("path")
|
||||||
if depr_path not in (True, False, None):
|
if depr_path not in (True, False, None):
|
||||||
deprecation_warning(Warnings.W001.format(path=depr_path))
|
warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
|
||||||
return util.load_model(name, **overrides)
|
return util.load_model(name, **overrides)
|
||||||
|
|
||||||
|
|
||||||
|
|
255
spacy/_align.pyx
255
spacy/_align.pyx
|
@ -1,255 +0,0 @@
|
||||||
# cython: infer_types=True
|
|
||||||
'''Do Levenshtein alignment, for evaluation of tokenized input.
|
|
||||||
|
|
||||||
Random notes:
|
|
||||||
|
|
||||||
r i n g
|
|
||||||
0 1 2 3 4
|
|
||||||
r 1 0 1 2 3
|
|
||||||
a 2 1 1 2 3
|
|
||||||
n 3 2 2 1 2
|
|
||||||
g 4 3 3 2 1
|
|
||||||
|
|
||||||
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
|
|
||||||
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
|
|
||||||
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
|
|
||||||
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
|
|
||||||
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
|
|
||||||
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
|
|
||||||
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
|
|
||||||
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
|
|
||||||
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
|
|
||||||
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
|
|
||||||
2,2: (3,3)
|
|
||||||
3,2: (4,3)
|
|
||||||
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
|
|
||||||
|
|
||||||
We know the costs to transition:
|
|
||||||
|
|
||||||
S[:i] -> T[:j] (at D[i,j])
|
|
||||||
S[:i+1] -> T[:j] (at D[i+1,j])
|
|
||||||
S[:i] -> T[:j+1] (at D[i,j+1])
|
|
||||||
|
|
||||||
Further, now we can transform:
|
|
||||||
S[:i+1] -> S[:i] (DEL) for 1,
|
|
||||||
T[:j+1] -> T[:j] (INS) for 1.
|
|
||||||
S[i+1] -> T[j+1] (SUB) for 0 or 1
|
|
||||||
|
|
||||||
Therefore we have the costs:
|
|
||||||
SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->T[j])
|
|
||||||
i.e. D[i, j] + S[i+1] != T[j+1]
|
|
||||||
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
|
|
||||||
i.e. D[i+1,j] + 1
|
|
||||||
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
|
|
||||||
i.e. D[i,j+1] + 1
|
|
||||||
|
|
||||||
Source string S has length m, with index i
|
|
||||||
Target string T has length n, with index j
|
|
||||||
|
|
||||||
Output two alignment vectors: i2j (length m) and j2i (length n)
|
|
||||||
# function LevenshteinDistance(char s[1..m], char t[1..n]):
|
|
||||||
# for all i and j, d[i,j] will hold the Levenshtein distance between
|
|
||||||
# the first i characters of s and the first j characters of t
|
|
||||||
# note that d has (m+1)*(n+1) values
|
|
||||||
# set each element in d to zero
|
|
||||||
ring rang
|
|
||||||
- r i n g
|
|
||||||
- 0 0 0 0 0
|
|
||||||
r 0 0 0 0 0
|
|
||||||
a 0 0 0 0 0
|
|
||||||
n 0 0 0 0 0
|
|
||||||
g 0 0 0 0 0
|
|
||||||
|
|
||||||
# source prefixes can be transformed into empty string by
|
|
||||||
# dropping all characters
|
|
||||||
# d[i, 0] := i
|
|
||||||
ring rang
|
|
||||||
- r i n g
|
|
||||||
- 0 0 0 0 0
|
|
||||||
r 1 0 0 0 0
|
|
||||||
a 2 0 0 0 0
|
|
||||||
n 3 0 0 0 0
|
|
||||||
g 4 0 0 0 0
|
|
||||||
|
|
||||||
# target prefixes can be reached from empty source prefix
|
|
||||||
# by inserting every character
|
|
||||||
# d[0, j] := j
|
|
||||||
- r i n g
|
|
||||||
- 0 1 2 3 4
|
|
||||||
r 1 0 0 0 0
|
|
||||||
a 2 0 0 0 0
|
|
||||||
n 3 0 0 0 0
|
|
||||||
g 4 0 0 0 0
|
|
||||||
|
|
||||||
'''
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
import numpy
|
|
||||||
cimport numpy as np
|
|
||||||
from .compat import unicode_
|
|
||||||
from murmurhash.mrmr cimport hash32
|
|
||||||
|
|
||||||
|
|
||||||
def align(S, T):
|
|
||||||
cdef int m = len(S)
|
|
||||||
cdef int n = len(T)
|
|
||||||
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
|
|
||||||
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
|
|
||||||
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
|
|
||||||
|
|
||||||
cdef np.ndarray S_arr = _convert_sequence(S)
|
|
||||||
cdef np.ndarray T_arr = _convert_sequence(T)
|
|
||||||
|
|
||||||
fill_matrix(<int*>matrix.data,
|
|
||||||
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
|
|
||||||
fill_i2j(i2j, matrix)
|
|
||||||
fill_j2i(j2i, matrix)
|
|
||||||
for i in range(i2j.shape[0]):
|
|
||||||
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
|
|
||||||
i2j[i] = -1
|
|
||||||
for j in range(j2i.shape[0]):
|
|
||||||
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
|
|
||||||
j2i[j] = -1
|
|
||||||
return matrix[-1,-1], i2j, j2i, matrix
|
|
||||||
|
|
||||||
|
|
||||||
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
|
|
||||||
'''Let's say we had:
|
|
||||||
|
|
||||||
Guess: [aa bb cc dd]
|
|
||||||
Truth: [aa bbcc dd]
|
|
||||||
i2j: [0, None, -2, 2]
|
|
||||||
j2i: [0, -2, 3]
|
|
||||||
|
|
||||||
We want:
|
|
||||||
|
|
||||||
i2j_multi: {1: 1, 2: 1}
|
|
||||||
j2i_multi: {}
|
|
||||||
'''
|
|
||||||
i2j_miss = _get_regions(i2j, i_lengths)
|
|
||||||
j2i_miss = _get_regions(j2i, j_lengths)
|
|
||||||
|
|
||||||
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
|
|
||||||
return i2j_multi, j2i_multi
|
|
||||||
|
|
||||||
|
|
||||||
def _get_regions(alignment, lengths):
|
|
||||||
regions = {}
|
|
||||||
start = None
|
|
||||||
offset = 0
|
|
||||||
for i in range(len(alignment)):
|
|
||||||
if alignment[i] < 0:
|
|
||||||
if start is None:
|
|
||||||
start = offset
|
|
||||||
regions.setdefault(start, [])
|
|
||||||
regions[start].append(i)
|
|
||||||
else:
|
|
||||||
start = None
|
|
||||||
offset += lengths[i]
|
|
||||||
return regions
|
|
||||||
|
|
||||||
|
|
||||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
|
|
||||||
i2j = {}
|
|
||||||
j2i = {}
|
|
||||||
for start, region1 in miss1.items():
|
|
||||||
if not region1 or start not in miss2:
|
|
||||||
continue
|
|
||||||
region2 = miss2[start]
|
|
||||||
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
|
|
||||||
j = region2.pop(0)
|
|
||||||
buff = []
|
|
||||||
# Consume tokens from region 1, until we meet the length of the
|
|
||||||
# first token in region2. If we do, align the tokens. If
|
|
||||||
# we exceed the length, break.
|
|
||||||
while region1:
|
|
||||||
buff.append(region1.pop(0))
|
|
||||||
if sum(lengths1[i] for i in buff) == lengths2[j]:
|
|
||||||
for i in buff:
|
|
||||||
i2j[i] = j
|
|
||||||
j2i[j] = buff[-1]
|
|
||||||
j += 1
|
|
||||||
buff = []
|
|
||||||
elif sum(lengths1[i] for i in buff) > lengths2[j]:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
|
|
||||||
for i in buff:
|
|
||||||
i2j[i] = j
|
|
||||||
j2i[j] = buff[-1]
|
|
||||||
return i2j, j2i
|
|
||||||
|
|
||||||
|
|
||||||
def _convert_sequence(seq):
|
|
||||||
if isinstance(seq, numpy.ndarray):
|
|
||||||
return numpy.ascontiguousarray(seq, dtype='uint32_t')
|
|
||||||
cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
|
|
||||||
cdef bytes item_bytes
|
|
||||||
for i, item in enumerate(seq):
|
|
||||||
if item == "``":
|
|
||||||
item = '"'
|
|
||||||
elif item == "''":
|
|
||||||
item = '"'
|
|
||||||
if isinstance(item, unicode):
|
|
||||||
item_bytes = item.encode('utf8')
|
|
||||||
else:
|
|
||||||
item_bytes = item
|
|
||||||
output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
cdef void fill_matrix(int* D,
|
|
||||||
const int* S, int m, const int* T, int n) nogil:
|
|
||||||
m1 = m+1
|
|
||||||
n1 = n+1
|
|
||||||
for i in range(m1*n1):
|
|
||||||
D[i] = 0
|
|
||||||
|
|
||||||
for i in range(m1):
|
|
||||||
D[i*n1] = i
|
|
||||||
|
|
||||||
for j in range(n1):
|
|
||||||
D[j] = j
|
|
||||||
|
|
||||||
cdef int sub_cost, ins_cost, del_cost
|
|
||||||
for j in range(n):
|
|
||||||
for i in range(m):
|
|
||||||
i_j = i*n1 + j
|
|
||||||
i1_j1 = (i+1)*n1 + j+1
|
|
||||||
i1_j = (i+1)*n1 + j
|
|
||||||
i_j1 = i*n1 + j+1
|
|
||||||
if S[i] != T[j]:
|
|
||||||
sub_cost = D[i_j] + 1
|
|
||||||
else:
|
|
||||||
sub_cost = D[i_j]
|
|
||||||
del_cost = D[i_j1] + 1
|
|
||||||
ins_cost = D[i1_j] + 1
|
|
||||||
best = min(min(sub_cost, ins_cost), del_cost)
|
|
||||||
D[i1_j1] = best
|
|
||||||
|
|
||||||
|
|
||||||
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
|
|
||||||
j = D.shape[1]-2
|
|
||||||
cdef int i = D.shape[0]-2
|
|
||||||
while i >= 0:
|
|
||||||
while D[i+1, j] < D[i+1, j+1]:
|
|
||||||
j -= 1
|
|
||||||
if D[i, j+1] < D[i+1, j+1]:
|
|
||||||
i2j[i] = -1
|
|
||||||
else:
|
|
||||||
i2j[i] = j
|
|
||||||
j -= 1
|
|
||||||
i -= 1
|
|
||||||
|
|
||||||
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
|
|
||||||
i = D.shape[0]-2
|
|
||||||
cdef int j = D.shape[1]-2
|
|
||||||
while j >= 0:
|
|
||||||
while D[i, j+1] < D[i+1, j+1]:
|
|
||||||
i -= 1
|
|
||||||
if D[i+1, j] < D[i+1, j+1]:
|
|
||||||
j2i[j] = -1
|
|
||||||
else:
|
|
||||||
j2i[j] = i
|
|
||||||
i -= 1
|
|
||||||
j -= 1
|
|
15
spacy/_ml.py
15
spacy/_ml.py
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
import warnings
|
||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
||||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||||
from thinc.t2v import Pooling, sum_pool, mean_pool
|
from thinc.t2v import Pooling, sum_pool, mean_pool
|
||||||
|
@ -22,7 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||||
import thinc.extra.load_nlp
|
import thinc.extra.load_nlp
|
||||||
|
|
||||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
from .errors import Errors, user_warning, Warnings
|
from .errors import Errors, Warnings
|
||||||
from . import util
|
from . import util
|
||||||
from . import ml as new_ml
|
from . import ml as new_ml
|
||||||
from .ml import _legacy_tok2vec
|
from .ml import _legacy_tok2vec
|
||||||
|
@ -283,13 +284,13 @@ def link_vectors_to_models(vocab):
|
||||||
if vectors.name is None:
|
if vectors.name is None:
|
||||||
vectors.name = VECTORS_KEY
|
vectors.name = VECTORS_KEY
|
||||||
if vectors.data.size != 0:
|
if vectors.data.size != 0:
|
||||||
user_warning(Warnings.W020.format(shape=vectors.data.shape))
|
warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
for word in vocab:
|
for word in vocab:
|
||||||
if word.orth in vectors.key2row:
|
if word.orth in vectors.key2row:
|
||||||
word.rank = vectors.key2row[word.orth]
|
word.rank = vectors.key2row[word.orth]
|
||||||
else:
|
else:
|
||||||
word.rank = 0
|
word.rank = util.OOV_RANK
|
||||||
data = ops.asarray(vectors.data)
|
data = ops.asarray(vectors.data)
|
||||||
# Set an entry here, so that vectors are accessed by StaticVectors
|
# Set an entry here, so that vectors are accessed by StaticVectors
|
||||||
# (unideal, I know)
|
# (unideal, I know)
|
||||||
|
@ -299,7 +300,7 @@ def link_vectors_to_models(vocab):
|
||||||
# This is a hack to avoid the problem in #3853.
|
# This is a hack to avoid the problem in #3853.
|
||||||
old_name = vectors.name
|
old_name = vectors.name
|
||||||
new_name = vectors.name + "_%d" % data.shape[0]
|
new_name = vectors.name + "_%d" % data.shape[0]
|
||||||
user_warning(Warnings.W019.format(old=old_name, new=new_name))
|
warnings.warn(Warnings.W019.format(old=old_name, new=new_name))
|
||||||
vectors.name = new_name
|
vectors.name = new_name
|
||||||
key = (ops.device, vectors.name)
|
key = (ops.device, vectors.name)
|
||||||
thinc.extra.load_nlp.VECTORS[key] = data
|
thinc.extra.load_nlp.VECTORS[key] = data
|
||||||
|
@ -693,9 +694,11 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
)
|
)
|
||||||
|
|
||||||
linear_model = build_bow_text_classifier(
|
linear_model = build_bow_text_classifier(
|
||||||
nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
|
nr_class,
|
||||||
|
ngram_size=cfg.get("ngram_size", 1),
|
||||||
|
exclusive_classes=cfg.get("exclusive_classes", False),
|
||||||
)
|
)
|
||||||
if cfg.get("exclusive_classes"):
|
if cfg.get("exclusive_classes", False):
|
||||||
output_layer = Softmax(nr_class, nr_class * 2)
|
output_layer = Softmax(nr_class, nr_class * 2)
|
||||||
else:
|
else:
|
||||||
output_layer = (
|
output_layer = (
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "2.2.4.dev0"
|
__version__ = "2.2.4"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from .tokens import Doc, Token, Span
|
from .tokens import Doc, Token, Span
|
||||||
from .errors import Errors, Warnings, user_warning
|
from .errors import Errors, Warnings
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
||||||
|
@ -34,7 +36,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
||||||
if not fulfilled:
|
if not fulfilled:
|
||||||
problems.append(annot)
|
problems.append(annot)
|
||||||
if warn:
|
if warn:
|
||||||
user_warning(Warnings.W025.format(name=name, attr=annot))
|
warnings.warn(Warnings.W025.format(name=name, attr=annot))
|
||||||
return problems
|
return problems
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -94,3 +94,4 @@ cdef enum attr_id_t:
|
||||||
ENT_ID = symbols.ENT_ID
|
ENT_ID = symbols.ENT_ID
|
||||||
|
|
||||||
IDX
|
IDX
|
||||||
|
SENT_END
|
|
@ -88,6 +88,7 @@ IDS = {
|
||||||
"ENT_KB_ID": ENT_KB_ID,
|
"ENT_KB_ID": ENT_KB_ID,
|
||||||
"HEAD": HEAD,
|
"HEAD": HEAD,
|
||||||
"SENT_START": SENT_START,
|
"SENT_START": SENT_START,
|
||||||
|
"SENT_END": SENT_END,
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"PROB": PROB,
|
"PROB": PROB,
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
|
|
|
@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
|
# fmt: off
|
||||||
lang=("model language", "positional", None, str),
|
lang=("model language", "positional", None, str),
|
||||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||||
base_model=("name of model to update (optional)", "option", "b", str),
|
base_model=("name of model to update (optional)", "option", "b", str),
|
||||||
pipeline=(
|
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
|
||||||
"Comma-separated names of pipeline components to train",
|
|
||||||
"option",
|
|
||||||
"p",
|
|
||||||
str,
|
|
||||||
),
|
|
||||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||||
|
# fmt: on
|
||||||
)
|
)
|
||||||
def debug_data(
|
def debug_data(
|
||||||
lang,
|
lang,
|
||||||
|
@ -111,9 +108,11 @@ def debug_data(
|
||||||
msg.good("Corpus is loadable")
|
msg.good("Corpus is loadable")
|
||||||
|
|
||||||
# Create all gold data here to avoid iterating over the train_docs constantly
|
# Create all gold data here to avoid iterating over the train_docs constantly
|
||||||
gold_train_data = _compile_gold(train_docs, pipeline)
|
gold_train_data = _compile_gold(train_docs, pipeline, nlp)
|
||||||
gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
|
gold_train_unpreprocessed_data = _compile_gold(
|
||||||
gold_dev_data = _compile_gold(dev_docs, pipeline)
|
train_docs_unpreprocessed, pipeline, nlp
|
||||||
|
)
|
||||||
|
gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
|
||||||
|
|
||||||
train_texts = gold_train_data["texts"]
|
train_texts = gold_train_data["texts"]
|
||||||
dev_texts = gold_dev_data["texts"]
|
dev_texts = gold_dev_data["texts"]
|
||||||
|
@ -185,6 +184,16 @@ def debug_data(
|
||||||
nlp.vocab.vectors_length,
|
nlp.vocab.vectors_length,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||||
|
msg.warn(
|
||||||
|
"{} words in training data without vectors ({:0.2f}%)".format(
|
||||||
|
n_missing_vectors,
|
||||||
|
n_missing_vectors / gold_train_data["n_words"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
msg.text(
|
||||||
|
"10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the model")
|
msg.info("No word vectors present in the model")
|
||||||
|
|
||||||
|
@ -235,13 +244,17 @@ def debug_data(
|
||||||
|
|
||||||
if gold_train_data["ws_ents"]:
|
if gold_train_data["ws_ents"]:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
|
"{} invalid whitespace entity span(s)".format(
|
||||||
|
gold_train_data["ws_ents"]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
has_ws_ents_error = True
|
has_ws_ents_error = True
|
||||||
|
|
||||||
if gold_train_data["punct_ents"]:
|
if gold_train_data["punct_ents"]:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
|
"{} entity span(s) with punctuation".format(
|
||||||
|
gold_train_data["punct_ents"]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
has_punct_ents_warning = True
|
has_punct_ents_warning = True
|
||||||
|
|
||||||
|
@ -561,7 +574,7 @@ def _load_file(file_path, msg):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _compile_gold(train_docs, pipeline):
|
def _compile_gold(train_docs, pipeline, nlp):
|
||||||
data = {
|
data = {
|
||||||
"ner": Counter(),
|
"ner": Counter(),
|
||||||
"cats": Counter(),
|
"cats": Counter(),
|
||||||
|
@ -573,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
|
||||||
"punct_ents": 0,
|
"punct_ents": 0,
|
||||||
"n_words": 0,
|
"n_words": 0,
|
||||||
"n_misaligned_words": 0,
|
"n_misaligned_words": 0,
|
||||||
|
"words_missing_vectors": Counter(),
|
||||||
"n_sents": 0,
|
"n_sents": 0,
|
||||||
"n_nonproj": 0,
|
"n_nonproj": 0,
|
||||||
"n_cycles": 0,
|
"n_cycles": 0,
|
||||||
|
@ -585,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
|
||||||
data["n_words"] += len(valid_words)
|
data["n_words"] += len(valid_words)
|
||||||
data["n_misaligned_words"] += len(gold.words) - len(valid_words)
|
data["n_misaligned_words"] += len(gold.words) - len(valid_words)
|
||||||
data["texts"].add(doc.text)
|
data["texts"].add(doc.text)
|
||||||
|
if len(nlp.vocab.vectors):
|
||||||
|
for word in valid_words:
|
||||||
|
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
||||||
|
data["words_missing_vectors"].update([word])
|
||||||
if "ner" in pipeline:
|
if "ner" in pipeline:
|
||||||
for i, label in enumerate(gold.ner):
|
for i, label in enumerate(gold.ner):
|
||||||
if label is None:
|
if label is None:
|
||||||
|
@ -592,7 +610,13 @@ def _compile_gold(train_docs, pipeline):
|
||||||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||||
# "Illegal" whitespace entity
|
# "Illegal" whitespace entity
|
||||||
data["ws_ents"] += 1
|
data["ws_ents"] += 1
|
||||||
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
|
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
|
||||||
|
".",
|
||||||
|
"'",
|
||||||
|
"!",
|
||||||
|
"?",
|
||||||
|
",",
|
||||||
|
]:
|
||||||
# punctuation entity: could be replaced by whitespace when training with noise,
|
# punctuation entity: could be replaced by whitespace when training with noise,
|
||||||
# so add a warning to alert the user to this unexpected side effect.
|
# so add a warning to alert the user to this unexpected side effect.
|
||||||
data["punct_ents"] += 1
|
data["punct_ents"] += 1
|
||||||
|
@ -629,7 +653,11 @@ def _format_labels(labels, counts=False):
|
||||||
def _get_examples_without_label(data, label):
|
def _get_examples_without_label(data, label):
|
||||||
count = 0
|
count = 0
|
||||||
for doc, gold in data:
|
for doc, gold in data:
|
||||||
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
|
labels = [
|
||||||
|
label.split("-")[1]
|
||||||
|
for label in gold.ner
|
||||||
|
if label is not None and label not in ("O", "-")
|
||||||
|
]
|
||||||
if label not in labels:
|
if label not in labels:
|
||||||
count += 1
|
count += 1
|
||||||
return count
|
return count
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
|
import spacy
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
|
@ -43,6 +44,9 @@ def evaluate(
|
||||||
if displacy_path and not displacy_path.exists():
|
if displacy_path and not displacy_path.exists():
|
||||||
msg.fail("Visualization output directory not found", displacy_path, exits=1)
|
msg.fail("Visualization output directory not found", displacy_path, exits=1)
|
||||||
corpus = GoldCorpus(data_path, data_path)
|
corpus = GoldCorpus(data_path, data_path)
|
||||||
|
if model.startswith("blank:"):
|
||||||
|
nlp = spacy.blank(model.replace("blank:", ""))
|
||||||
|
else:
|
||||||
nlp = util.load_model(model)
|
nlp = util.load_model(model)
|
||||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||||
begin = timer()
|
begin = timer()
|
||||||
|
|
|
@ -12,11 +12,12 @@ import tarfile
|
||||||
import gzip
|
import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
import srsly
|
import srsly
|
||||||
|
import warnings
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings
|
||||||
from ..util import ensure_path, get_lang_class
|
from ..util import ensure_path, get_lang_class, OOV_RANK
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ftfy
|
import ftfy
|
||||||
|
@ -34,6 +35,12 @@ DEFAULT_OOV_PROB = -20
|
||||||
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
||||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
||||||
|
truncate_vectors=(
|
||||||
|
"Optional number of vectors to truncate to when reading in vectors file",
|
||||||
|
"option",
|
||||||
|
"t",
|
||||||
|
int,
|
||||||
|
),
|
||||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||||
vectors_name=(
|
vectors_name=(
|
||||||
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
||||||
|
@ -50,6 +57,7 @@ def init_model(
|
||||||
clusters_loc=None,
|
clusters_loc=None,
|
||||||
jsonl_loc=None,
|
jsonl_loc=None,
|
||||||
vectors_loc=None,
|
vectors_loc=None,
|
||||||
|
truncate_vectors=0,
|
||||||
prune_vectors=-1,
|
prune_vectors=-1,
|
||||||
vectors_name=None,
|
vectors_name=None,
|
||||||
model_name=None,
|
model_name=None,
|
||||||
|
@ -87,7 +95,7 @@ def init_model(
|
||||||
nlp = create_model(lang, lex_attrs, name=model_name)
|
nlp = create_model(lang, lex_attrs, name=model_name)
|
||||||
msg.good("Successfully created model")
|
msg.good("Successfully created model")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
|
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
|
||||||
vec_added = len(nlp.vocab.vectors)
|
vec_added = len(nlp.vocab.vectors)
|
||||||
lex_added = len(nlp.vocab)
|
lex_added = len(nlp.vocab)
|
||||||
msg.good(
|
msg.good(
|
||||||
|
@ -148,7 +156,7 @@ def create_model(lang, lex_attrs, name=None):
|
||||||
lang_class = get_lang_class(lang)
|
lang_class = get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
for lexeme in nlp.vocab:
|
for lexeme in nlp.vocab:
|
||||||
lexeme.rank = 0
|
lexeme.rank = OOV_RANK
|
||||||
lex_added = 0
|
lex_added = 0
|
||||||
for attrs in lex_attrs:
|
for attrs in lex_attrs:
|
||||||
if "settings" in attrs:
|
if "settings" in attrs:
|
||||||
|
@ -168,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||||
|
@ -178,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
||||||
else:
|
else:
|
||||||
if vectors_loc:
|
if vectors_loc:
|
||||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
|
||||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = (None, None)
|
vectors_data, vector_keys = (None, None)
|
||||||
|
@ -198,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
||||||
nlp.vocab.prune_vectors(prune_vectors)
|
nlp.vocab.prune_vectors(prune_vectors)
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc):
|
def read_vectors(vectors_loc, truncate_vectors=0):
|
||||||
f = open_file(vectors_loc)
|
f = open_file(vectors_loc)
|
||||||
shape = tuple(int(size) for size in next(f).split())
|
shape = tuple(int(size) for size in next(f).split())
|
||||||
|
if truncate_vectors >= 1:
|
||||||
|
shape = (truncate_vectors, shape[1])
|
||||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||||
vectors_keys = []
|
vectors_keys = []
|
||||||
for i, line in enumerate(tqdm(f)):
|
for i, line in enumerate(tqdm(f)):
|
||||||
|
@ -211,6 +221,8 @@ def read_vectors(vectors_loc):
|
||||||
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||||
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||||
vectors_keys.append(word)
|
vectors_keys.append(word)
|
||||||
|
if i == truncate_vectors - 1:
|
||||||
|
break
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
|
|
||||||
|
|
||||||
|
@ -246,7 +258,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
warnings.warn(Warnings.W004)
|
||||||
with clusters_loc.open() as f:
|
with clusters_loc.open() as f:
|
||||||
for line in tqdm(f):
|
for line in tqdm(f):
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -225,7 +225,9 @@ def train(
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
msg.text("Extending component from base model '{}'".format(pipe))
|
msg.text("Extending component from base model '{}'".format(pipe))
|
||||||
disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
|
disabled_pipes = nlp.disable_pipes(
|
||||||
|
[p for p in nlp.pipe_names if p not in pipeline]
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
msg.text("Starting with blank model '{}'".format(lang))
|
msg.text("Starting with blank model '{}'".format(lang))
|
||||||
lang_cls = util.get_lang_class(lang)
|
lang_cls = util.get_lang_class(lang)
|
||||||
|
@ -361,7 +363,7 @@ def train(
|
||||||
if len(textcat_labels) == 2:
|
if len(textcat_labels) == 2:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"If the textcat component is a binary classifier with "
|
"If the textcat component is a binary classifier with "
|
||||||
"exclusive classes, provide '--textcat_positive_label' for "
|
"exclusive classes, provide '--textcat-positive-label' for "
|
||||||
"an evaluation on the positive class."
|
"an evaluation on the positive class."
|
||||||
)
|
)
|
||||||
msg.text(
|
msg.text(
|
||||||
|
@ -415,10 +417,10 @@ def train(
|
||||||
losses=losses,
|
losses=losses,
|
||||||
)
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.warn("Error during training")
|
err = "Error during training"
|
||||||
if init_tok2vec:
|
if init_tok2vec:
|
||||||
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
|
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
|
||||||
msg.fail("Original error message: {}".format(e), exits=1)
|
msg.fail(err, "Original error message: {}".format(e), exits=1)
|
||||||
if raw_text:
|
if raw_text:
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
# If raw text is available, perform 'rehearsal' updates,
|
||||||
# which use unlabelled data to reduce overfitting.
|
# which use unlabelled data to reduce overfitting.
|
||||||
|
@ -452,6 +454,9 @@ def train(
|
||||||
cpu_wps = nwords / (end_time - start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
else:
|
else:
|
||||||
gpu_wps = nwords / (end_time - start_time)
|
gpu_wps = nwords / (end_time - start_time)
|
||||||
|
# Only evaluate on CPU in the first iteration (for
|
||||||
|
# timing) if GPU is enabled
|
||||||
|
if i == 0:
|
||||||
with Model.use_device("cpu"):
|
with Model.use_device("cpu"):
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
for name, component in nlp_loaded.pipeline:
|
for name, component in nlp_loaded.pipeline:
|
||||||
|
@ -546,7 +551,11 @@ def train(
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e))
|
msg.warn(
|
||||||
|
"Aborting and saving the final best model. "
|
||||||
|
"Encountered exception: {}".format(e),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
best_pipes = nlp.pipe_names
|
best_pipes = nlp.pipe_names
|
||||||
if disabled_pipes:
|
if disabled_pipes:
|
||||||
|
@ -561,15 +570,25 @@ def train(
|
||||||
final_meta.setdefault("speed", {})
|
final_meta.setdefault("speed", {})
|
||||||
final_meta["speed"].setdefault("cpu", None)
|
final_meta["speed"].setdefault("cpu", None)
|
||||||
final_meta["speed"].setdefault("gpu", None)
|
final_meta["speed"].setdefault("gpu", None)
|
||||||
|
meta.setdefault("speed", {})
|
||||||
|
meta["speed"].setdefault("cpu", None)
|
||||||
|
meta["speed"].setdefault("gpu", None)
|
||||||
# combine cpu and gpu speeds with the base model speeds
|
# combine cpu and gpu speeds with the base model speeds
|
||||||
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
||||||
speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
|
speed = _get_total_speed(
|
||||||
|
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
|
||||||
|
)
|
||||||
final_meta["speed"]["cpu"] = speed
|
final_meta["speed"]["cpu"] = speed
|
||||||
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
||||||
speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
|
speed = _get_total_speed(
|
||||||
|
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
|
||||||
|
)
|
||||||
final_meta["speed"]["gpu"] = speed
|
final_meta["speed"]["gpu"] = speed
|
||||||
# if there were no speeds to update, overwrite with meta
|
# if there were no speeds to update, overwrite with meta
|
||||||
if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None:
|
if (
|
||||||
|
final_meta["speed"]["cpu"] is None
|
||||||
|
and final_meta["speed"]["gpu"] is None
|
||||||
|
):
|
||||||
final_meta["speed"].update(meta["speed"])
|
final_meta["speed"].update(meta["speed"])
|
||||||
# note: beam speeds are not combined with the base model
|
# note: beam speeds are not combined with the base model
|
||||||
if has_beam_widths:
|
if has_beam_widths:
|
||||||
|
@ -661,6 +680,8 @@ def _find_best(experiment_dir, component):
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||||
accs = srsly.read_json(epoch_model / "accuracy.json")
|
accs = srsly.read_json(epoch_model / "accuracy.json")
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||||
|
# remove per_type dicts from score list for max() comparison
|
||||||
|
scores = [score for score in scores if isinstance(score, float)]
|
||||||
accuracies.append((scores, epoch_model))
|
accuracies.append((scores, epoch_model))
|
||||||
if accuracies:
|
if accuracies:
|
||||||
return max(accuracies)[1]
|
return max(accuracies)[1]
|
||||||
|
|
|
@ -7,10 +7,12 @@ USAGE: https://spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .render import DependencyRenderer, EntityRenderer
|
from .render import DependencyRenderer, EntityRenderer
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..compat import b_to_str
|
from ..compat import b_to_str
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings
|
||||||
from ..util import is_in_jupyter
|
from ..util import is_in_jupyter
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,7 +91,7 @@ def serve(
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
|
|
||||||
if is_in_jupyter():
|
if is_in_jupyter():
|
||||||
user_warning(Warnings.W011)
|
warnings.warn(Warnings.W011)
|
||||||
|
|
||||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||||
httpd = simple_server.make_server(host, port, app)
|
httpd = simple_server.make_server(host, port, app)
|
||||||
|
@ -119,7 +121,7 @@ def parse_deps(orig_doc, options={}):
|
||||||
"""
|
"""
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
warnings.warn(Warnings.W005)
|
||||||
if options.get("collapse_phrases", False):
|
if options.get("collapse_phrases", False):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for np in list(doc.noun_chunks):
|
for np in list(doc.noun_chunks):
|
||||||
|
@ -146,9 +148,14 @@ def parse_deps(orig_doc, options={}):
|
||||||
retokenizer.merge(span, attrs=attrs)
|
retokenizer.merge(span, attrs=attrs)
|
||||||
fine_grained = options.get("fine_grained")
|
fine_grained = options.get("fine_grained")
|
||||||
add_lemma = options.get("add_lemma")
|
add_lemma = options.get("add_lemma")
|
||||||
words = [{"text": w.text,
|
words = [
|
||||||
|
{
|
||||||
|
"text": w.text,
|
||||||
"tag": w.tag_ if fine_grained else w.pos_,
|
"tag": w.tag_ if fine_grained else w.pos_,
|
||||||
"lemma": w.lemma_ if add_lemma else None} for w in doc]
|
"lemma": w.lemma_ if add_lemma else None,
|
||||||
|
}
|
||||||
|
for w in doc
|
||||||
|
]
|
||||||
|
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
|
@ -179,7 +186,7 @@ def parse_ents(doc, options={}):
|
||||||
for ent in doc.ents
|
for ent in doc.ents
|
||||||
]
|
]
|
||||||
if not ents:
|
if not ents:
|
||||||
user_warning(Warnings.W006)
|
warnings.warn(Warnings.W006)
|
||||||
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||||
settings = get_doc_settings(doc)
|
settings = get_doc_settings(doc)
|
||||||
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
|
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
|
||||||
|
|
|
@ -3,7 +3,13 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
|
from .templates import (
|
||||||
|
TPL_DEP_SVG,
|
||||||
|
TPL_DEP_WORDS,
|
||||||
|
TPL_DEP_WORDS_LEMMA,
|
||||||
|
TPL_DEP_ARCS,
|
||||||
|
TPL_ENTS,
|
||||||
|
)
|
||||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from ..util import minify_html, escape_html, registry
|
from ..util import minify_html, escape_html, registry
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
@ -83,7 +89,10 @@ class DependencyRenderer(object):
|
||||||
self.width = self.offset_x + len(words) * self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y + 3 * self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
|
words = [
|
||||||
|
self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
|
||||||
|
for i, w in enumerate(words)
|
||||||
|
]
|
||||||
arcs = [
|
arcs = [
|
||||||
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
for i, a in enumerate(arcs)
|
for i, a in enumerate(arcs)
|
||||||
|
@ -101,7 +110,9 @@ class DependencyRenderer(object):
|
||||||
lang=self.lang,
|
lang=self.lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, lemma, i,):
|
def render_word(
|
||||||
|
self, text, tag, lemma, i,
|
||||||
|
):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
|
||||||
text (unicode): Word text.
|
text (unicode): Word text.
|
||||||
|
@ -115,7 +126,9 @@ class DependencyRenderer(object):
|
||||||
x = self.width - x
|
x = self.width - x
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
if lemma is not None:
|
if lemma is not None:
|
||||||
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
|
return TPL_DEP_WORDS_LEMMA.format(
|
||||||
|
text=html_text, tag=tag, lemma=lemma, x=x, y=y
|
||||||
|
)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
|
|
|
@ -1,11 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import os
|
|
||||||
import warnings
|
|
||||||
import inspect
|
|
||||||
|
|
||||||
|
|
||||||
def add_codes(err_cls):
|
def add_codes(err_cls):
|
||||||
"""Add error codes to string messages via class attribute names."""
|
"""Add error codes to string messages via class attribute names."""
|
||||||
|
|
||||||
|
@ -93,8 +88,7 @@ class Warnings(object):
|
||||||
W022 = ("Training a new part-of-speech tagger using a model with no "
|
W022 = ("Training a new part-of-speech tagger using a model with no "
|
||||||
"lemmatization rules or data. This means that the trained model "
|
"lemmatization rules or data. This means that the trained model "
|
||||||
"may not be able to lemmatize correctly. If this is intentional "
|
"may not be able to lemmatize correctly. If this is intentional "
|
||||||
"or the language you're using doesn't have lemmatization data, "
|
"or the language you're using doesn't have lemmatization data. "
|
||||||
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
|
|
||||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||||
"package installed.")
|
"package installed.")
|
||||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
||||||
|
@ -110,7 +104,8 @@ class Warnings(object):
|
||||||
W028 = ("Doc.from_array was called with a vector of type '{type}', "
|
W028 = ("Doc.from_array was called with a vector of type '{type}', "
|
||||||
"but is expecting one of type 'uint64' instead. This may result "
|
"but is expecting one of type 'uint64' instead. This may result "
|
||||||
"in problems with the vocab further on in the pipeline.")
|
"in problems with the vocab further on in the pipeline.")
|
||||||
|
W029 = ("Unable to align tokens with entities from character offsets. "
|
||||||
|
"Discarding entity annotation for the text: {text}.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -552,6 +547,14 @@ class Errors(object):
|
||||||
"array.")
|
"array.")
|
||||||
E191 = ("Invalid head: the head token must be from the same doc as the "
|
E191 = ("Invalid head: the head token must be from the same doc as the "
|
||||||
"token itself.")
|
"token itself.")
|
||||||
|
E192 = ("Unable to resize vectors in place with cupy.")
|
||||||
|
E193 = ("Unable to resize vectors in place if the resized vector dimension "
|
||||||
|
"({new_dim}) is not the same as the current vector dimension "
|
||||||
|
"({curr_dim}).")
|
||||||
|
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
|
||||||
|
E195 = ("Matcher can be called on {good} only, got {got}.")
|
||||||
|
E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
|
||||||
|
"only be fixed with token.is_sent_start.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -586,64 +589,3 @@ class MatchPatternError(ValueError):
|
||||||
|
|
||||||
class AlignmentError(ValueError):
|
class AlignmentError(ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ModelsWarning(UserWarning):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
WARNINGS = {
|
|
||||||
"user": UserWarning,
|
|
||||||
"deprecation": DeprecationWarning,
|
|
||||||
"models": ModelsWarning,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_types(arg):
|
|
||||||
if arg == "": # don't show any warnings
|
|
||||||
return []
|
|
||||||
if not arg or arg == "all": # show all available warnings
|
|
||||||
return WARNINGS.keys()
|
|
||||||
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_excl(arg):
|
|
||||||
if not arg:
|
|
||||||
return []
|
|
||||||
return [w_id.strip() for w_id in arg.split(",")]
|
|
||||||
|
|
||||||
|
|
||||||
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
|
|
||||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
|
|
||||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
|
||||||
|
|
||||||
|
|
||||||
def user_warning(message):
|
|
||||||
_warn(message, "user")
|
|
||||||
|
|
||||||
|
|
||||||
def deprecation_warning(message):
|
|
||||||
_warn(message, "deprecation")
|
|
||||||
|
|
||||||
|
|
||||||
def models_warning(message):
|
|
||||||
_warn(message, "models")
|
|
||||||
|
|
||||||
|
|
||||||
def _warn(message, warn_type="user"):
|
|
||||||
"""
|
|
||||||
message (unicode): The message to display.
|
|
||||||
category (Warning): The Warning to show.
|
|
||||||
"""
|
|
||||||
if message.startswith("["):
|
|
||||||
w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
|
|
||||||
else:
|
|
||||||
w_id = None
|
|
||||||
ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE
|
|
||||||
if warn_type in SPACY_WARNING_TYPES and not ignore_warning:
|
|
||||||
category = WARNINGS[warn_type]
|
|
||||||
stack = inspect.stack()[-1]
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
if SPACY_WARNING_FILTER:
|
|
||||||
warnings.simplefilter(SPACY_WARNING_FILTER, category)
|
|
||||||
warnings.warn_explicit(message, category, stack[1], stack[2])
|
|
||||||
|
|
152
spacy/gold.pyx
152
spacy/gold.pyx
|
@ -10,10 +10,11 @@ import shutil
|
||||||
import itertools
|
import itertools
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc, Span
|
from .tokens import Doc, Span
|
||||||
from .errors import Errors, AlignmentError, user_warning, Warnings
|
from .errors import Errors, AlignmentError, Warnings
|
||||||
from .compat import path2str
|
from .compat import path2str
|
||||||
from . import util
|
from . import util
|
||||||
from .util import minibatch, itershuffle
|
from .util import minibatch, itershuffle
|
||||||
|
@ -21,7 +22,6 @@ from .util import minibatch, itershuffle
|
||||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||||
|
|
||||||
|
|
||||||
USE_NEW_ALIGN = False
|
|
||||||
punct_re = re.compile(r"\W")
|
punct_re = re.compile(r"\W")
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,57 +73,8 @@ def merge_sents(sents):
|
||||||
return [(m_deps, (m_cats, m_brackets))]
|
return [(m_deps, (m_cats, m_brackets))]
|
||||||
|
|
||||||
|
|
||||||
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_for_alignment(tokens):
|
def _normalize_for_alignment(tokens):
|
||||||
tokens = [w.replace(" ", "").lower() for w in tokens]
|
return [w.replace(" ", "").lower() for w in tokens]
|
||||||
output = []
|
|
||||||
for token in tokens:
|
|
||||||
token = token.replace(" ", "").lower()
|
|
||||||
for before, after in _ALIGNMENT_NORM_MAP:
|
|
||||||
token = token.replace(before, after)
|
|
||||||
output.append(token)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def _align_before_v2_2_2(tokens_a, tokens_b):
|
|
||||||
"""Calculate alignment tables between two tokenizations, using the Levenshtein
|
|
||||||
algorithm. The alignment is case-insensitive.
|
|
||||||
|
|
||||||
tokens_a (List[str]): The candidate tokenization.
|
|
||||||
tokens_b (List[str]): The reference tokenization.
|
|
||||||
RETURNS: (tuple): A 5-tuple consisting of the following information:
|
|
||||||
* cost (int): The number of misaligned tokens.
|
|
||||||
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
|
|
||||||
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
|
|
||||||
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
|
|
||||||
it has the value -1.
|
|
||||||
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
|
|
||||||
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
|
|
||||||
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
|
|
||||||
the same token of `tokens_b`.
|
|
||||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
|
||||||
direction.
|
|
||||||
"""
|
|
||||||
from . import _align
|
|
||||||
if tokens_a == tokens_b:
|
|
||||||
alignment = numpy.arange(len(tokens_a))
|
|
||||||
return 0, alignment, alignment, {}, {}
|
|
||||||
tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
|
|
||||||
tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
|
|
||||||
cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
|
|
||||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
|
|
||||||
[len(w) for w in tokens_b])
|
|
||||||
for i, j in list(i2j_multi.items()):
|
|
||||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
|
||||||
i2j[i] = j
|
|
||||||
i2j_multi.pop(i)
|
|
||||||
for j, i in list(j2i_multi.items()):
|
|
||||||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
|
||||||
j2i[j] = i
|
|
||||||
j2i_multi.pop(j)
|
|
||||||
return cost, i2j, j2i, i2j_multi, j2i_multi
|
|
||||||
|
|
||||||
|
|
||||||
def align(tokens_a, tokens_b):
|
def align(tokens_a, tokens_b):
|
||||||
|
@ -144,8 +95,6 @@ def align(tokens_a, tokens_b):
|
||||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
||||||
direction.
|
direction.
|
||||||
"""
|
"""
|
||||||
if not USE_NEW_ALIGN:
|
|
||||||
return _align_before_v2_2_2(tokens_a, tokens_b)
|
|
||||||
tokens_a = _normalize_for_alignment(tokens_a)
|
tokens_a = _normalize_for_alignment(tokens_a)
|
||||||
tokens_b = _normalize_for_alignment(tokens_b)
|
tokens_b = _normalize_for_alignment(tokens_b)
|
||||||
cost = 0
|
cost = 0
|
||||||
|
@ -382,6 +331,8 @@ class GoldCorpus(object):
|
||||||
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
if random.random() >= orth_variant_level:
|
if random.random() >= orth_variant_level:
|
||||||
return raw, paragraph_tuples
|
return raw, paragraph_tuples
|
||||||
|
raw_orig = str(raw)
|
||||||
|
lower = False
|
||||||
if random.random() >= 0.5:
|
if random.random() >= 0.5:
|
||||||
lower = True
|
lower = True
|
||||||
if raw is not None:
|
if raw is not None:
|
||||||
|
@ -442,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
ids, words, tags, heads, labels, ner = sent_tuples
|
ids, words, tags, heads, labels, ner = sent_tuples
|
||||||
for word in words:
|
for word in words:
|
||||||
match_found = False
|
match_found = False
|
||||||
|
# skip whitespace words
|
||||||
|
if word.isspace():
|
||||||
|
match_found = True
|
||||||
# add identical word
|
# add identical word
|
||||||
if word not in variants and raw[raw_idx:].startswith(word):
|
elif word not in variants and raw[raw_idx:].startswith(word):
|
||||||
variant_raw += word
|
variant_raw += word
|
||||||
raw_idx += len(word)
|
raw_idx += len(word)
|
||||||
match_found = True
|
match_found = True
|
||||||
|
@ -458,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
# something went wrong, abort
|
# something went wrong, abort
|
||||||
# (add a warning message?)
|
# (add a warning message?)
|
||||||
if not match_found:
|
if not match_found:
|
||||||
return raw, paragraph_tuples
|
return raw_orig, paragraph_tuples
|
||||||
# add following whitespace
|
# add following whitespace
|
||||||
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
||||||
variant_raw += raw[raw_idx]
|
variant_raw += raw[raw_idx]
|
||||||
|
@ -560,7 +514,7 @@ def _json_iterate(loc):
|
||||||
py_raw = file_.read()
|
py_raw = file_.read()
|
||||||
cdef long file_length = len(py_raw)
|
cdef long file_length = len(py_raw)
|
||||||
if file_length > 2 ** 30:
|
if file_length > 2 ** 30:
|
||||||
user_warning(Warnings.W027.format(size=file_length))
|
warnings.warn(Warnings.W027.format(size=file_length))
|
||||||
|
|
||||||
raw = <char*>py_raw
|
raw = <char*>py_raw
|
||||||
cdef int square_depth = 0
|
cdef int square_depth = 0
|
||||||
|
@ -700,6 +654,9 @@ cdef class GoldParse:
|
||||||
# if self.lenght > 0, this is modified latter.
|
# if self.lenght > 0, this is modified latter.
|
||||||
self.orig_annot = []
|
self.orig_annot = []
|
||||||
|
|
||||||
|
# temporary doc for aligning entity annotation
|
||||||
|
entdoc = None
|
||||||
|
|
||||||
# avoid allocating memory if the doc does not contain any tokens
|
# avoid allocating memory if the doc does not contain any tokens
|
||||||
if self.length > 0:
|
if self.length > 0:
|
||||||
if words is None:
|
if words is None:
|
||||||
|
@ -722,7 +679,25 @@ cdef class GoldParse:
|
||||||
entities = [(ent if ent is not None else "-") for ent in entities]
|
entities = [(ent if ent is not None else "-") for ent in entities]
|
||||||
if not isinstance(entities[0], basestring):
|
if not isinstance(entities[0], basestring):
|
||||||
# Assume we have entities specified by character offset.
|
# Assume we have entities specified by character offset.
|
||||||
entities = biluo_tags_from_offsets(doc, entities)
|
# Create a temporary Doc corresponding to provided words
|
||||||
|
# (to preserve gold tokenization) and text (to preserve
|
||||||
|
# character offsets).
|
||||||
|
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
|
||||||
|
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
|
||||||
|
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
|
||||||
|
# There may be some additional whitespace tokens in the
|
||||||
|
# temporary doc, so check that the annotations align with
|
||||||
|
# the provided words while building a list of BILUO labels.
|
||||||
|
entities = []
|
||||||
|
words_offset = 0
|
||||||
|
for i in range(len(entdoc_words)):
|
||||||
|
if words[i + words_offset] == entdoc_words[i]:
|
||||||
|
entities.append(entdoc_entities[i])
|
||||||
|
else:
|
||||||
|
words_offset -= 1
|
||||||
|
if len(entities) != len(words):
|
||||||
|
warnings.warn(Warnings.W029.format(text=doc.text))
|
||||||
|
entities = ["-" for _ in words]
|
||||||
|
|
||||||
# These are filled by the tagger/parser/entity recogniser
|
# These are filled by the tagger/parser/entity recogniser
|
||||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
|
@ -749,7 +724,8 @@ cdef class GoldParse:
|
||||||
# If we under-segment, we'll have one predicted word that covers a
|
# If we under-segment, we'll have one predicted word that covers a
|
||||||
# sequence of gold words.
|
# sequence of gold words.
|
||||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
# If we "mis-segment", we'll have a sequence of predicted words covering
|
||||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
# a sequence of gold words. That's many-to-many -- we don't do that
|
||||||
|
# except for NER spans where the start and end can be aligned.
|
||||||
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
||||||
|
|
||||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
||||||
|
@ -772,7 +748,6 @@ cdef class GoldParse:
|
||||||
self.tags[i] = tags[i2j_multi[i]]
|
self.tags[i] = tags[i2j_multi[i]]
|
||||||
self.morphology[i] = morphology[i2j_multi[i]]
|
self.morphology[i] = morphology[i2j_multi[i]]
|
||||||
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
||||||
is_first = i2j_multi[i] != i2j_multi.get(i-1)
|
|
||||||
# Set next word in multi-token span as head, until last
|
# Set next word in multi-token span as head, until last
|
||||||
if not is_last:
|
if not is_last:
|
||||||
self.heads[i] = i+1
|
self.heads[i] = i+1
|
||||||
|
@ -782,29 +757,9 @@ cdef class GoldParse:
|
||||||
if head_i:
|
if head_i:
|
||||||
self.heads[i] = self.gold_to_cand[head_i]
|
self.heads[i] = self.gold_to_cand[head_i]
|
||||||
self.labels[i] = deps[i2j_multi[i]]
|
self.labels[i] = deps[i2j_multi[i]]
|
||||||
# Now set NER...This is annoying because if we've split
|
|
||||||
# got an entity word split into two, we need to adjust the
|
|
||||||
# BILUO tags. We can't have BB or LL etc.
|
|
||||||
# Case 1: O -- easy.
|
|
||||||
ner_tag = entities[i2j_multi[i]]
|
ner_tag = entities[i2j_multi[i]]
|
||||||
if ner_tag == "O":
|
# Assign O/- for many-to-one O/- NER tags
|
||||||
self.ner[i] = "O"
|
if ner_tag in ("O", "-"):
|
||||||
# Case 2: U. This has to become a B I* L sequence.
|
|
||||||
elif ner_tag.startswith("U-"):
|
|
||||||
if is_first:
|
|
||||||
self.ner[i] = ner_tag.replace("U-", "B-", 1)
|
|
||||||
elif is_last:
|
|
||||||
self.ner[i] = ner_tag.replace("U-", "L-", 1)
|
|
||||||
else:
|
|
||||||
self.ner[i] = ner_tag.replace("U-", "I-", 1)
|
|
||||||
# Case 3: L. If not last, change to I.
|
|
||||||
elif ner_tag.startswith("L-"):
|
|
||||||
if is_last:
|
|
||||||
self.ner[i] = ner_tag
|
|
||||||
else:
|
|
||||||
self.ner[i] = ner_tag.replace("L-", "I-", 1)
|
|
||||||
# Case 4: I. Stays correct
|
|
||||||
elif ner_tag.startswith("I-"):
|
|
||||||
self.ner[i] = ner_tag
|
self.ner[i] = ner_tag
|
||||||
else:
|
else:
|
||||||
self.words[i] = words[gold_i]
|
self.words[i] = words[gold_i]
|
||||||
|
@ -816,6 +771,39 @@ cdef class GoldParse:
|
||||||
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
||||||
self.labels[i] = deps[gold_i]
|
self.labels[i] = deps[gold_i]
|
||||||
self.ner[i] = entities[gold_i]
|
self.ner[i] = entities[gold_i]
|
||||||
|
# Assign O/- for one-to-many O/- NER tags
|
||||||
|
for j, cand_j in enumerate(self.gold_to_cand):
|
||||||
|
if cand_j is None:
|
||||||
|
if j in j2i_multi:
|
||||||
|
i = j2i_multi[j]
|
||||||
|
ner_tag = entities[j]
|
||||||
|
if ner_tag in ("O", "-"):
|
||||||
|
self.ner[i] = ner_tag
|
||||||
|
|
||||||
|
# If there is entity annotation and some tokens remain unaligned,
|
||||||
|
# align all entities at the character level to account for all
|
||||||
|
# possible token misalignments within the entity spans
|
||||||
|
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
|
||||||
|
# If the temporary entdoc wasn't created above, initialize it
|
||||||
|
if not entdoc:
|
||||||
|
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
|
||||||
|
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
|
||||||
|
# Get offsets based on gold words and BILUO entities
|
||||||
|
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
|
||||||
|
aligned_offsets = []
|
||||||
|
aligned_spans = []
|
||||||
|
# Filter offsets to identify those that align with doc tokens
|
||||||
|
for offset in entdoc_offsets:
|
||||||
|
span = doc.char_span(offset[0], offset[1])
|
||||||
|
if span and not span.text.isspace():
|
||||||
|
aligned_offsets.append(offset)
|
||||||
|
aligned_spans.append(span)
|
||||||
|
# Convert back to BILUO for doc tokens and assign NER for all
|
||||||
|
# aligned spans
|
||||||
|
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
|
||||||
|
for span in aligned_spans:
|
||||||
|
for i in range(span.start, span.end):
|
||||||
|
self.ner[i] = biluo_tags[i]
|
||||||
|
|
||||||
# Prevent whitespace that isn't within entities from being tagged as
|
# Prevent whitespace that isn't within entities from being tagged as
|
||||||
# an entity.
|
# an entity.
|
||||||
|
|
12
spacy/kb.pyx
12
spacy/kb.pyx
|
@ -1,7 +1,9 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from spacy.errors import Errors, Warnings, user_warning
|
import warnings
|
||||||
|
|
||||||
|
from spacy.errors import Errors, Warnings
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
@ -115,7 +117,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
# Return if this entity was added before
|
# Return if this entity was added before
|
||||||
if entity_hash in self._entry_index:
|
if entity_hash in self._entry_index:
|
||||||
user_warning(Warnings.W018.format(entity=entity))
|
warnings.warn(Warnings.W018.format(entity=entity))
|
||||||
return
|
return
|
||||||
|
|
||||||
# Raise an error if the provided entity vector is not of the correct length
|
# Raise an error if the provided entity vector is not of the correct length
|
||||||
|
@ -147,7 +149,7 @@ cdef class KnowledgeBase:
|
||||||
# only process this entity if its unique ID hadn't been added before
|
# only process this entity if its unique ID hadn't been added before
|
||||||
entity_hash = self.vocab.strings.add(entity_list[i])
|
entity_hash = self.vocab.strings.add(entity_list[i])
|
||||||
if entity_hash in self._entry_index:
|
if entity_hash in self._entry_index:
|
||||||
user_warning(Warnings.W018.format(entity=entity_list[i]))
|
warnings.warn(Warnings.W018.format(entity=entity_list[i]))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
entity_vector = vector_list[i]
|
entity_vector = vector_list[i]
|
||||||
|
@ -195,7 +197,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
# Check whether this alias was added before
|
# Check whether this alias was added before
|
||||||
if alias_hash in self._alias_index:
|
if alias_hash in self._alias_index:
|
||||||
user_warning(Warnings.W017.format(alias=alias))
|
warnings.warn(Warnings.W017.format(alias=alias))
|
||||||
return
|
return
|
||||||
|
|
||||||
cdef vector[int64_t] entry_indices
|
cdef vector[int64_t] entry_indices
|
||||||
|
@ -252,7 +254,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
if is_present:
|
if is_present:
|
||||||
if not ignore_warnings:
|
if not ignore_warnings:
|
||||||
user_warning(Warnings.W024.format(entity=entity, alias=alias))
|
warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
|
||||||
else:
|
else:
|
||||||
entry_indices.push_back(int(entry_index))
|
entry_indices.push_back(int(entry_index))
|
||||||
alias_entry.entry_indices = entry_indices
|
alias_entry.entry_indices = entry_indices
|
||||||
|
|
|
@ -9,10 +9,13 @@ Example sentences to test spaCy and its language models.
|
||||||
>>> docs = nlp.pipe(sentences)
|
>>> docs = nlp.pipe(sentences)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
|
"Apple overvejer at købe et britisk startup for 1 milliard dollar.",
|
||||||
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
|
"Selvkørende biler flytter forsikringsansvaret over på producenterne.",
|
||||||
"San Francisco overvejer at forbyde udbringningsrobotter på fortov",
|
"San Francisco overvejer at forbyde udbringningsrobotter på fortovet.",
|
||||||
"London er en stor by i Storbritannien",
|
"London er en storby i Storbritannien.",
|
||||||
|
"Hvor er du?",
|
||||||
|
"Hvem er Frankrings president?",
|
||||||
|
"Hvad er hovedstaden i USA?",
|
||||||
|
"Hvornår blev Barack Obama født?",
|
||||||
]
|
]
|
||||||
|
|
|
@ -70,6 +70,7 @@ for orth in [
|
||||||
"A/S",
|
"A/S",
|
||||||
"B.C.",
|
"B.C.",
|
||||||
"BK.",
|
"BK.",
|
||||||
|
"B.T.",
|
||||||
"Dr.",
|
"Dr.",
|
||||||
"Boul.",
|
"Boul.",
|
||||||
"Chr.",
|
"Chr.",
|
||||||
|
@ -79,6 +80,7 @@ for orth in [
|
||||||
"Hf.",
|
"Hf.",
|
||||||
"i/s",
|
"i/s",
|
||||||
"I/S",
|
"I/S",
|
||||||
|
"Inc.",
|
||||||
"Kprs.",
|
"Kprs.",
|
||||||
"L.A.",
|
"L.A.",
|
||||||
"Ll.",
|
"Ll.",
|
||||||
|
@ -149,6 +151,7 @@ for orth in [
|
||||||
"bygn.",
|
"bygn.",
|
||||||
"c/o",
|
"c/o",
|
||||||
"ca.",
|
"ca.",
|
||||||
|
"cm.",
|
||||||
"cand.",
|
"cand.",
|
||||||
"d.d.",
|
"d.d.",
|
||||||
"d.m.",
|
"d.m.",
|
||||||
|
@ -172,10 +175,12 @@ for orth in [
|
||||||
"dl.",
|
"dl.",
|
||||||
"do.",
|
"do.",
|
||||||
"dobb.",
|
"dobb.",
|
||||||
|
"dr.",
|
||||||
"dr.h.c",
|
"dr.h.c",
|
||||||
"dr.phil.",
|
"dr.phil.",
|
||||||
"ds.",
|
"ds.",
|
||||||
"dvs.",
|
"dvs.",
|
||||||
|
"d.v.s.",
|
||||||
"e.b.",
|
"e.b.",
|
||||||
"e.l.",
|
"e.l.",
|
||||||
"e.o.",
|
"e.o.",
|
||||||
|
@ -297,10 +302,14 @@ for orth in [
|
||||||
"kap.",
|
"kap.",
|
||||||
"kbh.",
|
"kbh.",
|
||||||
"kem.",
|
"kem.",
|
||||||
|
"kg.",
|
||||||
|
"kgs.",
|
||||||
"kgl.",
|
"kgl.",
|
||||||
"kl.",
|
"kl.",
|
||||||
"kld.",
|
"kld.",
|
||||||
|
"km.",
|
||||||
"km/t",
|
"km/t",
|
||||||
|
"km/t.",
|
||||||
"knsp.",
|
"knsp.",
|
||||||
"komm.",
|
"komm.",
|
||||||
"kons.",
|
"kons.",
|
||||||
|
@ -311,6 +320,7 @@ for orth in [
|
||||||
"kt.",
|
"kt.",
|
||||||
"ktr.",
|
"ktr.",
|
||||||
"kv.",
|
"kv.",
|
||||||
|
"kvm.",
|
||||||
"kvt.",
|
"kvt.",
|
||||||
"l.c.",
|
"l.c.",
|
||||||
"lab.",
|
"lab.",
|
||||||
|
@ -357,6 +367,7 @@ for orth in [
|
||||||
"nto.",
|
"nto.",
|
||||||
"nuv.",
|
"nuv.",
|
||||||
"o/m",
|
"o/m",
|
||||||
|
"o/m.",
|
||||||
"o.a.",
|
"o.a.",
|
||||||
"o.fl.",
|
"o.fl.",
|
||||||
"o.h.",
|
"o.h.",
|
||||||
|
@ -526,6 +537,7 @@ for orth in [
|
||||||
"vejl.",
|
"vejl.",
|
||||||
"vh.",
|
"vh.",
|
||||||
"vha.",
|
"vha.",
|
||||||
|
"vind.",
|
||||||
"vs.",
|
"vs.",
|
||||||
"vsa.",
|
"vsa.",
|
||||||
"vær.",
|
"vær.",
|
||||||
|
|
|
@ -2,12 +2,12 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
||||||
from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
|
from ..char_classes import CURRENCY, UNITS, PUNCT
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
from ..punctuation import _prefixes, _suffixes
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
_prefixes = ["``",] + list(_prefixes)
|
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
["''", "/"]
|
["''", "/"]
|
||||||
|
|
|
@ -6,6 +6,7 @@ from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults):
|
||||||
)
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,15 @@ _num_words = [
|
||||||
"dieciocho",
|
"dieciocho",
|
||||||
"diecinueve",
|
"diecinueve",
|
||||||
"veinte",
|
"veinte",
|
||||||
|
"veintiuno",
|
||||||
|
"veintidós",
|
||||||
|
"veintitrés",
|
||||||
|
"veinticuatro",
|
||||||
|
"veinticinco",
|
||||||
|
"veintiséis",
|
||||||
|
"veintisiete",
|
||||||
|
"veintiocho",
|
||||||
|
"veintinueve",
|
||||||
"treinta",
|
"treinta",
|
||||||
"cuarenta",
|
"cuarenta",
|
||||||
"cincuenta",
|
"cincuenta",
|
||||||
|
|
48
spacy/lang/es/punctuation.py
Normal file
48
spacy/lang/es/punctuation.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||||
|
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
from ..char_classes import merge_chars
|
||||||
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
|
# Unit symbols with "%" filtered out, joined by spaces and handed to
# merge_chars (presumably yielding a regex alternation -- confirm in
# char_classes).
_list_units = list(filter(lambda unit: unit != "%", LIST_UNITS))
_units = merge_chars(" ".join(_list_units))

# Quote characters extended with the em dash and en dash; used inside
# character classes in the patterns below.
_concat_quotes = CONCAT_QUOTES + "—–"


# Suffix patterns: literal dashes first, then the shared punctuation,
# ellipsis, quote and icon lists, then the regex rules.
_suffixes = ["—", "–"]
_suffixes.extend(LIST_PUNCT)
_suffixes.extend(LIST_ELLIPSES)
_suffixes.extend(LIST_QUOTES)
_suffixes.extend(LIST_ICONS)
_suffixes.extend(
    [
        # "+" directly after a digit
        r"(?<=[0-9])\+",
        # "." directly after a degree unit such as "°C"
        r"(?<=°[FfCcKk])\.",
        # currency symbol directly after a digit
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        # unit (without "%") directly after a digit
        r"(?<=[0-9])(?:{u})".format(u=_units),
        # "." after a digit, lowercase letter, one of % ² - +,
        # punctuation or a quote/dash character
        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT
        ),
        # "." after two uppercase letters
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
)

# Infix patterns: shared ellipsis and icon lists, then the regex rules.
_infixes = list(LIST_ELLIPSES)
_infixes.extend(LIST_ICONS)
_infixes.extend(
    [
        # arithmetic operator between a digit and a digit or "-"
        r"(?<=[0-9])[+\*^](?=[0-9-])",
        # "." between a lowercase letter/quote and an uppercase letter/quote
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes
        ),
        # "," between two letters
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # one of : < > = / between a letter or digit and a letter
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

# Public names imported by the language package.
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
|
|
@ -43,14 +43,16 @@ for orth in [
|
||||||
"Av.",
|
"Av.",
|
||||||
"Avda.",
|
"Avda.",
|
||||||
"Cía.",
|
"Cía.",
|
||||||
|
"EE.UU.",
|
||||||
"etc.",
|
"etc.",
|
||||||
|
"fig.",
|
||||||
"Gob.",
|
"Gob.",
|
||||||
"Gral.",
|
"Gral.",
|
||||||
"Ing.",
|
"Ing.",
|
||||||
"J.C.",
|
"J.C.",
|
||||||
|
"km/h",
|
||||||
"Lic.",
|
"Lic.",
|
||||||
"m.n.",
|
"m.n.",
|
||||||
"no.",
|
|
||||||
"núm.",
|
"núm.",
|
||||||
"P.D.",
|
"P.D.",
|
||||||
"Prof.",
|
"Prof.",
|
||||||
|
|
|
@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
|
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
|
||||||
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
|
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
|
||||||
]
|
]
|
||||||
|
|
|
@ -59,7 +59,6 @@ behin
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
|
|
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
||||||
# https://www.ranks.nl/stopwords/basque
|
# https://www.ranks.nl/stopwords/basque
|
||||||
# https://www.mustgo.com/worldlanguages/basque/
|
# https://www.mustgo.com/worldlanguages/basque/
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
al
|
al
|
||||||
anitz
|
anitz
|
||||||
arabera
|
arabera
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
|
|
|
@ -1,15 +1,26 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
from ..char_classes import merge_chars
|
||||||
|
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = "' ’".replace(" ", "")
|
||||||
HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "")
|
HYPHENS = r"- – — ‐ ‑".replace(" ", "")
|
||||||
|
_prefixes_elision = "d l n"
|
||||||
|
_prefixes_elision += " " + _prefixes_elision.upper()
|
||||||
|
_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
|
||||||
|
_hyphen_suffixes += " " + _hyphen_suffixes.upper()
|
||||||
|
|
||||||
|
|
||||||
|
_prefixes = TOKENIZER_PREFIXES + [
|
||||||
|
r"(?:({pe})[{el}])(?=[{a}])".format(
|
||||||
|
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
+ LIST_ELLIPSES
|
+ LIST_ELLIPSES
|
||||||
|
@ -17,7 +28,6 @@ _suffixes = (
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.", # °C. -> ["°C", "."]
|
r"(?<=°[FfCcKk])\.", # °C. -> ["°C", "."]
|
||||||
r"(?<=[0-9])°[FfCcKk]", # 4°C -> ["4", "°C"]
|
|
||||||
r"(?<=[0-9])%", # 4% -> ["4", "%"]
|
r"(?<=[0-9])%", # 4% -> ["4", "%"]
|
||||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
|
@ -25,14 +35,17 @@ _suffixes = (
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
||||||
),
|
),
|
||||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}])[{h}]({hs})".format(
|
||||||
|
a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
|
||||||
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
_infixes = TOKENIZER_INFIXES + [
|
_infixes = TOKENIZER_INFIXES + [
|
||||||
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
|
|
@ -6,7 +6,7 @@ import re
|
||||||
from .punctuation import ELISION, HYPHENS
|
from .punctuation import ELISION, HYPHENS
|
||||||
from ..tokenizer_exceptions import URL_PATTERN
|
from ..tokenizer_exceptions import URL_PATTERN
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA
|
from ..char_classes import ALPHA_LOWER, ALPHA
|
||||||
from ...symbols import ORTH, LEMMA, TAG
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
||||||
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
||||||
|
@ -56,7 +56,28 @@ for exc_data in [
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
for orth in ["etc."]:
|
for orth in [
|
||||||
|
"après-midi",
|
||||||
|
"au-delà",
|
||||||
|
"au-dessus",
|
||||||
|
"celle-ci",
|
||||||
|
"celles-ci",
|
||||||
|
"celui-ci",
|
||||||
|
"cf.",
|
||||||
|
"ci-dessous",
|
||||||
|
"elle-même",
|
||||||
|
"en-dessous",
|
||||||
|
"etc.",
|
||||||
|
"jusque-là",
|
||||||
|
"lui-même",
|
||||||
|
"MM.",
|
||||||
|
"No.",
|
||||||
|
"peut-être",
|
||||||
|
"pp.",
|
||||||
|
"quelques-uns",
|
||||||
|
"rendez-vous",
|
||||||
|
"Vol.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,7 +93,7 @@ for verb, verb_lemma in [
|
||||||
for pronoun in ["elle", "il", "on"]:
|
for pronoun in ["elle", "il", "on"]:
|
||||||
token = "{}-t-{}".format(orth, pronoun)
|
token = "{}-t-{}".format(orth, pronoun)
|
||||||
_exc[token] = [
|
_exc[token] = [
|
||||||
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
||||||
{LEMMA: "t", ORTH: "-t"},
|
{LEMMA: "t", ORTH: "-t"},
|
||||||
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
]
|
]
|
||||||
|
@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
|
||||||
for orth in [verb, verb.title()]:
|
for orth in [verb, verb.title()]:
|
||||||
token = "{}-ce".format(orth)
|
token = "{}-ce".format(orth)
|
||||||
_exc[token] = [
|
_exc[token] = [
|
||||||
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
||||||
{LEMMA: "ce", ORTH: "-ce"},
|
{LEMMA: "ce", ORTH: "-ce"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]:
|
||||||
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
|
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
|
||||||
for orth in [pre, pre.title()]:
|
for orth in [pre, pre.title()]:
|
||||||
_exc["%sest-ce" % orth] = [
|
_exc["%sest-ce" % orth] = [
|
||||||
{LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
|
{LEMMA: pre_lemma, ORTH: orth},
|
||||||
{LEMMA: "être", ORTH: "est", TAG: "VERB"},
|
{LEMMA: "être", ORTH: "est"},
|
||||||
{LEMMA: "ce", ORTH: "-ce"},
|
{LEMMA: "ce", ORTH: "-ce"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
for verb, pronoun in [("est", "il"), ("EST", "IL")]:
|
||||||
|
token = "{}-{}".format(verb, pronoun)
|
||||||
|
_exc[token] = [
|
||||||
|
{LEMMA: "être", ORTH: verb},
|
||||||
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
|
||||||
|
token = "{}'{}-{}".format(s, verb, pronoun)
|
||||||
|
_exc[token] = [
|
||||||
|
{LEMMA: "se", ORTH: s + "'"},
|
||||||
|
{LEMMA: "être", ORTH: verb},
|
||||||
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
_infixes_exc = []
|
_infixes_exc = []
|
||||||
orig_elision = "'"
|
orig_elision = "'"
|
||||||
orig_hyphen = "-"
|
orig_hyphen = "-"
|
||||||
|
@ -423,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
TOKEN_MATCH = re.compile(
|
TOKEN_MATCH = re.compile(
|
||||||
"|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
|
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
||||||
).match
|
).match
|
||||||
|
|
18
spacy/lang/gu/__init__.py
Normal file
18
spacy/lang/gu/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
class GujaratiDefaults(Language.Defaults):
    """Language defaults for Gujarati.

    Only overrides the stop-word list; everything else is inherited
    from ``Language.Defaults``.
    """

    stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Gujarati(Language):
    """Gujarati language class: code ``"gu"`` with ``GujaratiDefaults``."""

    Defaults = GujaratiDefaults
    lang = "gu"
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Gujarati"]
|
22
spacy/lang/gu/examples.py
Normal file
22
spacy/lang/gu/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.gu.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.",
|
||||||
|
"તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું",
|
||||||
|
"કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો",
|
||||||
|
"તેજપાળને બે પત્ની હતી",
|
||||||
|
"ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો",
|
||||||
|
"આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.",
|
||||||
|
"અહિયાં શું જોડાય છે?",
|
||||||
|
"મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.",
|
||||||
|
]
|
91
spacy/lang/gu/stop_words.py
Normal file
91
spacy/lang/gu/stop_words.py
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
એમ
|
||||||
|
આ
|
||||||
|
એ
|
||||||
|
રહી
|
||||||
|
છે
|
||||||
|
છો
|
||||||
|
હતા
|
||||||
|
હતું
|
||||||
|
હતી
|
||||||
|
હોય
|
||||||
|
હતો
|
||||||
|
શકે
|
||||||
|
તે
|
||||||
|
તેના
|
||||||
|
તેનું
|
||||||
|
તેને
|
||||||
|
તેની
|
||||||
|
તેઓ
|
||||||
|
તેમને
|
||||||
|
તેમના
|
||||||
|
તેમણે
|
||||||
|
તેમનું
|
||||||
|
તેમાં
|
||||||
|
અને
|
||||||
|
અહીં
|
||||||
|
થી
|
||||||
|
થઈ
|
||||||
|
થાય
|
||||||
|
જે
|
||||||
|
ને
|
||||||
|
કે
|
||||||
|
ના
|
||||||
|
ની
|
||||||
|
નો
|
||||||
|
ને
|
||||||
|
નું
|
||||||
|
શું
|
||||||
|
માં
|
||||||
|
પણ
|
||||||
|
પર
|
||||||
|
જેવા
|
||||||
|
જેવું
|
||||||
|
જાય
|
||||||
|
જેમ
|
||||||
|
જેથી
|
||||||
|
માત્ર
|
||||||
|
માટે
|
||||||
|
પરથી
|
||||||
|
આવ્યું
|
||||||
|
એવી
|
||||||
|
આવી
|
||||||
|
રીતે
|
||||||
|
સુધી
|
||||||
|
થાય
|
||||||
|
થઈ
|
||||||
|
સાથે
|
||||||
|
લાગે
|
||||||
|
હોવા
|
||||||
|
છતાં
|
||||||
|
રહેલા
|
||||||
|
કરી
|
||||||
|
કરે
|
||||||
|
કેટલા
|
||||||
|
કોઈ
|
||||||
|
કેમ
|
||||||
|
કર્યો
|
||||||
|
કર્યુ
|
||||||
|
કરે
|
||||||
|
સૌથી
|
||||||
|
ત્યારબાદ
|
||||||
|
તથા
|
||||||
|
દ્વારા
|
||||||
|
જુઓ
|
||||||
|
જાઓ
|
||||||
|
જ્યારે
|
||||||
|
ત્યારે
|
||||||
|
શકો
|
||||||
|
નથી
|
||||||
|
હવે
|
||||||
|
અથવા
|
||||||
|
થતો
|
||||||
|
દર
|
||||||
|
એટલો
|
||||||
|
પરંતુ
|
||||||
|
""".split()
|
||||||
|
)
|
25
spacy/lang/hy/__init__.py
Normal file
25
spacy/lang/hy/__init__.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
|
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...language import Language
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
class ArmenianDefaults(Language.Defaults):
    """Language defaults for Armenian.

    Supplies the Armenian tag map and stop words, and extends a copy of
    the base lexical attribute getters with the language code and the
    Armenian-specific ``LEX_ATTRS``.
    """

    tag_map = TAG_MAP
    stop_words = STOP_WORDS

    # Work on a copy so the getters on Language.Defaults stay untouched.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update({LANG: lambda text: "hy"})
    lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
|
||||||
|
|
||||||
|
class Armenian(Language):
    """Armenian language class: code ``"hy"`` with ``ArmenianDefaults``."""

    Defaults = ArmenianDefaults
    lang = "hy"
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Armenian"]
|
16
spacy/lang/hy/examples.py
Normal file
16
spacy/lang/hy/examples.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
>>> from spacy.lang.hy.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
|
||||||
|
"Ո՞վ է Ֆրանսիայի նախագահը։",
|
||||||
|
"Որն է Միացյալ Նահանգների մայրաքաղաքը։",
|
||||||
|
"Ե՞րբ է ծնվել Բարաք Օբաման։",
|
||||||
|
]
|
58
spacy/lang/hy/lex_attrs.py
Normal file
58
spacy/lang/hy/lex_attrs.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"զրօ",
|
||||||
|
"մէկ",
|
||||||
|
"երկու",
|
||||||
|
"երեք",
|
||||||
|
"չորս",
|
||||||
|
"հինգ",
|
||||||
|
"վեց",
|
||||||
|
"յոթ",
|
||||||
|
"ութ",
|
||||||
|
"ինը",
|
||||||
|
"տասը",
|
||||||
|
"տասնմեկ",
|
||||||
|
"տասներկու",
|
||||||
|
"տասներեք",
|
||||||
|
"տասնչորս",
|
||||||
|
"տասնհինգ",
|
||||||
|
"տասնվեց",
|
||||||
|
"տասնյոթ",
|
||||||
|
"տասնութ",
|
||||||
|
"տասնինը",
|
||||||
|
"քսան" "երեսուն",
|
||||||
|
"քառասուն",
|
||||||
|
"հիսուն",
|
||||||
|
"վաթցսուն",
|
||||||
|
"յոթանասուն",
|
||||||
|
"ութսուն",
|
||||||
|
"ինիսուն",
|
||||||
|
"հարյուր",
|
||||||
|
"հազար",
|
||||||
|
"միլիոն",
|
||||||
|
"միլիարդ",
|
||||||
|
"տրիլիոն",
|
||||||
|
"քվինտիլիոն",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text.lower() in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
110
spacy/lang/hy/stop_words.py
Normal file
110
spacy/lang/hy/stop_words.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
նա
|
||||||
|
ողջը
|
||||||
|
այստեղ
|
||||||
|
ենք
|
||||||
|
նա
|
||||||
|
էիր
|
||||||
|
որպես
|
||||||
|
ուրիշ
|
||||||
|
բոլորը
|
||||||
|
այն
|
||||||
|
այլ
|
||||||
|
նույնչափ
|
||||||
|
էի
|
||||||
|
մի
|
||||||
|
և
|
||||||
|
ողջ
|
||||||
|
ես
|
||||||
|
ոմն
|
||||||
|
հետ
|
||||||
|
նրանք
|
||||||
|
ամենքը
|
||||||
|
ըստ
|
||||||
|
ինչ-ինչ
|
||||||
|
այսպես
|
||||||
|
համայն
|
||||||
|
մի
|
||||||
|
նաև
|
||||||
|
նույնքան
|
||||||
|
դա
|
||||||
|
ովևէ
|
||||||
|
համար
|
||||||
|
այնտեղ
|
||||||
|
էին
|
||||||
|
որոնք
|
||||||
|
սույն
|
||||||
|
ինչ-որ
|
||||||
|
ամենը
|
||||||
|
նույնպիսի
|
||||||
|
ու
|
||||||
|
իր
|
||||||
|
որոշ
|
||||||
|
միևնույն
|
||||||
|
ի
|
||||||
|
այնպիսի
|
||||||
|
մենք
|
||||||
|
ամեն ոք
|
||||||
|
նույն
|
||||||
|
երբևէ
|
||||||
|
այն
|
||||||
|
որևէ
|
||||||
|
ին
|
||||||
|
այդպես
|
||||||
|
նրա
|
||||||
|
որը
|
||||||
|
վրա
|
||||||
|
դու
|
||||||
|
էինք
|
||||||
|
այդպիսի
|
||||||
|
էիք
|
||||||
|
յուրաքանչյուրը
|
||||||
|
եմ
|
||||||
|
պիտի
|
||||||
|
այդ
|
||||||
|
ամբողջը
|
||||||
|
հետո
|
||||||
|
եք
|
||||||
|
ամեն
|
||||||
|
այլ
|
||||||
|
կամ
|
||||||
|
այսքան
|
||||||
|
որ
|
||||||
|
այնպես
|
||||||
|
այսինչ
|
||||||
|
բոլոր
|
||||||
|
է
|
||||||
|
մեկնումեկը
|
||||||
|
այդչափ
|
||||||
|
այնքան
|
||||||
|
ամբողջ
|
||||||
|
երբևիցե
|
||||||
|
այնչափ
|
||||||
|
ամենայն
|
||||||
|
մյուս
|
||||||
|
այնինչ
|
||||||
|
իսկ
|
||||||
|
այդտեղ
|
||||||
|
այս
|
||||||
|
սա
|
||||||
|
են
|
||||||
|
ամեն ինչ
|
||||||
|
որևիցե
|
||||||
|
ում
|
||||||
|
մեկը
|
||||||
|
այդ
|
||||||
|
դուք
|
||||||
|
այսչափ
|
||||||
|
այդքան
|
||||||
|
այսպիսի
|
||||||
|
էր
|
||||||
|
յուրաքանչյուր
|
||||||
|
այս
|
||||||
|
մեջ
|
||||||
|
թ
|
||||||
|
""".split()
|
||||||
|
)
|
2478
spacy/lang/hy/tag_map.py
Normal file
2478
spacy/lang/hy/tag_map.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,32 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
from ..char_classes import ALPHA
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||||
|
from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
|
||||||
|
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "")
|
ELISION = "'’"
|
||||||
|
|
||||||
|
|
||||||
_infixes = TOKENIZER_INFIXES + [
|
_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES
|
||||||
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
|
||||||
|
r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
|
|
@ -2,6 +2,56 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from ...symbols import ORTH, LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
|
_exc = {
|
||||||
|
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
|
||||||
|
"dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
|
||||||
|
"dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
|
||||||
|
"L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
|
||||||
|
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
|
||||||
|
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
|
||||||
|
"po'": [{ORTH: "po'", LEMMA: "poco"}],
|
||||||
|
"sett..": [{ORTH: "sett."}, {ORTH: "."}],
|
||||||
|
}
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"..",
|
||||||
|
"....",
|
||||||
|
"al.",
|
||||||
|
"all-path",
|
||||||
|
"art.",
|
||||||
|
"Art.",
|
||||||
|
"artt.",
|
||||||
|
"att.",
|
||||||
|
"by-pass",
|
||||||
|
"c.d.",
|
||||||
|
"centro-sinistra",
|
||||||
|
"check-up",
|
||||||
|
"Civ.",
|
||||||
|
"cm.",
|
||||||
|
"Cod.",
|
||||||
|
"col.",
|
||||||
|
"Cost.",
|
||||||
|
"d.C.",
|
||||||
|
'de"',
|
||||||
|
"distr.",
|
||||||
|
"E'",
|
||||||
|
"ecc.",
|
||||||
|
"e-mail",
|
||||||
|
"e/o",
|
||||||
|
"etc.",
|
||||||
|
"Jr.",
|
||||||
|
"n°",
|
||||||
|
"nord-est",
|
||||||
|
"pag.",
|
||||||
|
"Proc.",
|
||||||
|
"prof.",
|
||||||
|
"sett.",
|
||||||
|
"s.p.a.",
|
||||||
|
"ss.",
|
||||||
|
"St.",
|
||||||
|
"tel.",
|
||||||
|
"week-end",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
22
spacy/lang/kn/examples.py
Normal file
22
spacy/lang/kn/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.kn.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
|
||||||
|
"ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",
|
||||||
|
"ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ಪರಿಗಣಿಸುತ್ತದೆ.",
|
||||||
|
"ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್ಡಂನ ದೊಡ್ಡ ನಗರ.",
|
||||||
|
"ನೀನು ಎಲ್ಲಿದಿಯಾ?",
|
||||||
|
"ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?",
|
||||||
|
"ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?",
|
||||||
|
"ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?",
|
||||||
|
]
|
31
spacy/lang/lij/__init__.py
Normal file
31
spacy/lang/lij/__init__.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
from ...language import Language
|
||||||
|
from ...attrs import LANG, NORM
|
||||||
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
|
||||||
|
class LigurianDefaults(Language.Defaults):
|
||||||
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters[LANG] = lambda text: "lij"
|
||||||
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
|
class Ligurian(Language):
|
||||||
|
lang = "lij"
|
||||||
|
Defaults = LigurianDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Ligurian"]
|
18
spacy/lang/lij/examples.py
Normal file
18
spacy/lang/lij/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.lij.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Sciusciâ e sciorbî no se peu.",
|
||||||
|
"Graçie di çetroin, che me son arrivæ.",
|
||||||
|
"Vegnime apreuvo, che ve fasso pescâ di òmmi.",
|
||||||
|
"Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
|
||||||
|
]
|
15
spacy/lang/lij/punctuation.py
Normal file
15
spacy/lang/lij/punctuation.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..punctuation import TOKENIZER_INFIXES
|
||||||
|
from ..char_classes import ALPHA
|
||||||
|
|
||||||
|
|
||||||
|
ELISION = "'’"  # straight and typographic apostrophes used for elision
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = TOKENIZER_INFIXES + [
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
43
spacy/lang/lij/stop_words.py
Normal file
43
spacy/lang/lij/stop_words.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
|
||||||
|
|
||||||
|
bella belle belli bello ben
|
||||||
|
|
||||||
|
ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
|
||||||
|
|
||||||
|
d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
|
||||||
|
|
||||||
|
é e ê ea ean emmo en ëse
|
||||||
|
|
||||||
|
fin fiña
|
||||||
|
|
||||||
|
gh' ghe guæei
|
||||||
|
|
||||||
|
i î in insemme int' inta inte inti into
|
||||||
|
|
||||||
|
l' lê lì lô
|
||||||
|
|
||||||
|
m' ma manco me megio meno mezo mi
|
||||||
|
|
||||||
|
na n' ne ni ninte nisciun nisciuña no
|
||||||
|
|
||||||
|
o ò ô oua
|
||||||
|
|
||||||
|
parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
|
||||||
|
|
||||||
|
quæ quand' quande quarche quella quelle quelli quello
|
||||||
|
|
||||||
|
s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
|
||||||
|
|
||||||
|
tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
|
||||||
|
|
||||||
|
un uña unn' unna
|
||||||
|
|
||||||
|
za zu
|
||||||
|
""".split()
|
||||||
|
)
|
52
spacy/lang/lij/tokenizer_exceptions.py
Normal file
52
spacy/lang/lij/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for raw, lemma in [
|
||||||
|
("a-a", "a-o"),
|
||||||
|
("a-e", "a-o"),
|
||||||
|
("a-o", "a-o"),
|
||||||
|
("a-i", "a-o"),
|
||||||
|
("co-a", "co-o"),
|
||||||
|
("co-e", "co-o"),
|
||||||
|
("co-i", "co-o"),
|
||||||
|
("co-o", "co-o"),
|
||||||
|
("da-a", "da-o"),
|
||||||
|
("da-e", "da-o"),
|
||||||
|
("da-i", "da-o"),
|
||||||
|
("da-o", "da-o"),
|
||||||
|
("pe-a", "pe-o"),
|
||||||
|
("pe-e", "pe-o"),
|
||||||
|
("pe-i", "pe-o"),
|
||||||
|
("pe-o", "pe-o"),
|
||||||
|
]:
|
||||||
|
for orth in [raw, raw.capitalize()]:
|
||||||
|
_exc[orth] = [{ORTH: orth, LEMMA: lemma}]
|
||||||
|
|
||||||
|
# Prefix + prepositions with à (e.g. "sott'a-o")
|
||||||
|
|
||||||
|
for prep, prep_lemma in [
|
||||||
|
("a-a", "a-o"),
|
||||||
|
("a-e", "a-o"),
|
||||||
|
("a-o", "a-o"),
|
||||||
|
("a-i", "a-o"),
|
||||||
|
]:
|
||||||
|
for prefix, prefix_lemma in [
|
||||||
|
("sott'", "sotta"),
|
||||||
|
("sott’", "sotta"),
|
||||||
|
("contr'", "contra"),
|
||||||
|
("contr’", "contra"),
|
||||||
|
("ch'", "che"),
|
||||||
|
("ch’", "che"),
|
||||||
|
("s'", "se"),
|
||||||
|
("s’", "se"),
|
||||||
|
]:
|
||||||
|
for prefix_orth in [prefix, prefix.capitalize()]:
|
||||||
|
_exc[prefix_orth + prep] = [
|
||||||
|
{ORTH: prefix_orth, LEMMA: prefix_lemma},
|
||||||
|
{ORTH: prep, LEMMA: prep_lemma},
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = _exc
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -26,7 +27,13 @@ class LithuanianDefaults(Language.Defaults):
|
||||||
)
|
)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
mod_base_exceptions = {
|
||||||
|
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||||
|
}
|
||||||
|
del mod_base_exceptions["8)"]
|
||||||
|
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
|
|
29
spacy/lang/lt/punctuation.py
Normal file
29
spacy/lang/lt/punctuation.py
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..char_classes import LIST_ICONS, LIST_ELLIPSES
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
from ..char_classes import HYPHENS
|
||||||
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Raw string: "\." in a plain literal is an invalid escape sequence (warns on
# newer Pythons). The regex itself is unchanged: a literal full stop, tried
# before the shared suffix patterns.
_suffixes = [r"\."] + list(TOKENIZER_SUFFIXES)
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -6,262 +6,264 @@ from ...symbols import ORTH
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"G.",
|
"n-tosios",
|
||||||
"J. E.",
|
"?!",
|
||||||
"J. Em.",
|
# "G.",
|
||||||
"J.E.",
|
# "J. E.",
|
||||||
"J.Em.",
|
# "J. Em.",
|
||||||
"K.",
|
# "J.E.",
|
||||||
"N.",
|
# "J.Em.",
|
||||||
"V.",
|
# "K.",
|
||||||
"Vt.",
|
# "N.",
|
||||||
"a.",
|
# "V.",
|
||||||
"a.k.",
|
# "Vt.",
|
||||||
"a.s.",
|
# "a.",
|
||||||
"adv.",
|
# "a.k.",
|
||||||
"akad.",
|
# "a.s.",
|
||||||
"aklg.",
|
# "adv.",
|
||||||
"akt.",
|
# "akad.",
|
||||||
"al.",
|
# "aklg.",
|
||||||
"ang.",
|
# "akt.",
|
||||||
"angl.",
|
# "al.",
|
||||||
"aps.",
|
# "ang.",
|
||||||
"apskr.",
|
# "angl.",
|
||||||
"apyg.",
|
# "aps.",
|
||||||
"arbat.",
|
# "apskr.",
|
||||||
"asist.",
|
# "apyg.",
|
||||||
"asm.",
|
# "arbat.",
|
||||||
"asm.k.",
|
# "asist.",
|
||||||
"asmv.",
|
# "asm.",
|
||||||
"atk.",
|
# "asm.k.",
|
||||||
"atsak.",
|
# "asmv.",
|
||||||
"atsisk.",
|
# "atk.",
|
||||||
"atsisk.sąsk.",
|
# "atsak.",
|
||||||
"atv.",
|
# "atsisk.",
|
||||||
"aut.",
|
# "atsisk.sąsk.",
|
||||||
"avd.",
|
# "atv.",
|
||||||
"b.k.",
|
# "aut.",
|
||||||
"baud.",
|
# "avd.",
|
||||||
"biol.",
|
# "b.k.",
|
||||||
"bkl.",
|
# "baud.",
|
||||||
"bot.",
|
# "biol.",
|
||||||
"bt.",
|
# "bkl.",
|
||||||
"buv.",
|
# "bot.",
|
||||||
"ch.",
|
# "bt.",
|
||||||
"chem.",
|
# "buv.",
|
||||||
"corp.",
|
# "ch.",
|
||||||
"d.",
|
# "chem.",
|
||||||
"dab.",
|
# "corp.",
|
||||||
"dail.",
|
# "d.",
|
||||||
"dek.",
|
# "dab.",
|
||||||
"deš.",
|
# "dail.",
|
||||||
"dir.",
|
# "dek.",
|
||||||
"dirig.",
|
# "deš.",
|
||||||
"doc.",
|
# "dir.",
|
||||||
"dol.",
|
# "dirig.",
|
||||||
"dr.",
|
# "doc.",
|
||||||
"drp.",
|
# "dol.",
|
||||||
"dvit.",
|
# "dr.",
|
||||||
"dėst.",
|
# "drp.",
|
||||||
"dš.",
|
# "dvit.",
|
||||||
"dž.",
|
# "dėst.",
|
||||||
"e.b.",
|
# "dš.",
|
||||||
"e.bankas",
|
# "dž.",
|
||||||
"e.p.",
|
# "e.b.",
|
||||||
"e.parašas",
|
# "e.bankas",
|
||||||
"e.paštas",
|
# "e.p.",
|
||||||
"e.v.",
|
# "e.parašas",
|
||||||
"e.valdžia",
|
# "e.paštas",
|
||||||
"egz.",
|
# "e.v.",
|
||||||
"eil.",
|
# "e.valdžia",
|
||||||
"ekon.",
|
# "egz.",
|
||||||
"el.",
|
# "eil.",
|
||||||
"el.bankas",
|
# "ekon.",
|
||||||
"el.p.",
|
# "el.",
|
||||||
"el.parašas",
|
# "el.bankas",
|
||||||
"el.paštas",
|
# "el.p.",
|
||||||
"el.valdžia",
|
# "el.parašas",
|
||||||
"etc.",
|
# "el.paštas",
|
||||||
"ež.",
|
# "el.valdžia",
|
||||||
"fak.",
|
# "etc.",
|
||||||
"faks.",
|
# "ež.",
|
||||||
"feat.",
|
# "fak.",
|
||||||
"filol.",
|
# "faks.",
|
||||||
"filos.",
|
# "feat.",
|
||||||
"g.",
|
# "filol.",
|
||||||
"gen.",
|
# "filos.",
|
||||||
"geol.",
|
# "g.",
|
||||||
"gerb.",
|
# "gen.",
|
||||||
"gim.",
|
# "geol.",
|
||||||
"gr.",
|
# "gerb.",
|
||||||
"gv.",
|
# "gim.",
|
||||||
"gyd.",
|
# "gr.",
|
||||||
"gyv.",
|
# "gv.",
|
||||||
"habil.",
|
# "gyd.",
|
||||||
"inc.",
|
# "gyv.",
|
||||||
"insp.",
|
# "habil.",
|
||||||
"inž.",
|
# "inc.",
|
||||||
"ir pan.",
|
# "insp.",
|
||||||
"ir t. t.",
|
# "inž.",
|
||||||
"isp.",
|
# "ir pan.",
|
||||||
"istor.",
|
# "ir t. t.",
|
||||||
"it.",
|
# "isp.",
|
||||||
"just.",
|
# "istor.",
|
||||||
"k.",
|
# "it.",
|
||||||
"k. a.",
|
# "just.",
|
||||||
"k.a.",
|
# "k.",
|
||||||
"kab.",
|
# "k. a.",
|
||||||
"kand.",
|
# "k.a.",
|
||||||
"kart.",
|
# "kab.",
|
||||||
"kat.",
|
# "kand.",
|
||||||
"ketv.",
|
# "kart.",
|
||||||
"kh.",
|
# "kat.",
|
||||||
"kl.",
|
# "ketv.",
|
||||||
"kln.",
|
# "kh.",
|
||||||
"km.",
|
# "kl.",
|
||||||
"kn.",
|
# "kln.",
|
||||||
"koresp.",
|
# "km.",
|
||||||
"kpt.",
|
# "kn.",
|
||||||
"kr.",
|
# "koresp.",
|
||||||
"kt.",
|
# "kpt.",
|
||||||
"kub.",
|
# "kr.",
|
||||||
"kun.",
|
# "kt.",
|
||||||
"kv.",
|
# "kub.",
|
||||||
"kyš.",
|
# "kun.",
|
||||||
"l. e. p.",
|
# "kv.",
|
||||||
"l.e.p.",
|
# "kyš.",
|
||||||
"lenk.",
|
# "l. e. p.",
|
||||||
"liet.",
|
# "l.e.p.",
|
||||||
"lot.",
|
# "lenk.",
|
||||||
"lt.",
|
# "liet.",
|
||||||
"ltd.",
|
# "lot.",
|
||||||
"ltn.",
|
# "lt.",
|
||||||
"m.",
|
# "ltd.",
|
||||||
"m.e..",
|
# "ltn.",
|
||||||
"m.m.",
|
# "m.",
|
||||||
"mat.",
|
# "m.e..",
|
||||||
"med.",
|
# "m.m.",
|
||||||
"mgnt.",
|
# "mat.",
|
||||||
"mgr.",
|
# "med.",
|
||||||
"min.",
|
# "mgnt.",
|
||||||
"mjr.",
|
# "mgr.",
|
||||||
"ml.",
|
# "min.",
|
||||||
"mln.",
|
# "mjr.",
|
||||||
"mlrd.",
|
# "ml.",
|
||||||
"mob.",
|
# "mln.",
|
||||||
"mok.",
|
# "mlrd.",
|
||||||
"moksl.",
|
# "mob.",
|
||||||
"mokyt.",
|
# "mok.",
|
||||||
"mot.",
|
# "moksl.",
|
||||||
"mr.",
|
# "mokyt.",
|
||||||
"mst.",
|
# "mot.",
|
||||||
"mstl.",
|
# "mr.",
|
||||||
"mėn.",
|
# "mst.",
|
||||||
"nkt.",
|
# "mstl.",
|
||||||
"no.",
|
# "mėn.",
|
||||||
"nr.",
|
# "nkt.",
|
||||||
"ntk.",
|
# "no.",
|
||||||
"nuotr.",
|
# "nr.",
|
||||||
"op.",
|
# "ntk.",
|
||||||
"org.",
|
# "nuotr.",
|
||||||
"orig.",
|
# "op.",
|
||||||
"p.",
|
# "org.",
|
||||||
"p.d.",
|
# "orig.",
|
||||||
"p.m.e.",
|
# "p.",
|
||||||
"p.s.",
|
# "p.d.",
|
||||||
"pab.",
|
# "p.m.e.",
|
||||||
"pan.",
|
# "p.s.",
|
||||||
"past.",
|
# "pab.",
|
||||||
"pav.",
|
# "pan.",
|
||||||
"pavad.",
|
# "past.",
|
||||||
"per.",
|
# "pav.",
|
||||||
"perd.",
|
# "pavad.",
|
||||||
"pirm.",
|
# "per.",
|
||||||
"pl.",
|
# "perd.",
|
||||||
"plg.",
|
# "pirm.",
|
||||||
"plk.",
|
# "pl.",
|
||||||
"pr.",
|
# "plg.",
|
||||||
"pr.Kr.",
|
# "plk.",
|
||||||
"pranc.",
|
# "pr.",
|
||||||
"proc.",
|
# "pr.Kr.",
|
||||||
"prof.",
|
# "pranc.",
|
||||||
"prom.",
|
# "proc.",
|
||||||
"prot.",
|
# "prof.",
|
||||||
"psl.",
|
# "prom.",
|
||||||
"pss.",
|
# "prot.",
|
||||||
"pvz.",
|
# "psl.",
|
||||||
"pšt.",
|
# "pss.",
|
||||||
"r.",
|
# "pvz.",
|
||||||
"raj.",
|
# "pšt.",
|
||||||
"red.",
|
# "r.",
|
||||||
"rez.",
|
# "raj.",
|
||||||
"rež.",
|
# "red.",
|
||||||
"rus.",
|
# "rez.",
|
||||||
"rš.",
|
# "rež.",
|
||||||
"s.",
|
# "rus.",
|
||||||
"sav.",
|
# "rš.",
|
||||||
"saviv.",
|
# "s.",
|
||||||
"sek.",
|
# "sav.",
|
||||||
"sekr.",
|
# "saviv.",
|
||||||
"sen.",
|
# "sek.",
|
||||||
"sh.",
|
# "sekr.",
|
||||||
"sk.",
|
# "sen.",
|
||||||
"skg.",
|
# "sh.",
|
||||||
"skv.",
|
# "sk.",
|
||||||
"skyr.",
|
# "skg.",
|
||||||
"sp.",
|
# "skv.",
|
||||||
"spec.",
|
# "skyr.",
|
||||||
"sr.",
|
# "sp.",
|
||||||
"st.",
|
# "spec.",
|
||||||
"str.",
|
# "sr.",
|
||||||
"stud.",
|
# "st.",
|
||||||
"sąs.",
|
# "str.",
|
||||||
"t.",
|
# "stud.",
|
||||||
"t. p.",
|
# "sąs.",
|
||||||
"t. y.",
|
# "t.",
|
||||||
"t.p.",
|
# "t. p.",
|
||||||
"t.t.",
|
# "t. y.",
|
||||||
"t.y.",
|
# "t.p.",
|
||||||
"techn.",
|
# "t.t.",
|
||||||
"tel.",
|
# "t.y.",
|
||||||
"teol.",
|
# "techn.",
|
||||||
"th.",
|
# "tel.",
|
||||||
"tir.",
|
# "teol.",
|
||||||
"trit.",
|
# "th.",
|
||||||
"trln.",
|
# "tir.",
|
||||||
"tšk.",
|
# "trit.",
|
||||||
"tūks.",
|
# "trln.",
|
||||||
"tūkst.",
|
# "tšk.",
|
||||||
"up.",
|
# "tūks.",
|
||||||
"upl.",
|
# "tūkst.",
|
||||||
"v.s.",
|
# "up.",
|
||||||
"vad.",
|
# "upl.",
|
||||||
"val.",
|
# "v.s.",
|
||||||
"valg.",
|
# "vad.",
|
||||||
"ved.",
|
# "val.",
|
||||||
"vert.",
|
# "valg.",
|
||||||
"vet.",
|
# "ved.",
|
||||||
"vid.",
|
# "vert.",
|
||||||
"virš.",
|
# "vet.",
|
||||||
"vlsč.",
|
# "vid.",
|
||||||
"vnt.",
|
# "virš.",
|
||||||
"vok.",
|
# "vlsč.",
|
||||||
"vs.",
|
# "vnt.",
|
||||||
"vtv.",
|
# "vok.",
|
||||||
"vv.",
|
# "vs.",
|
||||||
"vyr.",
|
# "vtv.",
|
||||||
"vyresn.",
|
# "vv.",
|
||||||
"zool.",
|
# "vyr.",
|
||||||
"Įn",
|
# "vyresn.",
|
||||||
"įl.",
|
# "zool.",
|
||||||
"š.m.",
|
# "Įn",
|
||||||
"šnek.",
|
# "įl.",
|
||||||
"šv.",
|
# "š.m.",
|
||||||
"švč.",
|
# "šnek.",
|
||||||
"ž.ū.",
|
# "šv.",
|
||||||
"žin.",
|
# "švč.",
|
||||||
"žml.",
|
# "ž.ū.",
|
||||||
"žr.",
|
# "žin.",
|
||||||
|
# "žml.",
|
||||||
|
# "žr.",
|
||||||
]:
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
18
spacy/lang/ml/__init__.py
Normal file
18
spacy/lang/ml/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
class MalayalamDefaults(Language.Defaults):
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Malayalam(Language):
|
||||||
|
lang = "ml"
|
||||||
|
Defaults = MalayalamDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Malayalam"]
|
19
spacy/lang/ml/examples.py
Normal file
19
spacy/lang/ml/examples.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.ml.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക",
|
||||||
|
"പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി",
|
||||||
|
"എന്താണ് കവാടങ്ങൾ?",
|
||||||
|
"ചുരുക്കത്തിൽ വിക്കിപീഡിയയുടെ ഉള്ളടക്കത്തിലേക്കുള്ള പടിപ്പുരകളാണ് കവാടങ്ങൾ. അവ ലളിതവും വായനക്കാരനെ ആകർഷിക്കുന്നതുമായിരിക്കും",
|
||||||
|
"പതിനൊന്നുപേർ വീതമുള്ള രണ്ടു ടീമുകൾ കളിക്കുന്ന സംഘകായിക വിനോദമാണു ക്രിക്കറ്റ്",
|
||||||
|
]
|
80
spacy/lang/ml/lex_attrs.py
Normal file
80
spacy/lang/ml/lex_attrs.py
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
|
# Reference: https://www.omniglot.com/language/numbers/malayalam.htm
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"പൂജ്യം ",
|
||||||
|
"ഒന്ന് ",
|
||||||
|
"രണ്ട് ",
|
||||||
|
"മൂന്ന് ",
|
||||||
|
"നാല് ",
|
||||||
|
"അഞ്ച് ",
|
||||||
|
"ആറ് ",
|
||||||
|
"ഏഴ് ",
|
||||||
|
"എട്ട് ",
|
||||||
|
"ഒന്പത് ",
|
||||||
|
"പത്ത് ",
|
||||||
|
"പതിനൊന്ന്",
|
||||||
|
"പന്ത്രണ്ട്",
|
||||||
|
"പതി മൂന്നു",
|
||||||
|
"പതിനാല്",
|
||||||
|
"പതിനഞ്ച്",
|
||||||
|
"പതിനാറ്",
|
||||||
|
"പതിനേഴ്",
|
||||||
|
"പതിനെട്ട്",
|
||||||
|
"പത്തൊമ്പതു",
|
||||||
|
"ഇരുപത്",
|
||||||
|
"ഇരുപത്തിഒന്ന്",
|
||||||
|
"ഇരുപത്തിരണ്ട്",
|
||||||
|
"ഇരുപത്തിമൂന്ന്",
|
||||||
|
"ഇരുപത്തിനാല്",
|
||||||
|
"ഇരുപത്തിഅഞ്ചു",
|
||||||
|
"ഇരുപത്തിആറ്",
|
||||||
|
"ഇരുപത്തിഏഴ്",
|
||||||
|
"ഇരുപത്തിഎട്ടു",
|
||||||
|
"ഇരുപത്തിഒന്പത്",
|
||||||
|
"മുപ്പത്",
|
||||||
|
"മുപ്പത്തിഒന്ന്",
|
||||||
|
"മുപ്പത്തിരണ്ട്",
|
||||||
|
"മുപ്പത്തിമൂന്ന്",
|
||||||
|
"മുപ്പത്തിനാല്",
|
||||||
|
"മുപ്പത്തിഅഞ്ചു",
|
||||||
|
"മുപ്പത്തിആറ്",
|
||||||
|
"മുപ്പത്തിഏഴ്",
|
||||||
|
"മുപ്പത്തിഎട്ട്",
|
||||||
|
"മുപ്പത്തിഒന്പതു",
|
||||||
|
"നാല്പത് ",
|
||||||
|
"അന്പത് ",
|
||||||
|
"അറുപത് ",
|
||||||
|
"എഴുപത് ",
|
||||||
|
"എണ്പത് ",
|
||||||
|
"തൊണ്ണൂറ് ",
|
||||||
|
"നുറ് ",
|
||||||
|
"ആയിരം ",
|
||||||
|
"പത്തുലക്ഷം"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
"""
|
||||||
|
Check if text resembles a number
|
||||||
|
"""
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
18
spacy/lang/ml/stop_words.py
Normal file
18
spacy/lang/ml/stop_words.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
|
||||||
|
"""
|
||||||
|
അത്
|
||||||
|
ഇത്
|
||||||
|
ആയിരുന്നു
|
||||||
|
ആകുന്നു
|
||||||
|
വരെ
|
||||||
|
അന്നേരം
|
||||||
|
അന്ന്
|
||||||
|
ഇന്ന്
|
||||||
|
ആണ്
|
||||||
|
""".split()
|
||||||
|
)
|
|
@ -2,6 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .morph_rules import MORPH_RULES
|
from .morph_rules import MORPH_RULES
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults):
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
)
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user