mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-16 04:26:27 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
dc94052d6e
106
.github/contributors/Baciccin.md
vendored
Normal file
106
.github/contributors/Baciccin.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Giovanni Battista Parodi |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-19 |
|
||||||
|
| GitHub username | Baciccin |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/MiniLau.md
vendored
Normal file
106
.github/contributors/MiniLau.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Desausoi Laurent |
|
||||||
|
| Company name (if applicable) | / |
|
||||||
|
| Title or role (if applicable) | / |
|
||||||
|
| Date | 22 November 2019 |
|
||||||
|
| GitHub username | MiniLau |
|
||||||
|
| Website (optional) | / |
|
106
.github/contributors/Mlawrence95.md
vendored
Normal file
106
.github/contributors/Mlawrence95.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Mike Lawrence |
|
||||||
|
| Company name (if applicable) | NA |
|
||||||
|
| Title or role (if applicable) | NA |
|
||||||
|
| Date | April 17, 2020 |
|
||||||
|
| GitHub username | Mlawrence95 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/YohannesDatasci.md
vendored
Normal file
106
.github/contributors/YohannesDatasci.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Yohannes |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-02 |
|
||||||
|
| GitHub username | YohannesDatasci |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/chopeen.md
vendored
Normal file
106
.github/contributors/chopeen.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Marek Grzenkowicz |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020.04.10 |
|
||||||
|
| GitHub username | chopeen |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/elben10
vendored
Normal file
106
.github/contributors/elben10
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Jakob Jul Elben |
|
||||||
|
| Company name (if applicable) | N/A |
|
||||||
|
| Title or role (if applicable) | N/A |
|
||||||
|
| Date | April 16th, 2020 |
|
||||||
|
| GitHub username | elben10 |
|
||||||
|
| Website (optional) | N/A |
|
106
.github/contributors/ilivans.md
vendored
Normal file
106
.github/contributors/ilivans.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Ilia Ivanov |
|
||||||
|
| Company name (if applicable) | Chattermill |
|
||||||
|
| Title or role (if applicable) | DL Engineer |
|
||||||
|
| Date | 2020-05-14 |
|
||||||
|
| GitHub username | ilivans |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/jacse.md
vendored
Normal file
106
.github/contributors/jacse.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Jacob Lauritzen |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-30 |
|
||||||
|
| GitHub username | jacse |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/kevinlu1248.md
vendored
Normal file
106
.github/contributors/kevinlu1248.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Kevin Lu|
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | Student|
|
||||||
|
| Date | |
|
||||||
|
| GitHub username | kevinlu1248|
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/laszabine.md
vendored
Normal file
106
.github/contributors/laszabine.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Sabine Laszakovits |
|
||||||
|
| Company name (if applicable) | Austrian Academy of Sciences |
|
||||||
|
| Title or role (if applicable) | Data analyst |
|
||||||
|
| Date | 2020-04-16 |
|
||||||
|
| GitHub username | laszabine |
|
||||||
|
| Website (optional) | https://sabine.laszakovits.net |
|
106
.github/contributors/leicmi.md
vendored
Normal file
106
.github/contributors/leicmi.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Michael Leichtfried |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 30.03.2020 |
|
||||||
|
| GitHub username | leicmi |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/louisguitton.md
vendored
Normal file
106
.github/contributors/louisguitton.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Louis Guitton |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-25 |
|
||||||
|
| GitHub username | louisguitton |
|
||||||
|
| Website (optional) | https://guitton.co/ |
|
106
.github/contributors/michael-k.md
vendored
Normal file
106
.github/contributors/michael-k.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Michael Käufl |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-23 |
|
||||||
|
| GitHub username | michael-k |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/nikhilsaldanha.md
vendored
Normal file
106
.github/contributors/nikhilsaldanha.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Nikhil Saldanha |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-17 |
|
||||||
|
| GitHub username | nikhilsaldanha |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/osori.md
vendored
Normal file
106
.github/contributors/osori.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Ilkyu Ju |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-05-17 |
|
||||||
|
| GitHub username | osori |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/paoloq.md
vendored
Normal file
106
.github/contributors/paoloq.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Paolo Arduin |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 9 April 2020 |
|
||||||
|
| GitHub username | paoloq |
|
||||||
|
| Website (optional) | |
|
107
.github/contributors/punitvara.md
vendored
Normal file
107
.github/contributors/punitvara.md
vendored
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Punit Vara |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-26 |
|
||||||
|
| GitHub username | punitvara |
|
||||||
|
| Website (optional) | https://punitvara.com |
|
||||||
|
|
106
.github/contributors/sabiqueqb.md
vendored
Normal file
106
.github/contributors/sabiqueqb.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Sabique Ahammed Lava |
|
||||||
|
| Company name (if applicable) | QBurst |
|
||||||
|
| Title or role (if applicable) | Senior Engineer |
|
||||||
|
| Date | 24 Apr 2020 |
|
||||||
|
| GitHub username | sabiqueqb |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/sebastienharinck.md
vendored
Normal file
106
.github/contributors/sebastienharinck.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------------------------------- |
|
||||||
|
| Name | Sébastien Harinck |
|
||||||
|
| Company name (if applicable) | Odaxiom |
|
||||||
|
| Title or role (if applicable) | ML Engineer |
|
||||||
|
| Date | 2020-04-15 |
|
||||||
|
| GitHub username | sebastienharinck |
|
||||||
|
| Website (optional) | [https://odaxiom.com](https://odaxiom.com) |
|
106
.github/contributors/thomasthiebaud.md
vendored
Normal file
106
.github/contributors/thomasthiebaud.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
- Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
- to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
- each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
| ----------------------------- | --------------- |
|
||||||
|
| Name | Thomas Thiebaud |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-07 |
|
||||||
|
| GitHub username | thomasthiebaud |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/tommilligan.md
vendored
Normal file
106
.github/contributors/tommilligan.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
- Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
- to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
- each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
| ----------------------------- | ------------ |
|
||||||
|
| Name | Tom Milligan |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-03-24 |
|
||||||
|
| GitHub username | tommilligan |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/umarbutler.md
vendored
Normal file
106
.github/contributors/umarbutler.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Umar Butler |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-04-09 |
|
||||||
|
| GitHub username | umarbutler |
|
||||||
|
| Website (optional) | https://umarbutler.com |
|
106
.github/contributors/vishnupriyavr.md
vendored
Normal file
106
.github/contributors/vishnupriyavr.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------ |
|
||||||
|
| Name | Vishnu Priya VR |
|
||||||
|
| Company name (if applicable) | Uniphore |
|
||||||
|
| Title or role (if applicable) | NLP/AI Engineer |
|
||||||
|
| Date | 2020-05-03 |
|
||||||
|
| GitHub username | vishnupriyavr |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/vondersam.md
vendored
Normal file
106
.github/contributors/vondersam.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ------------------------|
|
||||||
|
| Name | Samuel Rodríguez Medina |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | Computational linguist |
|
||||||
|
| Date | 28 April 2020 |
|
||||||
|
| GitHub username | vondersam |
|
||||||
|
| Website (optional) | |
|
|
@ -1,6 +1,7 @@
|
||||||
"""Prevent catastrophic forgetting with rehearsal updates."""
|
"""Prevent catastrophic forgetting with rehearsal updates."""
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
import srsly
|
import srsly
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
|
||||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
sizes = compounding(1.0, 4.0, 1.001)
|
sizes = compounding(1.0, 4.0, 1.001)
|
||||||
with nlp.disable_pipes(*other_pipes):
|
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
|
||||||
|
# show warnings for misaligned entity spans once
|
||||||
|
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||||
|
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
random.shuffle(raw_docs)
|
random.shuffle(raw_docs)
|
||||||
|
|
|
@ -64,7 +64,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
||||||
"""Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
|
"""Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
|
||||||
The `vocab` should be the one used during creation of the KB."""
|
The `vocab` should be the one used during creation of the KB."""
|
||||||
vocab = Vocab().from_disk(vocab_path)
|
vocab = Vocab().from_disk(vocab_path)
|
||||||
# create blank Language class with correct vocab
|
# create blank English model with correct vocab
|
||||||
nlp = spacy.blank("en", vocab=vocab)
|
nlp = spacy.blank("en", vocab=vocab)
|
||||||
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
|
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
|
||||||
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
||||||
|
|
|
@ -8,12 +8,13 @@ For more details, see the documentation:
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
Last tested with: v2.1.0
|
Last tested with: v2.2.4
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
# get names of other pipes to disable them during training
|
# get names of other pipes to disable them during training
|
||||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
# only train NER
|
||||||
|
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
|
||||||
|
# show warnings for misaligned entity spans once
|
||||||
|
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||||
|
|
||||||
# reset and initialize the weights randomly – but only if we're
|
# reset and initialize the weights randomly – but only if we're
|
||||||
# training a new model
|
# training a new model
|
||||||
if model is None:
|
if model is None:
|
||||||
|
|
|
@ -24,12 +24,13 @@ For more details, see the documentation:
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||||
|
|
||||||
Compatible with: spaCy v2.1.0+
|
Compatible with: spaCy v2.1.0+
|
||||||
Last tested with: v2.1.0
|
Last tested with: v2.2.4
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
||||||
# get names of other pipes to disable them during training
|
# get names of other pipes to disable them during training
|
||||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
# only train NER
|
||||||
|
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
|
||||||
|
# show warnings for misaligned entity spans once
|
||||||
|
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||||
|
|
||||||
sizes = compounding(1.0, 4.0, 1.001)
|
sizes = compounding(1.0, 4.0, 1.001)
|
||||||
# batch up the examples using spaCy's minibatch
|
# batch up the examples using spaCy's minibatch
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
|
|
20
setup.cfg
20
setup.cfg
|
@ -30,7 +30,7 @@ zip_safe = false
|
||||||
include_package_data = true
|
include_package_data = true
|
||||||
scripts =
|
scripts =
|
||||||
bin/spacy
|
bin/spacy
|
||||||
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
|
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
|
||||||
setup_requires =
|
setup_requires =
|
||||||
wheel
|
wheel
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
|
@ -59,19 +59,23 @@ install_requires =
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=0.0.5,<0.2.0
|
spacy_lookups_data>=0.3.1,<0.4.0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4
|
cupy>=5.0.0b4,<9.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
cupy-cuda80>=5.0.0b4
|
cupy-cuda80>=5.0.0b4,<9.0.0
|
||||||
cuda90 =
|
cuda90 =
|
||||||
cupy-cuda90>=5.0.0b4
|
cupy-cuda90>=5.0.0b4,<9.0.0
|
||||||
cuda91 =
|
cuda91 =
|
||||||
cupy-cuda91>=5.0.0b4
|
cupy-cuda91>=5.0.0b4,<9.0.0
|
||||||
cuda92 =
|
cuda92 =
|
||||||
cupy-cuda92>=5.0.0b4
|
cupy-cuda92>=5.0.0b4,<9.0.0
|
||||||
cuda100 =
|
cuda100 =
|
||||||
cupy-cuda100>=5.0.0b4
|
cupy-cuda100>=5.0.0b4,<9.0.0
|
||||||
|
cuda101 =
|
||||||
|
cupy-cuda101>=5.0.0b4,<9.0.0
|
||||||
|
cuda102 =
|
||||||
|
cupy-cuda102>=5.0.0b4,<9.0.0
|
||||||
# Language tokenizers with external dependencies
|
# Language tokenizers with external dependencies
|
||||||
ja =
|
ja =
|
||||||
fugashi>=0.1.3
|
fugashi>=0.1.3
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -31,7 +31,6 @@ PACKAGES = find_packages()
|
||||||
|
|
||||||
|
|
||||||
MOD_NAMES = [
|
MOD_NAMES = [
|
||||||
"spacy._align",
|
|
||||||
"spacy.parts_of_speech",
|
"spacy.parts_of_speech",
|
||||||
"spacy.strings",
|
"spacy.strings",
|
||||||
"spacy.lexeme",
|
"spacy.lexeme",
|
||||||
|
|
|
@ -13,7 +13,7 @@ from . import pipeline
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info as cli_info
|
||||||
from .glossary import explain
|
from .glossary import explain
|
||||||
from .about import __version__
|
from .about import __version__
|
||||||
from .errors import Errors, Warnings, deprecation_warning
|
from .errors import Errors, Warnings
|
||||||
from . import util
|
from . import util
|
||||||
from .util import registry
|
from .util import registry
|
||||||
from .language import component
|
from .language import component
|
||||||
|
@ -26,7 +26,7 @@ if sys.maxunicode == 65535:
|
||||||
def load(name, **overrides):
|
def load(name, **overrides):
|
||||||
depr_path = overrides.get("path")
|
depr_path = overrides.get("path")
|
||||||
if depr_path not in (True, False, None):
|
if depr_path not in (True, False, None):
|
||||||
deprecation_warning(Warnings.W001.format(path=depr_path))
|
warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
|
||||||
return util.load_model(name, **overrides)
|
return util.load_model(name, **overrides)
|
||||||
|
|
||||||
|
|
||||||
|
|
255
spacy/_align.pyx
255
spacy/_align.pyx
|
@ -1,255 +0,0 @@
|
||||||
# cython: infer_types=True
|
|
||||||
'''Do Levenshtein alignment, for evaluation of tokenized input.
|
|
||||||
|
|
||||||
Random notes:
|
|
||||||
|
|
||||||
r i n g
|
|
||||||
0 1 2 3 4
|
|
||||||
r 1 0 1 2 3
|
|
||||||
a 2 1 1 2 3
|
|
||||||
n 3 2 2 1 2
|
|
||||||
g 4 3 3 2 1
|
|
||||||
|
|
||||||
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
|
|
||||||
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
|
|
||||||
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
|
|
||||||
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
|
|
||||||
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
|
|
||||||
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
|
|
||||||
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
|
|
||||||
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
|
|
||||||
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
|
|
||||||
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
|
|
||||||
2,2: (3,3)
|
|
||||||
3,2: (4,3)
|
|
||||||
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
|
|
||||||
|
|
||||||
We know the costs to transition:
|
|
||||||
|
|
||||||
S[:i] -> T[:j] (at D[i,j])
|
|
||||||
S[:i+1] -> T[:j] (at D[i+1,j])
|
|
||||||
S[:i] -> T[:j+1] (at D[i,j+1])
|
|
||||||
|
|
||||||
Further, now we can transform:
|
|
||||||
S[:i+1] -> S[:i] (DEL) for 1,
|
|
||||||
T[:j+1] -> T[:j] (INS) for 1.
|
|
||||||
S[i+1] -> T[j+1] (SUB) for 0 or 1
|
|
||||||
|
|
||||||
Therefore we have the costs:
|
|
||||||
SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j])
|
|
||||||
i.e. D[i, j] + S[i+1] != T[j+1]
|
|
||||||
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
|
|
||||||
i.e. D[i+1,j] + 1
|
|
||||||
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
|
|
||||||
i.e. D[i,j+1] + 1
|
|
||||||
|
|
||||||
Source string S has length m, with index i
|
|
||||||
Target string T has length n, with index j
|
|
||||||
|
|
||||||
Output two alignment vectors: i2j (length m) and j2i (length n)
|
|
||||||
# function LevenshteinDistance(char s[1..m], char t[1..n]):
|
|
||||||
# for all i and j, d[i,j] will hold the Levenshtein distance between
|
|
||||||
# the first i characters of s and the first j characters of t
|
|
||||||
# note that d has (m+1)*(n+1) values
|
|
||||||
# set each element in d to zero
|
|
||||||
ring rang
|
|
||||||
- r i n g
|
|
||||||
- 0 0 0 0 0
|
|
||||||
r 0 0 0 0 0
|
|
||||||
a 0 0 0 0 0
|
|
||||||
n 0 0 0 0 0
|
|
||||||
g 0 0 0 0 0
|
|
||||||
|
|
||||||
# source prefixes can be transformed into empty string by
|
|
||||||
# dropping all characters
|
|
||||||
# d[i, 0] := i
|
|
||||||
ring rang
|
|
||||||
- r i n g
|
|
||||||
- 0 0 0 0 0
|
|
||||||
r 1 0 0 0 0
|
|
||||||
a 2 0 0 0 0
|
|
||||||
n 3 0 0 0 0
|
|
||||||
g 4 0 0 0 0
|
|
||||||
|
|
||||||
# target prefixes can be reached from empty source prefix
|
|
||||||
# by inserting every character
|
|
||||||
# d[0, j] := j
|
|
||||||
- r i n g
|
|
||||||
- 0 1 2 3 4
|
|
||||||
r 1 0 0 0 0
|
|
||||||
a 2 0 0 0 0
|
|
||||||
n 3 0 0 0 0
|
|
||||||
g 4 0 0 0 0
|
|
||||||
|
|
||||||
'''
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
import numpy
|
|
||||||
cimport numpy as np
|
|
||||||
from .compat import unicode_
|
|
||||||
from murmurhash.mrmr cimport hash32
|
|
||||||
|
|
||||||
|
|
||||||
def align(S, T):
|
|
||||||
cdef int m = len(S)
|
|
||||||
cdef int n = len(T)
|
|
||||||
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
|
|
||||||
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
|
|
||||||
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
|
|
||||||
|
|
||||||
cdef np.ndarray S_arr = _convert_sequence(S)
|
|
||||||
cdef np.ndarray T_arr = _convert_sequence(T)
|
|
||||||
|
|
||||||
fill_matrix(<int*>matrix.data,
|
|
||||||
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
|
|
||||||
fill_i2j(i2j, matrix)
|
|
||||||
fill_j2i(j2i, matrix)
|
|
||||||
for i in range(i2j.shape[0]):
|
|
||||||
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
|
|
||||||
i2j[i] = -1
|
|
||||||
for j in range(j2i.shape[0]):
|
|
||||||
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
|
|
||||||
j2i[j] = -1
|
|
||||||
return matrix[-1,-1], i2j, j2i, matrix
|
|
||||||
|
|
||||||
|
|
||||||
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
|
|
||||||
'''Let's say we had:
|
|
||||||
|
|
||||||
Guess: [aa bb cc dd]
|
|
||||||
Truth: [aa bbcc dd]
|
|
||||||
i2j: [0, None, -2, 2]
|
|
||||||
j2i: [0, -2, 3]
|
|
||||||
|
|
||||||
We want:
|
|
||||||
|
|
||||||
i2j_multi: {1: 1, 2: 1}
|
|
||||||
j2i_multi: {}
|
|
||||||
'''
|
|
||||||
i2j_miss = _get_regions(i2j, i_lengths)
|
|
||||||
j2i_miss = _get_regions(j2i, j_lengths)
|
|
||||||
|
|
||||||
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
|
|
||||||
return i2j_multi, j2i_multi
|
|
||||||
|
|
||||||
|
|
||||||
def _get_regions(alignment, lengths):
|
|
||||||
regions = {}
|
|
||||||
start = None
|
|
||||||
offset = 0
|
|
||||||
for i in range(len(alignment)):
|
|
||||||
if alignment[i] < 0:
|
|
||||||
if start is None:
|
|
||||||
start = offset
|
|
||||||
regions.setdefault(start, [])
|
|
||||||
regions[start].append(i)
|
|
||||||
else:
|
|
||||||
start = None
|
|
||||||
offset += lengths[i]
|
|
||||||
return regions
|
|
||||||
|
|
||||||
|
|
||||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
|
|
||||||
i2j = {}
|
|
||||||
j2i = {}
|
|
||||||
for start, region1 in miss1.items():
|
|
||||||
if not region1 or start not in miss2:
|
|
||||||
continue
|
|
||||||
region2 = miss2[start]
|
|
||||||
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
|
|
||||||
j = region2.pop(0)
|
|
||||||
buff = []
|
|
||||||
# Consume tokens from region 1, until we meet the length of the
|
|
||||||
# first token in region2. If we do, align the tokens. If
|
|
||||||
# we exceed the length, break.
|
|
||||||
while region1:
|
|
||||||
buff.append(region1.pop(0))
|
|
||||||
if sum(lengths1[i] for i in buff) == lengths2[j]:
|
|
||||||
for i in buff:
|
|
||||||
i2j[i] = j
|
|
||||||
j2i[j] = buff[-1]
|
|
||||||
j += 1
|
|
||||||
buff = []
|
|
||||||
elif sum(lengths1[i] for i in buff) > lengths2[j]:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
|
|
||||||
for i in buff:
|
|
||||||
i2j[i] = j
|
|
||||||
j2i[j] = buff[-1]
|
|
||||||
return i2j, j2i
|
|
||||||
|
|
||||||
|
|
||||||
def _convert_sequence(seq):
|
|
||||||
if isinstance(seq, numpy.ndarray):
|
|
||||||
return numpy.ascontiguousarray(seq, dtype='uint32_t')
|
|
||||||
cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
|
|
||||||
cdef bytes item_bytes
|
|
||||||
for i, item in enumerate(seq):
|
|
||||||
if item == "``":
|
|
||||||
item = '"'
|
|
||||||
elif item == "''":
|
|
||||||
item = '"'
|
|
||||||
if isinstance(item, unicode):
|
|
||||||
item_bytes = item.encode('utf8')
|
|
||||||
else:
|
|
||||||
item_bytes = item
|
|
||||||
output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
cdef void fill_matrix(int* D,
|
|
||||||
const int* S, int m, const int* T, int n) nogil:
|
|
||||||
m1 = m+1
|
|
||||||
n1 = n+1
|
|
||||||
for i in range(m1*n1):
|
|
||||||
D[i] = 0
|
|
||||||
|
|
||||||
for i in range(m1):
|
|
||||||
D[i*n1] = i
|
|
||||||
|
|
||||||
for j in range(n1):
|
|
||||||
D[j] = j
|
|
||||||
|
|
||||||
cdef int sub_cost, ins_cost, del_cost
|
|
||||||
for j in range(n):
|
|
||||||
for i in range(m):
|
|
||||||
i_j = i*n1 + j
|
|
||||||
i1_j1 = (i+1)*n1 + j+1
|
|
||||||
i1_j = (i+1)*n1 + j
|
|
||||||
i_j1 = i*n1 + j+1
|
|
||||||
if S[i] != T[j]:
|
|
||||||
sub_cost = D[i_j] + 1
|
|
||||||
else:
|
|
||||||
sub_cost = D[i_j]
|
|
||||||
del_cost = D[i_j1] + 1
|
|
||||||
ins_cost = D[i1_j] + 1
|
|
||||||
best = min(min(sub_cost, ins_cost), del_cost)
|
|
||||||
D[i1_j1] = best
|
|
||||||
|
|
||||||
|
|
||||||
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
|
|
||||||
j = D.shape[1]-2
|
|
||||||
cdef int i = D.shape[0]-2
|
|
||||||
while i >= 0:
|
|
||||||
while D[i+1, j] < D[i+1, j+1]:
|
|
||||||
j -= 1
|
|
||||||
if D[i, j+1] < D[i+1, j+1]:
|
|
||||||
i2j[i] = -1
|
|
||||||
else:
|
|
||||||
i2j[i] = j
|
|
||||||
j -= 1
|
|
||||||
i -= 1
|
|
||||||
|
|
||||||
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
|
|
||||||
i = D.shape[0]-2
|
|
||||||
cdef int j = D.shape[1]-2
|
|
||||||
while j >= 0:
|
|
||||||
while D[i, j+1] < D[i+1, j+1]:
|
|
||||||
i -= 1
|
|
||||||
if D[i+1, j] < D[i+1, j+1]:
|
|
||||||
j2i[j] = -1
|
|
||||||
else:
|
|
||||||
j2i[j] = i
|
|
||||||
i -= 1
|
|
||||||
j -= 1
|
|
18
spacy/_ml.py
18
spacy/_ml.py
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
import warnings
|
||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
||||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||||
from thinc.t2v import Pooling, sum_pool, mean_pool
|
from thinc.t2v import Pooling, sum_pool, mean_pool
|
||||||
|
@ -22,7 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||||
import thinc.extra.load_nlp
|
import thinc.extra.load_nlp
|
||||||
|
|
||||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
from .errors import Errors, user_warning, Warnings
|
from .errors import Errors, Warnings
|
||||||
from . import util
|
from . import util
|
||||||
from . import ml as new_ml
|
from . import ml as new_ml
|
||||||
from .ml import _legacy_tok2vec
|
from .ml import _legacy_tok2vec
|
||||||
|
@ -278,18 +279,19 @@ class PrecomputableAffine(Model):
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def link_vectors_to_models(vocab):
|
def link_vectors_to_models(vocab, skip_rank=False):
|
||||||
vectors = vocab.vectors
|
vectors = vocab.vectors
|
||||||
if vectors.name is None:
|
if vectors.name is None:
|
||||||
vectors.name = VECTORS_KEY
|
vectors.name = VECTORS_KEY
|
||||||
if vectors.data.size != 0:
|
if vectors.data.size != 0:
|
||||||
user_warning(Warnings.W020.format(shape=vectors.data.shape))
|
warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
|
if not skip_rank:
|
||||||
for word in vocab:
|
for word in vocab:
|
||||||
if word.orth in vectors.key2row:
|
if word.orth in vectors.key2row:
|
||||||
word.rank = vectors.key2row[word.orth]
|
word.rank = vectors.key2row[word.orth]
|
||||||
else:
|
else:
|
||||||
word.rank = 0
|
word.rank = util.OOV_RANK
|
||||||
data = ops.asarray(vectors.data)
|
data = ops.asarray(vectors.data)
|
||||||
# Set an entry here, so that vectors are accessed by StaticVectors
|
# Set an entry here, so that vectors are accessed by StaticVectors
|
||||||
# (unideal, I know)
|
# (unideal, I know)
|
||||||
|
@ -299,7 +301,7 @@ def link_vectors_to_models(vocab):
|
||||||
# This is a hack to avoid the problem in #3853.
|
# This is a hack to avoid the problem in #3853.
|
||||||
old_name = vectors.name
|
old_name = vectors.name
|
||||||
new_name = vectors.name + "_%d" % data.shape[0]
|
new_name = vectors.name + "_%d" % data.shape[0]
|
||||||
user_warning(Warnings.W019.format(old=old_name, new=new_name))
|
warnings.warn(Warnings.W019.format(old=old_name, new=new_name))
|
||||||
vectors.name = new_name
|
vectors.name = new_name
|
||||||
key = (ops.device, vectors.name)
|
key = (ops.device, vectors.name)
|
||||||
thinc.extra.load_nlp.VECTORS[key] = data
|
thinc.extra.load_nlp.VECTORS[key] = data
|
||||||
|
@ -693,9 +695,11 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
)
|
)
|
||||||
|
|
||||||
linear_model = build_bow_text_classifier(
|
linear_model = build_bow_text_classifier(
|
||||||
nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
|
nr_class,
|
||||||
|
ngram_size=cfg.get("ngram_size", 1),
|
||||||
|
exclusive_classes=cfg.get("exclusive_classes", False),
|
||||||
)
|
)
|
||||||
if cfg.get("exclusive_classes"):
|
if cfg.get("exclusive_classes", False):
|
||||||
output_layer = Softmax(nr_class, nr_class * 2)
|
output_layer = Softmax(nr_class, nr_class * 2)
|
||||||
else:
|
else:
|
||||||
output_layer = (
|
output_layer = (
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from .tokens import Doc, Token, Span
|
from .tokens import Doc, Token, Span
|
||||||
from .errors import Errors, Warnings, user_warning
|
from .errors import Errors, Warnings
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
||||||
|
@ -34,7 +36,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
||||||
if not fulfilled:
|
if not fulfilled:
|
||||||
problems.append(annot)
|
problems.append(annot)
|
||||||
if warn:
|
if warn:
|
||||||
user_warning(Warnings.W025.format(name=name, attr=annot))
|
warnings.warn(Warnings.W025.format(name=name, attr=annot))
|
||||||
return problems
|
return problems
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ cdef enum attr_id_t:
|
||||||
LIKE_NUM
|
LIKE_NUM
|
||||||
LIKE_EMAIL
|
LIKE_EMAIL
|
||||||
IS_STOP
|
IS_STOP
|
||||||
IS_OOV
|
IS_OOV_DEPRECATED
|
||||||
IS_BRACKET
|
IS_BRACKET
|
||||||
IS_QUOTE
|
IS_QUOTE
|
||||||
IS_LEFT_PUNCT
|
IS_LEFT_PUNCT
|
||||||
|
@ -94,3 +94,4 @@ cdef enum attr_id_t:
|
||||||
ENT_ID = symbols.ENT_ID
|
ENT_ID = symbols.ENT_ID
|
||||||
|
|
||||||
IDX
|
IDX
|
||||||
|
SENT_END
|
|
@ -16,7 +16,7 @@ IDS = {
|
||||||
"LIKE_NUM": LIKE_NUM,
|
"LIKE_NUM": LIKE_NUM,
|
||||||
"LIKE_EMAIL": LIKE_EMAIL,
|
"LIKE_EMAIL": LIKE_EMAIL,
|
||||||
"IS_STOP": IS_STOP,
|
"IS_STOP": IS_STOP,
|
||||||
"IS_OOV": IS_OOV,
|
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
|
||||||
"IS_BRACKET": IS_BRACKET,
|
"IS_BRACKET": IS_BRACKET,
|
||||||
"IS_QUOTE": IS_QUOTE,
|
"IS_QUOTE": IS_QUOTE,
|
||||||
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
|
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
|
||||||
|
@ -88,6 +88,7 @@ IDS = {
|
||||||
"ENT_KB_ID": ENT_KB_ID,
|
"ENT_KB_ID": ENT_KB_ID,
|
||||||
"HEAD": HEAD,
|
"HEAD": HEAD,
|
||||||
"SENT_START": SENT_START,
|
"SENT_START": SENT_START,
|
||||||
|
"SENT_END": SENT_END,
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"PROB": PROB,
|
"PROB": PROB,
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
|
|
|
@ -108,9 +108,11 @@ def debug_data(
|
||||||
msg.good("Corpus is loadable")
|
msg.good("Corpus is loadable")
|
||||||
|
|
||||||
# Create all gold data here to avoid iterating over the train_docs constantly
|
# Create all gold data here to avoid iterating over the train_docs constantly
|
||||||
gold_train_data = _compile_gold(train_docs, pipeline)
|
gold_train_data = _compile_gold(train_docs, pipeline, nlp)
|
||||||
gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
|
gold_train_unpreprocessed_data = _compile_gold(
|
||||||
gold_dev_data = _compile_gold(dev_docs, pipeline)
|
train_docs_unpreprocessed, pipeline, nlp
|
||||||
|
)
|
||||||
|
gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
|
||||||
|
|
||||||
train_texts = gold_train_data["texts"]
|
train_texts = gold_train_data["texts"]
|
||||||
dev_texts = gold_dev_data["texts"]
|
dev_texts = gold_dev_data["texts"]
|
||||||
|
@ -182,6 +184,21 @@ def debug_data(
|
||||||
nlp.vocab.vectors_length,
|
nlp.vocab.vectors_length,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||||
|
msg.warn(
|
||||||
|
"{} words in training data without vectors ({:0.2f}%)".format(
|
||||||
|
n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
msg.text(
|
||||||
|
"10 most common words without vectors: {}".format(
|
||||||
|
_format_labels(
|
||||||
|
gold_train_data["words_missing_vectors"].most_common(10),
|
||||||
|
counts=True,
|
||||||
|
)
|
||||||
|
),
|
||||||
|
show=verbose,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the model")
|
msg.info("No word vectors present in the model")
|
||||||
|
|
||||||
|
@ -562,7 +579,7 @@ def _load_file(file_path, msg):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _compile_gold(train_docs, pipeline):
|
def _compile_gold(train_docs, pipeline, nlp):
|
||||||
data = {
|
data = {
|
||||||
"ner": Counter(),
|
"ner": Counter(),
|
||||||
"cats": Counter(),
|
"cats": Counter(),
|
||||||
|
@ -574,6 +591,7 @@ def _compile_gold(train_docs, pipeline):
|
||||||
"punct_ents": 0,
|
"punct_ents": 0,
|
||||||
"n_words": 0,
|
"n_words": 0,
|
||||||
"n_misaligned_words": 0,
|
"n_misaligned_words": 0,
|
||||||
|
"words_missing_vectors": Counter(),
|
||||||
"n_sents": 0,
|
"n_sents": 0,
|
||||||
"n_nonproj": 0,
|
"n_nonproj": 0,
|
||||||
"n_cycles": 0,
|
"n_cycles": 0,
|
||||||
|
@ -586,6 +604,10 @@ def _compile_gold(train_docs, pipeline):
|
||||||
data["n_words"] += len(valid_words)
|
data["n_words"] += len(valid_words)
|
||||||
data["n_misaligned_words"] += len(gold.words) - len(valid_words)
|
data["n_misaligned_words"] += len(gold.words) - len(valid_words)
|
||||||
data["texts"].add(doc.text)
|
data["texts"].add(doc.text)
|
||||||
|
if len(nlp.vocab.vectors):
|
||||||
|
for word in valid_words:
|
||||||
|
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
||||||
|
data["words_missing_vectors"].update([word])
|
||||||
if "ner" in pipeline:
|
if "ner" in pipeline:
|
||||||
for i, label in enumerate(gold.ner):
|
for i, label in enumerate(gold.ner):
|
||||||
if label is None:
|
if label is None:
|
||||||
|
@ -636,7 +658,11 @@ def _format_labels(labels, counts=False):
|
||||||
def _get_examples_without_label(data, label):
|
def _get_examples_without_label(data, label):
|
||||||
count = 0
|
count = 0
|
||||||
for doc, gold in data:
|
for doc, gold in data:
|
||||||
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
|
labels = [
|
||||||
|
label.split("-")[1]
|
||||||
|
for label in gold.ner
|
||||||
|
if label is not None and label not in ("O", "-")
|
||||||
|
]
|
||||||
if label not in labels:
|
if label not in labels:
|
||||||
count += 1
|
count += 1
|
||||||
return count
|
return count
|
||||||
|
|
|
@ -43,6 +43,9 @@ def evaluate(
|
||||||
if displacy_path and not displacy_path.exists():
|
if displacy_path and not displacy_path.exists():
|
||||||
msg.fail("Visualization output directory not found", displacy_path, exits=1)
|
msg.fail("Visualization output directory not found", displacy_path, exits=1)
|
||||||
corpus = GoldCorpus(data_path, data_path)
|
corpus = GoldCorpus(data_path, data_path)
|
||||||
|
if model.startswith("blank:"):
|
||||||
|
nlp = util.get_lang_class(model.replace("blank:", ""))()
|
||||||
|
else:
|
||||||
nlp = util.load_model(model)
|
nlp = util.load_model(model)
|
||||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||||
begin = timer()
|
begin = timer()
|
||||||
|
|
|
@ -12,11 +12,14 @@ import tarfile
|
||||||
import gzip
|
import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
import srsly
|
import srsly
|
||||||
|
import warnings
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings
|
||||||
from ..util import ensure_path, get_lang_class
|
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||||
|
from ..lookups import Lookups
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ftfy
|
import ftfy
|
||||||
|
@ -34,6 +37,12 @@ DEFAULT_OOV_PROB = -20
|
||||||
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
||||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
||||||
|
truncate_vectors=(
|
||||||
|
"Optional number of vectors to truncate to when reading in vectors file",
|
||||||
|
"option",
|
||||||
|
"t",
|
||||||
|
int,
|
||||||
|
),
|
||||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||||
vectors_name=(
|
vectors_name=(
|
||||||
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
||||||
|
@ -42,6 +51,8 @@ DEFAULT_OOV_PROB = -20
|
||||||
str,
|
str,
|
||||||
),
|
),
|
||||||
model_name=("Optional name for the model meta", "option", "mn", str),
|
model_name=("Optional name for the model meta", "option", "mn", str),
|
||||||
|
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
||||||
|
base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
|
||||||
)
|
)
|
||||||
def init_model(
|
def init_model(
|
||||||
lang,
|
lang,
|
||||||
|
@ -50,9 +61,12 @@ def init_model(
|
||||||
clusters_loc=None,
|
clusters_loc=None,
|
||||||
jsonl_loc=None,
|
jsonl_loc=None,
|
||||||
vectors_loc=None,
|
vectors_loc=None,
|
||||||
|
truncate_vectors=0,
|
||||||
prune_vectors=-1,
|
prune_vectors=-1,
|
||||||
vectors_name=None,
|
vectors_name=None,
|
||||||
model_name=None,
|
model_name=None,
|
||||||
|
omit_extra_lookups=False,
|
||||||
|
base_model=None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create a new model from raw data, like word frequencies, Brown clusters
|
Create a new model from raw data, like word frequencies, Brown clusters
|
||||||
|
@ -84,10 +98,19 @@ def init_model(
|
||||||
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
||||||
|
|
||||||
with msg.loading("Creating model..."):
|
with msg.loading("Creating model..."):
|
||||||
nlp = create_model(lang, lex_attrs, name=model_name)
|
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
||||||
|
|
||||||
|
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
||||||
|
# isn't loaded if these features are accessed
|
||||||
|
if omit_extra_lookups:
|
||||||
|
nlp.vocab.lookups_extra = Lookups()
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_prob")
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_settings")
|
||||||
|
|
||||||
msg.good("Successfully created model")
|
msg.good("Successfully created model")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
|
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
|
||||||
vec_added = len(nlp.vocab.vectors)
|
vec_added = len(nlp.vocab.vectors)
|
||||||
lex_added = len(nlp.vocab)
|
lex_added = len(nlp.vocab)
|
||||||
msg.good(
|
msg.good(
|
||||||
|
@ -144,20 +167,23 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||||
return lex_attrs
|
return lex_attrs
|
||||||
|
|
||||||
|
|
||||||
def create_model(lang, lex_attrs, name=None):
|
def create_model(lang, lex_attrs, name=None, base_model=None):
|
||||||
|
if base_model:
|
||||||
|
nlp = load_model(base_model)
|
||||||
|
# keep the tokenizer but remove any existing pipeline components due to
|
||||||
|
# potentially conflicting vectors
|
||||||
|
for pipe in nlp.pipe_names:
|
||||||
|
nlp.remove_pipe(pipe)
|
||||||
|
else:
|
||||||
lang_class = get_lang_class(lang)
|
lang_class = get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
for lexeme in nlp.vocab:
|
for lexeme in nlp.vocab:
|
||||||
lexeme.rank = 0
|
lexeme.rank = OOV_RANK
|
||||||
lex_added = 0
|
|
||||||
for attrs in lex_attrs:
|
for attrs in lex_attrs:
|
||||||
if "settings" in attrs:
|
if "settings" in attrs:
|
||||||
continue
|
continue
|
||||||
lexeme = nlp.vocab[attrs["orth"]]
|
lexeme = nlp.vocab[attrs["orth"]]
|
||||||
lexeme.set_attrs(**attrs)
|
lexeme.set_attrs(**attrs)
|
||||||
lexeme.is_oov = False
|
|
||||||
lex_added += 1
|
|
||||||
lex_added += 1
|
|
||||||
if len(nlp.vocab):
|
if len(nlp.vocab):
|
||||||
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
|
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
|
||||||
else:
|
else:
|
||||||
|
@ -168,25 +194,24 @@ def create_model(lang, lex_attrs, name=None):
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||||
for lex in nlp.vocab:
|
for lex in nlp.vocab:
|
||||||
if lex.rank:
|
if lex.rank and lex.rank != OOV_RANK:
|
||||||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||||
else:
|
else:
|
||||||
if vectors_loc:
|
if vectors_loc:
|
||||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
|
||||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = (None, None)
|
vectors_data, vector_keys = (None, None)
|
||||||
if vector_keys is not None:
|
if vector_keys is not None:
|
||||||
for word in vector_keys:
|
for word in vector_keys:
|
||||||
if word not in nlp.vocab:
|
if word not in nlp.vocab:
|
||||||
lexeme = nlp.vocab[word]
|
nlp.vocab[word]
|
||||||
lexeme.is_oov = False
|
|
||||||
if vectors_data is not None:
|
if vectors_data is not None:
|
||||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||||
if name is None:
|
if name is None:
|
||||||
|
@ -198,9 +223,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
||||||
nlp.vocab.prune_vectors(prune_vectors)
|
nlp.vocab.prune_vectors(prune_vectors)
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc):
|
def read_vectors(vectors_loc, truncate_vectors=0):
|
||||||
f = open_file(vectors_loc)
|
f = open_file(vectors_loc)
|
||||||
shape = tuple(int(size) for size in next(f).split())
|
shape = tuple(int(size) for size in next(f).split())
|
||||||
|
if truncate_vectors >= 1:
|
||||||
|
shape = (truncate_vectors, shape[1])
|
||||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||||
vectors_keys = []
|
vectors_keys = []
|
||||||
for i, line in enumerate(tqdm(f)):
|
for i, line in enumerate(tqdm(f)):
|
||||||
|
@ -211,6 +238,8 @@ def read_vectors(vectors_loc):
|
||||||
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||||
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||||
vectors_keys.append(word)
|
vectors_keys.append(word)
|
||||||
|
if i == truncate_vectors - 1:
|
||||||
|
break
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
|
|
||||||
|
|
||||||
|
@ -246,7 +275,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
warnings.warn(Warnings.W004)
|
||||||
with clusters_loc.open() as f:
|
with clusters_loc.open() as f:
|
||||||
for line in tqdm(f):
|
for line in tqdm(f):
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -15,9 +15,9 @@ import random
|
||||||
|
|
||||||
from .._ml import create_default_optimizer
|
from .._ml import create_default_optimizer
|
||||||
from ..util import use_gpu as set_gpu
|
from ..util import use_gpu as set_gpu
|
||||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
from ..compat import path2str
|
from ..compat import path2str
|
||||||
|
from ..lookups import Lookups
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
@ -58,6 +58,7 @@ from .. import about
|
||||||
textcat_arch=("Textcat model architecture", "option", "ta", str),
|
textcat_arch=("Textcat model architecture", "option", "ta", str),
|
||||||
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
|
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
|
||||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||||
|
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
||||||
verbose=("Display more information for debug", "flag", "VV", bool),
|
verbose=("Display more information for debug", "flag", "VV", bool),
|
||||||
debug=("Run data diagnostics before training", "flag", "D", bool),
|
debug=("Run data diagnostics before training", "flag", "D", bool),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
@ -97,6 +98,7 @@ def train(
|
||||||
textcat_arch="bow",
|
textcat_arch="bow",
|
||||||
textcat_positive_label=None,
|
textcat_positive_label=None,
|
||||||
tag_map_path=None,
|
tag_map_path=None,
|
||||||
|
omit_extra_lookups=False,
|
||||||
verbose=False,
|
verbose=False,
|
||||||
debug=False,
|
debug=False,
|
||||||
):
|
):
|
||||||
|
@ -225,7 +227,9 @@ def train(
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
msg.text("Extending component from base model '{}'".format(pipe))
|
msg.text("Extending component from base model '{}'".format(pipe))
|
||||||
disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
|
disabled_pipes = nlp.disable_pipes(
|
||||||
|
[p for p in nlp.pipe_names if p not in pipeline]
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
msg.text("Starting with blank model '{}'".format(lang))
|
msg.text("Starting with blank model '{}'".format(lang))
|
||||||
lang_cls = util.get_lang_class(lang)
|
lang_cls = util.get_lang_class(lang)
|
||||||
|
@ -246,6 +250,14 @@ def train(
|
||||||
# Update tag map with provided mapping
|
# Update tag map with provided mapping
|
||||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||||
|
|
||||||
|
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
||||||
|
# isn't loaded if these features are accessed
|
||||||
|
if omit_extra_lookups:
|
||||||
|
nlp.vocab.lookups_extra = Lookups()
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_prob")
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_settings")
|
||||||
|
|
||||||
if vectors:
|
if vectors:
|
||||||
msg.text("Loading vector from model '{}'".format(vectors))
|
msg.text("Loading vector from model '{}'".format(vectors))
|
||||||
_load_vectors(nlp, vectors)
|
_load_vectors(nlp, vectors)
|
||||||
|
@ -361,7 +373,7 @@ def train(
|
||||||
if len(textcat_labels) == 2:
|
if len(textcat_labels) == 2:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"If the textcat component is a binary classifier with "
|
"If the textcat component is a binary classifier with "
|
||||||
"exclusive classes, provide '--textcat_positive_label' for "
|
"exclusive classes, provide '--textcat-positive-label' for "
|
||||||
"an evaluation on the positive class."
|
"an evaluation on the positive class."
|
||||||
)
|
)
|
||||||
msg.text(
|
msg.text(
|
||||||
|
@ -415,10 +427,10 @@ def train(
|
||||||
losses=losses,
|
losses=losses,
|
||||||
)
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.warn("Error during training")
|
err = "Error during training"
|
||||||
if init_tok2vec:
|
if init_tok2vec:
|
||||||
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
|
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
|
||||||
msg.fail("Original error message: {}".format(e), exits=1)
|
msg.fail(err, "Original error message: {}".format(e), exits=1)
|
||||||
if raw_text:
|
if raw_text:
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
# If raw text is available, perform 'rehearsal' updates,
|
||||||
# which use unlabelled data to reduce overfitting.
|
# which use unlabelled data to reduce overfitting.
|
||||||
|
@ -452,6 +464,9 @@ def train(
|
||||||
cpu_wps = nwords / (end_time - start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
else:
|
else:
|
||||||
gpu_wps = nwords / (end_time - start_time)
|
gpu_wps = nwords / (end_time - start_time)
|
||||||
|
# Only evaluate on CPU in the first iteration (for
|
||||||
|
# timing) if GPU is enabled
|
||||||
|
if i == 0:
|
||||||
with Model.use_device("cpu"):
|
with Model.use_device("cpu"):
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
for name, component in nlp_loaded.pipeline:
|
for name, component in nlp_loaded.pipeline:
|
||||||
|
@ -546,7 +561,11 @@ def train(
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e))
|
msg.warn(
|
||||||
|
"Aborting and saving the final best model. "
|
||||||
|
"Encountered exception: {}".format(e),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
best_pipes = nlp.pipe_names
|
best_pipes = nlp.pipe_names
|
||||||
if disabled_pipes:
|
if disabled_pipes:
|
||||||
|
@ -561,15 +580,25 @@ def train(
|
||||||
final_meta.setdefault("speed", {})
|
final_meta.setdefault("speed", {})
|
||||||
final_meta["speed"].setdefault("cpu", None)
|
final_meta["speed"].setdefault("cpu", None)
|
||||||
final_meta["speed"].setdefault("gpu", None)
|
final_meta["speed"].setdefault("gpu", None)
|
||||||
|
meta.setdefault("speed", {})
|
||||||
|
meta["speed"].setdefault("cpu", None)
|
||||||
|
meta["speed"].setdefault("gpu", None)
|
||||||
# combine cpu and gpu speeds with the base model speeds
|
# combine cpu and gpu speeds with the base model speeds
|
||||||
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
||||||
speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
|
speed = _get_total_speed(
|
||||||
|
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
|
||||||
|
)
|
||||||
final_meta["speed"]["cpu"] = speed
|
final_meta["speed"]["cpu"] = speed
|
||||||
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
||||||
speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
|
speed = _get_total_speed(
|
||||||
|
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
|
||||||
|
)
|
||||||
final_meta["speed"]["gpu"] = speed
|
final_meta["speed"]["gpu"] = speed
|
||||||
# if there were no speeds to update, overwrite with meta
|
# if there were no speeds to update, overwrite with meta
|
||||||
if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None:
|
if (
|
||||||
|
final_meta["speed"]["cpu"] is None
|
||||||
|
and final_meta["speed"]["gpu"] is None
|
||||||
|
):
|
||||||
final_meta["speed"].update(meta["speed"])
|
final_meta["speed"].update(meta["speed"])
|
||||||
# note: beam speeds are not combined with the base model
|
# note: beam speeds are not combined with the base model
|
||||||
if has_beam_widths:
|
if has_beam_widths:
|
||||||
|
@ -611,15 +640,6 @@ def _create_progress_bar(total):
|
||||||
|
|
||||||
def _load_vectors(nlp, vectors):
|
def _load_vectors(nlp, vectors):
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
for lex in nlp.vocab:
|
|
||||||
values = {}
|
|
||||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
|
||||||
# These attrs are expected to be set by data. Others should
|
|
||||||
# be set by calling the language functions.
|
|
||||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
|
||||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
|
||||||
lex.set_attrs(**values)
|
|
||||||
lex.is_oov = False
|
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
def _load_pretrained_tok2vec(nlp, loc):
|
||||||
|
@ -661,6 +681,8 @@ def _find_best(experiment_dir, component):
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||||
accs = srsly.read_json(epoch_model / "accuracy.json")
|
accs = srsly.read_json(epoch_model / "accuracy.json")
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||||
|
# remove per_type dicts from score list for max() comparison
|
||||||
|
scores = [score for score in scores if isinstance(score, float)]
|
||||||
accuracies.append((scores, epoch_model))
|
accuracies.append((scores, epoch_model))
|
||||||
if accuracies:
|
if accuracies:
|
||||||
return max(accuracies)[1]
|
return max(accuracies)[1]
|
||||||
|
|
|
@ -7,10 +7,12 @@ USAGE: https://spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .render import DependencyRenderer, EntityRenderer
|
from .render import DependencyRenderer, EntityRenderer
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..compat import b_to_str
|
from ..compat import b_to_str
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings
|
||||||
from ..util import is_in_jupyter
|
from ..util import is_in_jupyter
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,7 +91,7 @@ def serve(
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
|
|
||||||
if is_in_jupyter():
|
if is_in_jupyter():
|
||||||
user_warning(Warnings.W011)
|
warnings.warn(Warnings.W011)
|
||||||
|
|
||||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||||
httpd = simple_server.make_server(host, port, app)
|
httpd = simple_server.make_server(host, port, app)
|
||||||
|
@ -119,7 +121,7 @@ def parse_deps(orig_doc, options={}):
|
||||||
"""
|
"""
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
warnings.warn(Warnings.W005)
|
||||||
if options.get("collapse_phrases", False):
|
if options.get("collapse_phrases", False):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for np in list(doc.noun_chunks):
|
for np in list(doc.noun_chunks):
|
||||||
|
@ -146,9 +148,14 @@ def parse_deps(orig_doc, options={}):
|
||||||
retokenizer.merge(span, attrs=attrs)
|
retokenizer.merge(span, attrs=attrs)
|
||||||
fine_grained = options.get("fine_grained")
|
fine_grained = options.get("fine_grained")
|
||||||
add_lemma = options.get("add_lemma")
|
add_lemma = options.get("add_lemma")
|
||||||
words = [{"text": w.text,
|
words = [
|
||||||
|
{
|
||||||
|
"text": w.text,
|
||||||
"tag": w.tag_ if fine_grained else w.pos_,
|
"tag": w.tag_ if fine_grained else w.pos_,
|
||||||
"lemma": w.lemma_ if add_lemma else None} for w in doc]
|
"lemma": w.lemma_ if add_lemma else None,
|
||||||
|
}
|
||||||
|
for w in doc
|
||||||
|
]
|
||||||
|
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
|
@ -179,7 +186,7 @@ def parse_ents(doc, options={}):
|
||||||
for ent in doc.ents
|
for ent in doc.ents
|
||||||
]
|
]
|
||||||
if not ents:
|
if not ents:
|
||||||
user_warning(Warnings.W006)
|
warnings.warn(Warnings.W006)
|
||||||
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||||
settings = get_doc_settings(doc)
|
settings = get_doc_settings(doc)
|
||||||
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
|
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
|
||||||
|
|
|
@ -3,7 +3,13 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
|
from .templates import (
|
||||||
|
TPL_DEP_SVG,
|
||||||
|
TPL_DEP_WORDS,
|
||||||
|
TPL_DEP_WORDS_LEMMA,
|
||||||
|
TPL_DEP_ARCS,
|
||||||
|
TPL_ENTS,
|
||||||
|
)
|
||||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from ..util import minify_html, escape_html, registry
|
from ..util import minify_html, escape_html, registry
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
@ -83,7 +89,10 @@ class DependencyRenderer(object):
|
||||||
self.width = self.offset_x + len(words) * self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y + 3 * self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
|
words = [
|
||||||
|
self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
|
||||||
|
for i, w in enumerate(words)
|
||||||
|
]
|
||||||
arcs = [
|
arcs = [
|
||||||
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
for i, a in enumerate(arcs)
|
for i, a in enumerate(arcs)
|
||||||
|
@ -101,7 +110,9 @@ class DependencyRenderer(object):
|
||||||
lang=self.lang,
|
lang=self.lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, lemma, i,):
|
def render_word(
|
||||||
|
self, text, tag, lemma, i,
|
||||||
|
):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
|
||||||
text (unicode): Word text.
|
text (unicode): Word text.
|
||||||
|
@ -115,7 +126,9 @@ class DependencyRenderer(object):
|
||||||
x = self.width - x
|
x = self.width - x
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
if lemma is not None:
|
if lemma is not None:
|
||||||
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
|
return TPL_DEP_WORDS_LEMMA.format(
|
||||||
|
text=html_text, tag=tag, lemma=lemma, x=x, y=y
|
||||||
|
)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
|
|
|
@ -1,17 +1,16 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import os
|
|
||||||
import warnings
|
|
||||||
import inspect
|
|
||||||
|
|
||||||
|
|
||||||
def add_codes(err_cls):
|
def add_codes(err_cls):
|
||||||
"""Add error codes to string messages via class attribute names."""
|
"""Add error codes to string messages via class attribute names."""
|
||||||
|
|
||||||
class ErrorsWithCodes(object):
|
class ErrorsWithCodes(err_cls):
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = getattr(err_cls, code)
|
msg = super().__getattribute__(code)
|
||||||
|
if code.startswith("__"): # python system attributes like __class__
|
||||||
|
return msg
|
||||||
|
else:
|
||||||
return "[{code}] {msg}".format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
|
||||||
return ErrorsWithCodes()
|
return ErrorsWithCodes()
|
||||||
|
@ -93,8 +92,7 @@ class Warnings(object):
|
||||||
W022 = ("Training a new part-of-speech tagger using a model with no "
|
W022 = ("Training a new part-of-speech tagger using a model with no "
|
||||||
"lemmatization rules or data. This means that the trained model "
|
"lemmatization rules or data. This means that the trained model "
|
||||||
"may not be able to lemmatize correctly. If this is intentional "
|
"may not be able to lemmatize correctly. If this is intentional "
|
||||||
"or the language you're using doesn't have lemmatization data, "
|
"or the language you're using doesn't have lemmatization data. "
|
||||||
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
|
|
||||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||||
"package installed.")
|
"package installed.")
|
||||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
||||||
|
@ -110,7 +108,13 @@ class Warnings(object):
|
||||||
W028 = ("Doc.from_array was called with a vector of type '{type}', "
|
W028 = ("Doc.from_array was called with a vector of type '{type}', "
|
||||||
"but is expecting one of type 'uint64' instead. This may result "
|
"but is expecting one of type 'uint64' instead. This may result "
|
||||||
"in problems with the vocab further on in the pipeline.")
|
"in problems with the vocab further on in the pipeline.")
|
||||||
|
W029 = ("Unable to align tokens with entities from character offsets. "
|
||||||
|
"Discarding entity annotation for the text: {text}.")
|
||||||
|
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||||
|
"entities \"{entities}\". Use "
|
||||||
|
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||||
|
" to check the alignment. Misaligned entities ('-') will be "
|
||||||
|
"ignored during training.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -552,6 +556,17 @@ class Errors(object):
|
||||||
"array.")
|
"array.")
|
||||||
E191 = ("Invalid head: the head token must be from the same doc as the "
|
E191 = ("Invalid head: the head token must be from the same doc as the "
|
||||||
"token itself.")
|
"token itself.")
|
||||||
|
E192 = ("Unable to resize vectors in place with cupy.")
|
||||||
|
E193 = ("Unable to resize vectors in place if the resized vector dimension "
|
||||||
|
"({new_dim}) is not the same as the current vector dimension "
|
||||||
|
"({curr_dim}).")
|
||||||
|
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
|
||||||
|
E195 = ("Matcher can be called on {good} only, got {got}.")
|
||||||
|
E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
|
||||||
|
"only be fixed with token.is_sent_start.")
|
||||||
|
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
|
||||||
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
|
"table, which contains {n_rows} vectors.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -586,64 +601,3 @@ class MatchPatternError(ValueError):
|
||||||
|
|
||||||
class AlignmentError(ValueError):
|
class AlignmentError(ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ModelsWarning(UserWarning):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
WARNINGS = {
|
|
||||||
"user": UserWarning,
|
|
||||||
"deprecation": DeprecationWarning,
|
|
||||||
"models": ModelsWarning,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_types(arg):
|
|
||||||
if arg == "": # don't show any warnings
|
|
||||||
return []
|
|
||||||
if not arg or arg == "all": # show all available warnings
|
|
||||||
return WARNINGS.keys()
|
|
||||||
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_excl(arg):
|
|
||||||
if not arg:
|
|
||||||
return []
|
|
||||||
return [w_id.strip() for w_id in arg.split(",")]
|
|
||||||
|
|
||||||
|
|
||||||
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
|
|
||||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
|
|
||||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
|
||||||
|
|
||||||
|
|
||||||
def user_warning(message):
|
|
||||||
_warn(message, "user")
|
|
||||||
|
|
||||||
|
|
||||||
def deprecation_warning(message):
|
|
||||||
_warn(message, "deprecation")
|
|
||||||
|
|
||||||
|
|
||||||
def models_warning(message):
|
|
||||||
_warn(message, "models")
|
|
||||||
|
|
||||||
|
|
||||||
def _warn(message, warn_type="user"):
|
|
||||||
"""
|
|
||||||
message (unicode): The message to display.
|
|
||||||
category (Warning): The Warning to show.
|
|
||||||
"""
|
|
||||||
if message.startswith("["):
|
|
||||||
w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
|
|
||||||
else:
|
|
||||||
w_id = None
|
|
||||||
ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE
|
|
||||||
if warn_type in SPACY_WARNING_TYPES and not ignore_warning:
|
|
||||||
category = WARNINGS[warn_type]
|
|
||||||
stack = inspect.stack()[-1]
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
if SPACY_WARNING_FILTER:
|
|
||||||
warnings.simplefilter(SPACY_WARNING_FILTER, category)
|
|
||||||
warnings.warn_explicit(message, category, stack[1], stack[2])
|
|
||||||
|
|
168
spacy/gold.pyx
168
spacy/gold.pyx
|
@ -10,10 +10,11 @@ import shutil
|
||||||
import itertools
|
import itertools
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc, Span
|
from .tokens import Doc, Span
|
||||||
from .errors import Errors, AlignmentError, user_warning, Warnings
|
from .errors import Errors, AlignmentError, Warnings
|
||||||
from .compat import path2str
|
from .compat import path2str
|
||||||
from . import util
|
from . import util
|
||||||
from .util import minibatch, itershuffle
|
from .util import minibatch, itershuffle
|
||||||
|
@ -21,7 +22,6 @@ from .util import minibatch, itershuffle
|
||||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||||
|
|
||||||
|
|
||||||
USE_NEW_ALIGN = False
|
|
||||||
punct_re = re.compile(r"\W")
|
punct_re = re.compile(r"\W")
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,57 +73,8 @@ def merge_sents(sents):
|
||||||
return [(m_deps, (m_cats, m_brackets))]
|
return [(m_deps, (m_cats, m_brackets))]
|
||||||
|
|
||||||
|
|
||||||
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_for_alignment(tokens):
|
def _normalize_for_alignment(tokens):
|
||||||
tokens = [w.replace(" ", "").lower() for w in tokens]
|
return [w.replace(" ", "").lower() for w in tokens]
|
||||||
output = []
|
|
||||||
for token in tokens:
|
|
||||||
token = token.replace(" ", "").lower()
|
|
||||||
for before, after in _ALIGNMENT_NORM_MAP:
|
|
||||||
token = token.replace(before, after)
|
|
||||||
output.append(token)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def _align_before_v2_2_2(tokens_a, tokens_b):
|
|
||||||
"""Calculate alignment tables between two tokenizations, using the Levenshtein
|
|
||||||
algorithm. The alignment is case-insensitive.
|
|
||||||
|
|
||||||
tokens_a (List[str]): The candidate tokenization.
|
|
||||||
tokens_b (List[str]): The reference tokenization.
|
|
||||||
RETURNS: (tuple): A 5-tuple consisting of the following information:
|
|
||||||
* cost (int): The number of misaligned tokens.
|
|
||||||
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
|
|
||||||
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
|
|
||||||
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
|
|
||||||
it has the value -1.
|
|
||||||
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
|
|
||||||
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
|
|
||||||
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
|
|
||||||
the same token of `tokens_b`.
|
|
||||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
|
||||||
direction.
|
|
||||||
"""
|
|
||||||
from . import _align
|
|
||||||
if tokens_a == tokens_b:
|
|
||||||
alignment = numpy.arange(len(tokens_a))
|
|
||||||
return 0, alignment, alignment, {}, {}
|
|
||||||
tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
|
|
||||||
tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
|
|
||||||
cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
|
|
||||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
|
|
||||||
[len(w) for w in tokens_b])
|
|
||||||
for i, j in list(i2j_multi.items()):
|
|
||||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
|
||||||
i2j[i] = j
|
|
||||||
i2j_multi.pop(i)
|
|
||||||
for j, i in list(j2i_multi.items()):
|
|
||||||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
|
||||||
j2i[j] = i
|
|
||||||
j2i_multi.pop(j)
|
|
||||||
return cost, i2j, j2i, i2j_multi, j2i_multi
|
|
||||||
|
|
||||||
|
|
||||||
def align(tokens_a, tokens_b):
|
def align(tokens_a, tokens_b):
|
||||||
|
@ -144,8 +95,6 @@ def align(tokens_a, tokens_b):
|
||||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
||||||
direction.
|
direction.
|
||||||
"""
|
"""
|
||||||
if not USE_NEW_ALIGN:
|
|
||||||
return _align_before_v2_2_2(tokens_a, tokens_b)
|
|
||||||
tokens_a = _normalize_for_alignment(tokens_a)
|
tokens_a = _normalize_for_alignment(tokens_a)
|
||||||
tokens_b = _normalize_for_alignment(tokens_b)
|
tokens_b = _normalize_for_alignment(tokens_b)
|
||||||
cost = 0
|
cost = 0
|
||||||
|
@ -382,6 +331,8 @@ class GoldCorpus(object):
|
||||||
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
if random.random() >= orth_variant_level:
|
if random.random() >= orth_variant_level:
|
||||||
return raw, paragraph_tuples
|
return raw, paragraph_tuples
|
||||||
|
raw_orig = str(raw)
|
||||||
|
lower = False
|
||||||
if random.random() >= 0.5:
|
if random.random() >= 0.5:
|
||||||
lower = True
|
lower = True
|
||||||
if raw is not None:
|
if raw is not None:
|
||||||
|
@ -442,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
ids, words, tags, heads, labels, ner = sent_tuples
|
ids, words, tags, heads, labels, ner = sent_tuples
|
||||||
for word in words:
|
for word in words:
|
||||||
match_found = False
|
match_found = False
|
||||||
|
# skip whitespace words
|
||||||
|
if word.isspace():
|
||||||
|
match_found = True
|
||||||
# add identical word
|
# add identical word
|
||||||
if word not in variants and raw[raw_idx:].startswith(word):
|
elif word not in variants and raw[raw_idx:].startswith(word):
|
||||||
variant_raw += word
|
variant_raw += word
|
||||||
raw_idx += len(word)
|
raw_idx += len(word)
|
||||||
match_found = True
|
match_found = True
|
||||||
|
@ -458,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
# something went wrong, abort
|
# something went wrong, abort
|
||||||
# (add a warning message?)
|
# (add a warning message?)
|
||||||
if not match_found:
|
if not match_found:
|
||||||
return raw, paragraph_tuples
|
return raw_orig, paragraph_tuples
|
||||||
# add following whitespace
|
# add following whitespace
|
||||||
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
||||||
variant_raw += raw[raw_idx]
|
variant_raw += raw[raw_idx]
|
||||||
|
@ -560,7 +514,7 @@ def _json_iterate(loc):
|
||||||
py_raw = file_.read()
|
py_raw = file_.read()
|
||||||
cdef long file_length = len(py_raw)
|
cdef long file_length = len(py_raw)
|
||||||
if file_length > 2 ** 30:
|
if file_length > 2 ** 30:
|
||||||
user_warning(Warnings.W027.format(size=file_length))
|
warnings.warn(Warnings.W027.format(size=file_length))
|
||||||
|
|
||||||
raw = <char*>py_raw
|
raw = <char*>py_raw
|
||||||
cdef int square_depth = 0
|
cdef int square_depth = 0
|
||||||
|
@ -700,8 +654,19 @@ cdef class GoldParse:
|
||||||
# if self.lenght > 0, this is modified latter.
|
# if self.lenght > 0, this is modified latter.
|
||||||
self.orig_annot = []
|
self.orig_annot = []
|
||||||
|
|
||||||
|
# temporary doc for aligning entity annotation
|
||||||
|
entdoc = None
|
||||||
|
|
||||||
# avoid allocating memory if the doc does not contain any tokens
|
# avoid allocating memory if the doc does not contain any tokens
|
||||||
if self.length > 0:
|
if self.length == 0:
|
||||||
|
self.words = []
|
||||||
|
self.tags = []
|
||||||
|
self.heads = []
|
||||||
|
self.labels = []
|
||||||
|
self.ner = []
|
||||||
|
self.morphology = []
|
||||||
|
|
||||||
|
else:
|
||||||
if words is None:
|
if words is None:
|
||||||
words = [token.text for token in doc]
|
words = [token.text for token in doc]
|
||||||
if tags is None:
|
if tags is None:
|
||||||
|
@ -722,7 +687,25 @@ cdef class GoldParse:
|
||||||
entities = [(ent if ent is not None else "-") for ent in entities]
|
entities = [(ent if ent is not None else "-") for ent in entities]
|
||||||
if not isinstance(entities[0], basestring):
|
if not isinstance(entities[0], basestring):
|
||||||
# Assume we have entities specified by character offset.
|
# Assume we have entities specified by character offset.
|
||||||
entities = biluo_tags_from_offsets(doc, entities)
|
# Create a temporary Doc corresponding to provided words
|
||||||
|
# (to preserve gold tokenization) and text (to preserve
|
||||||
|
# character offsets).
|
||||||
|
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
|
||||||
|
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
|
||||||
|
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
|
||||||
|
# There may be some additional whitespace tokens in the
|
||||||
|
# temporary doc, so check that the annotations align with
|
||||||
|
# the provided words while building a list of BILUO labels.
|
||||||
|
entities = []
|
||||||
|
words_offset = 0
|
||||||
|
for i in range(len(entdoc_words)):
|
||||||
|
if words[i + words_offset] == entdoc_words[i]:
|
||||||
|
entities.append(entdoc_entities[i])
|
||||||
|
else:
|
||||||
|
words_offset -= 1
|
||||||
|
if len(entities) != len(words):
|
||||||
|
warnings.warn(Warnings.W029.format(text=doc.text))
|
||||||
|
entities = ["-" for _ in words]
|
||||||
|
|
||||||
# These are filled by the tagger/parser/entity recogniser
|
# These are filled by the tagger/parser/entity recogniser
|
||||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
|
@ -749,7 +732,8 @@ cdef class GoldParse:
|
||||||
# If we under-segment, we'll have one predicted word that covers a
|
# If we under-segment, we'll have one predicted word that covers a
|
||||||
# sequence of gold words.
|
# sequence of gold words.
|
||||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
# If we "mis-segment", we'll have a sequence of predicted words covering
|
||||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
# a sequence of gold words. That's many-to-many -- we don't do that
|
||||||
|
# except for NER spans where the start and end can be aligned.
|
||||||
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
||||||
|
|
||||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
||||||
|
@ -772,7 +756,6 @@ cdef class GoldParse:
|
||||||
self.tags[i] = tags[i2j_multi[i]]
|
self.tags[i] = tags[i2j_multi[i]]
|
||||||
self.morphology[i] = morphology[i2j_multi[i]]
|
self.morphology[i] = morphology[i2j_multi[i]]
|
||||||
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
||||||
is_first = i2j_multi[i] != i2j_multi.get(i-1)
|
|
||||||
# Set next word in multi-token span as head, until last
|
# Set next word in multi-token span as head, until last
|
||||||
if not is_last:
|
if not is_last:
|
||||||
self.heads[i] = i+1
|
self.heads[i] = i+1
|
||||||
|
@ -782,29 +765,9 @@ cdef class GoldParse:
|
||||||
if head_i:
|
if head_i:
|
||||||
self.heads[i] = self.gold_to_cand[head_i]
|
self.heads[i] = self.gold_to_cand[head_i]
|
||||||
self.labels[i] = deps[i2j_multi[i]]
|
self.labels[i] = deps[i2j_multi[i]]
|
||||||
# Now set NER...This is annoying because if we've split
|
|
||||||
# got an entity word split into two, we need to adjust the
|
|
||||||
# BILUO tags. We can't have BB or LL etc.
|
|
||||||
# Case 1: O -- easy.
|
|
||||||
ner_tag = entities[i2j_multi[i]]
|
ner_tag = entities[i2j_multi[i]]
|
||||||
if ner_tag == "O":
|
# Assign O/- for many-to-one O/- NER tags
|
||||||
self.ner[i] = "O"
|
if ner_tag in ("O", "-"):
|
||||||
# Case 2: U. This has to become a B I* L sequence.
|
|
||||||
elif ner_tag.startswith("U-"):
|
|
||||||
if is_first:
|
|
||||||
self.ner[i] = ner_tag.replace("U-", "B-", 1)
|
|
||||||
elif is_last:
|
|
||||||
self.ner[i] = ner_tag.replace("U-", "L-", 1)
|
|
||||||
else:
|
|
||||||
self.ner[i] = ner_tag.replace("U-", "I-", 1)
|
|
||||||
# Case 3: L. If not last, change to I.
|
|
||||||
elif ner_tag.startswith("L-"):
|
|
||||||
if is_last:
|
|
||||||
self.ner[i] = ner_tag
|
|
||||||
else:
|
|
||||||
self.ner[i] = ner_tag.replace("L-", "I-", 1)
|
|
||||||
# Case 4: I. Stays correct
|
|
||||||
elif ner_tag.startswith("I-"):
|
|
||||||
self.ner[i] = ner_tag
|
self.ner[i] = ner_tag
|
||||||
else:
|
else:
|
||||||
self.words[i] = words[gold_i]
|
self.words[i] = words[gold_i]
|
||||||
|
@ -816,6 +779,39 @@ cdef class GoldParse:
|
||||||
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
||||||
self.labels[i] = deps[gold_i]
|
self.labels[i] = deps[gold_i]
|
||||||
self.ner[i] = entities[gold_i]
|
self.ner[i] = entities[gold_i]
|
||||||
|
# Assign O/- for one-to-many O/- NER tags
|
||||||
|
for j, cand_j in enumerate(self.gold_to_cand):
|
||||||
|
if cand_j is None:
|
||||||
|
if j in j2i_multi:
|
||||||
|
i = j2i_multi[j]
|
||||||
|
ner_tag = entities[j]
|
||||||
|
if ner_tag in ("O", "-"):
|
||||||
|
self.ner[i] = ner_tag
|
||||||
|
|
||||||
|
# If there is entity annotation and some tokens remain unaligned,
|
||||||
|
# align all entities at the character level to account for all
|
||||||
|
# possible token misalignments within the entity spans
|
||||||
|
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
|
||||||
|
# If the temporary entdoc wasn't created above, initialize it
|
||||||
|
if not entdoc:
|
||||||
|
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
|
||||||
|
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
|
||||||
|
# Get offsets based on gold words and BILUO entities
|
||||||
|
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
|
||||||
|
aligned_offsets = []
|
||||||
|
aligned_spans = []
|
||||||
|
# Filter offsets to identify those that align with doc tokens
|
||||||
|
for offset in entdoc_offsets:
|
||||||
|
span = doc.char_span(offset[0], offset[1])
|
||||||
|
if span and not span.text.isspace():
|
||||||
|
aligned_offsets.append(offset)
|
||||||
|
aligned_spans.append(span)
|
||||||
|
# Convert back to BILUO for doc tokens and assign NER for all
|
||||||
|
# aligned spans
|
||||||
|
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
|
||||||
|
for span in aligned_spans:
|
||||||
|
for i in range(span.start, span.end):
|
||||||
|
self.ner[i] = biluo_tags[i]
|
||||||
|
|
||||||
# Prevent whitespace that isn't within entities from being tagged as
|
# Prevent whitespace that isn't within entities from being tagged as
|
||||||
# an entity.
|
# an entity.
|
||||||
|
@ -961,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
biluo[token.i] = missing
|
biluo[token.i] = missing
|
||||||
|
if "-" in biluo:
|
||||||
|
ent_str = str(entities)
|
||||||
|
warnings.warn(Warnings.W030.format(
|
||||||
|
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
|
||||||
|
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
|
||||||
|
))
|
||||||
return biluo
|
return biluo
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
from libc.stdio cimport FILE
|
from libc.stdio cimport FILE
|
||||||
|
|
||||||
from spacy.vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
from .structs cimport KBEntryC, AliasC
|
from .structs cimport KBEntryC, AliasC
|
||||||
|
@ -169,4 +169,3 @@ cdef class Reader:
|
||||||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||||
|
|
||||||
cdef int _read(self, void* value, size_t size) except -1
|
cdef int _read(self, void* value, size_t size) except -1
|
||||||
|
|
||||||
|
|
23
spacy/kb.pyx
23
spacy/kb.pyx
|
@ -1,21 +1,20 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from spacy.errors import Errors, Warnings, user_warning
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
from cpython.exc cimport PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_SetFromErrno
|
||||||
|
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
from os import path
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
from os import path
|
from .errors import Errors, Warnings
|
||||||
from libcpp.vector cimport vector
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
|
@ -115,7 +114,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
# Return if this entity was added before
|
# Return if this entity was added before
|
||||||
if entity_hash in self._entry_index:
|
if entity_hash in self._entry_index:
|
||||||
user_warning(Warnings.W018.format(entity=entity))
|
warnings.warn(Warnings.W018.format(entity=entity))
|
||||||
return
|
return
|
||||||
|
|
||||||
# Raise an error if the provided entity vector is not of the correct length
|
# Raise an error if the provided entity vector is not of the correct length
|
||||||
|
@ -147,7 +146,7 @@ cdef class KnowledgeBase:
|
||||||
# only process this entity if its unique ID hadn't been added before
|
# only process this entity if its unique ID hadn't been added before
|
||||||
entity_hash = self.vocab.strings.add(entity_list[i])
|
entity_hash = self.vocab.strings.add(entity_list[i])
|
||||||
if entity_hash in self._entry_index:
|
if entity_hash in self._entry_index:
|
||||||
user_warning(Warnings.W018.format(entity=entity_list[i]))
|
warnings.warn(Warnings.W018.format(entity=entity_list[i]))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
entity_vector = vector_list[i]
|
entity_vector = vector_list[i]
|
||||||
|
@ -195,7 +194,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
# Check whether this alias was added before
|
# Check whether this alias was added before
|
||||||
if alias_hash in self._alias_index:
|
if alias_hash in self._alias_index:
|
||||||
user_warning(Warnings.W017.format(alias=alias))
|
warnings.warn(Warnings.W017.format(alias=alias))
|
||||||
return
|
return
|
||||||
|
|
||||||
cdef vector[int64_t] entry_indices
|
cdef vector[int64_t] entry_indices
|
||||||
|
@ -252,7 +251,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
if is_present:
|
if is_present:
|
||||||
if not ignore_warnings:
|
if not ignore_warnings:
|
||||||
user_warning(Warnings.W024.format(entity=entity, alias=alias))
|
warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
|
||||||
else:
|
else:
|
||||||
entry_indices.push_back(int(entry_index))
|
entry_indices.push_back(int(entry_index))
|
||||||
alias_entry.entry_indices = entry_indices
|
alias_entry.entry_indices = entry_indices
|
||||||
|
@ -584,5 +583,3 @@ cdef class Reader:
|
||||||
cdef int _read(self, void* value, size_t size) except -1:
|
cdef int _read(self, void* value, size_t size) except -1:
|
||||||
status = fread(value, size, 1, self._fp)
|
status = fread(value, size, 1, self._fp)
|
||||||
return status
|
return status
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -10,19 +9,15 @@ from .morph_rules import MORPH_RULES
|
||||||
from ..tag_map import TAG_MAP
|
from ..tag_map import TAG_MAP
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class DanishDefaults(Language.Defaults):
|
class DanishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: "da"
|
lex_attr_getters[LANG] = lambda text: "da"
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
|
@ -9,10 +9,13 @@ Example sentences to test spaCy and its language models.
|
||||||
>>> docs = nlp.pipe(sentences)
|
>>> docs = nlp.pipe(sentences)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
|
"Apple overvejer at købe et britisk startup for 1 milliard dollar.",
|
||||||
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
|
"Selvkørende biler flytter forsikringsansvaret over på producenterne.",
|
||||||
"San Francisco overvejer at forbyde udbringningsrobotter på fortov",
|
"San Francisco overvejer at forbyde udbringningsrobotter på fortovet.",
|
||||||
"London er en stor by i Storbritannien",
|
"London er en storby i Storbritannien.",
|
||||||
|
"Hvor er du?",
|
||||||
|
"Hvem er Frankrings president?",
|
||||||
|
"Hvad er hovedstaden i USA?",
|
||||||
|
"Hvornår blev Barack Obama født?",
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,527 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
"""
|
|
||||||
Special-case rules for normalizing tokens to improve the model's predictions.
|
|
||||||
For example 'mysterium' vs 'mysterie' and similar.
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
# Sources:
|
|
||||||
# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
|
|
||||||
# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/
|
|
||||||
|
|
||||||
_exc = {
|
|
||||||
# Alternative spelling
|
|
||||||
"a-kraft-værk": "a-kraftværk", # 1
|
|
||||||
"ålborg": "aalborg", # 2
|
|
||||||
"århus": "aarhus",
|
|
||||||
"accessoirer": "accessoires", # 1
|
|
||||||
"affektert": "affekteret", # 1
|
|
||||||
"afrikander": "afrikaaner", # 1
|
|
||||||
"aftabuere": "aftabuisere", # 1
|
|
||||||
"aftabuering": "aftabuisering", # 1
|
|
||||||
"akvarium": "akvarie", # 1
|
|
||||||
"alenefader": "alenefar", # 1
|
|
||||||
"alenemoder": "alenemor", # 1
|
|
||||||
"alkoholambulatorium": "alkoholambulatorie", # 1
|
|
||||||
"ambulatorium": "ambulatorie", # 1
|
|
||||||
"ananassene": "ananasserne", # 2
|
|
||||||
"anførelsestegn": "anførselstegn", # 1
|
|
||||||
"anseelig": "anselig", # 2
|
|
||||||
"antioxydant": "antioxidant", # 1
|
|
||||||
"artrig": "artsrig", # 1
|
|
||||||
"auditorium": "auditorie", # 1
|
|
||||||
"avocado": "avokado", # 2
|
|
||||||
"bagerst": "bagest", # 2
|
|
||||||
"bagstræv": "bagstræb", # 1
|
|
||||||
"bagstræver": "bagstræber", # 1
|
|
||||||
"bagstræverisk": "bagstræberisk", # 1
|
|
||||||
"balde": "balle", # 2
|
|
||||||
"barselorlov": "barselsorlov", # 1
|
|
||||||
"barselvikar": "barselsvikar", # 1
|
|
||||||
"baskien": "baskerlandet", # 1
|
|
||||||
"bayrisk": "bayersk", # 1
|
|
||||||
"bedstefader": "bedstefar", # 1
|
|
||||||
"bedstemoder": "bedstemor", # 1
|
|
||||||
"behefte": "behæfte", # 1
|
|
||||||
"beheftelse": "behæftelse", # 1
|
|
||||||
"bidragydende": "bidragsydende", # 1
|
|
||||||
"bidragyder": "bidragsyder", # 1
|
|
||||||
"billiondel": "billiontedel", # 1
|
|
||||||
"blaseret": "blasert", # 1
|
|
||||||
"bleskifte": "bleskift", # 1
|
|
||||||
"blodbroder": "blodsbroder", # 2
|
|
||||||
"blyantspidser": "blyantsspidser", # 2
|
|
||||||
"boligministerium": "boligministerie", # 1
|
|
||||||
"borhul": "borehul", # 1
|
|
||||||
"broder": "bror", # 2
|
|
||||||
"buldog": "bulldog", # 2
|
|
||||||
"bådhus": "bådehus", # 1
|
|
||||||
"børnepleje": "barnepleje", # 1
|
|
||||||
"børneseng": "barneseng", # 1
|
|
||||||
"børnestol": "barnestol", # 1
|
|
||||||
"cairo": "kairo", # 1
|
|
||||||
"cambodia": "cambodja", # 1
|
|
||||||
"cambodianer": "cambodjaner", # 1
|
|
||||||
"cambodiansk": "cambodjansk", # 1
|
|
||||||
"camouflage": "kamuflage", # 2
|
|
||||||
"campylobacter": "kampylobakter", # 1
|
|
||||||
"centeret": "centret", # 2
|
|
||||||
"chefskahyt": "chefkahyt", # 1
|
|
||||||
"chefspost": "chefpost", # 1
|
|
||||||
"chefssekretær": "chefsekretær", # 1
|
|
||||||
"chefsstol": "chefstol", # 1
|
|
||||||
"cirkulærskrivelse": "cirkulæreskrivelse", # 1
|
|
||||||
"cognacsglas": "cognacglas", # 1
|
|
||||||
"columnist": "kolumnist", # 1
|
|
||||||
"cricket": "kricket", # 2
|
|
||||||
"dagplejemoder": "dagplejemor", # 1
|
|
||||||
"damaskesdug": "damaskdug", # 1
|
|
||||||
"damp-barn": "dampbarn", # 1
|
|
||||||
"delfinarium": "delfinarie", # 1
|
|
||||||
"dentallaboratorium": "dentallaboratorie", # 1
|
|
||||||
"diaramme": "diasramme", # 1
|
|
||||||
"diaré": "diarré", # 1
|
|
||||||
"dioxyd": "dioxid", # 1
|
|
||||||
"dommedagsprædiken": "dommedagspræken", # 1
|
|
||||||
"donut": "doughnut", # 2
|
|
||||||
"driftmæssig": "driftsmæssig", # 1
|
|
||||||
"driftsikker": "driftssikker", # 1
|
|
||||||
"driftsikring": "driftssikring", # 1
|
|
||||||
"drikkejogurt": "drikkeyoghurt", # 1
|
|
||||||
"drivein": "drive-in", # 1
|
|
||||||
"driveinbiograf": "drive-in-biograf", # 1
|
|
||||||
"drøvel": "drøbel", # 1
|
|
||||||
"dødskriterium": "dødskriterie", # 1
|
|
||||||
"e-mail-adresse": "e-mailadresse", # 1
|
|
||||||
"e-post-adresse": "e-postadresse", # 1
|
|
||||||
"egypten": "ægypten", # 2
|
|
||||||
"ekskommunicere": "ekskommunikere", # 1
|
|
||||||
"eksperimentarium": "eksperimentarie", # 1
|
|
||||||
"elsass": "Alsace", # 1
|
|
||||||
"elsasser": "alsacer", # 1
|
|
||||||
"elsassisk": "alsacisk", # 1
|
|
||||||
"elvetal": "ellevetal", # 1
|
|
||||||
"elvetiden": "ellevetiden", # 1
|
|
||||||
"elveårig": "elleveårig", # 1
|
|
||||||
"elveårs": "elleveårs", # 1
|
|
||||||
"elveårsbarn": "elleveårsbarn", # 1
|
|
||||||
"elvte": "ellevte", # 1
|
|
||||||
"elvtedel": "ellevtedel", # 1
|
|
||||||
"energiministerium": "energiministerie", # 1
|
|
||||||
"erhvervsministerium": "erhvervsministerie", # 1
|
|
||||||
"espaliere": "spaliere", # 2
|
|
||||||
"evangelium": "evangelie", # 1
|
|
||||||
"fagministerium": "fagministerie", # 1
|
|
||||||
"fakse": "faxe", # 1
|
|
||||||
"fangstkvota": "fangstkvote", # 1
|
|
||||||
"fader": "far", # 2
|
|
||||||
"farbroder": "farbror", # 1
|
|
||||||
"farfader": "farfar", # 1
|
|
||||||
"farmoder": "farmor", # 1
|
|
||||||
"federal": "føderal", # 1
|
|
||||||
"federalisering": "føderalisering", # 1
|
|
||||||
"federalisme": "føderalisme", # 1
|
|
||||||
"federalist": "føderalist", # 1
|
|
||||||
"federalistisk": "føderalistisk", # 1
|
|
||||||
"federation": "føderation", # 1
|
|
||||||
"federativ": "føderativ", # 1
|
|
||||||
"fejlbeheftet": "fejlbehæftet", # 1
|
|
||||||
"femetagers": "femetages", # 2
|
|
||||||
"femhundredekroneseddel": "femhundredkroneseddel", # 2
|
|
||||||
"filmpremiere": "filmpræmiere", # 2
|
|
||||||
"finansimperium": "finansimperie", # 1
|
|
||||||
"finansministerium": "finansministerie", # 1
|
|
||||||
"firehjulstræk": "firhjulstræk", # 2
|
|
||||||
"fjernstudium": "fjernstudie", # 1
|
|
||||||
"formalier": "formalia", # 1
|
|
||||||
"formandsskift": "formandsskifte", # 1
|
|
||||||
"fornemst": "fornemmest", # 2
|
|
||||||
"fornuftparti": "fornuftsparti", # 1
|
|
||||||
"fornuftstridig": "fornuftsstridig", # 1
|
|
||||||
"fornuftvæsen": "fornuftsvæsen", # 1
|
|
||||||
"fornuftægteskab": "fornuftsægteskab", # 1
|
|
||||||
"forretningsministerium": "forretningsministerie", # 1
|
|
||||||
"forskningsministerium": "forskningsministerie", # 1
|
|
||||||
"forstudium": "forstudie", # 1
|
|
||||||
"forsvarsministerium": "forsvarsministerie", # 1
|
|
||||||
"frilægge": "fritlægge", # 1
|
|
||||||
"frilæggelse": "fritlæggelse", # 1
|
|
||||||
"frilægning": "fritlægning", # 1
|
|
||||||
"fristille": "fritstille", # 1
|
|
||||||
"fristilling": "fritstilling", # 1
|
|
||||||
"fuldttegnet": "fuldtegnet", # 1
|
|
||||||
"fødestedskriterium": "fødestedskriterie", # 1
|
|
||||||
"fødevareministerium": "fødevareministerie", # 1
|
|
||||||
"følesløs": "følelsesløs", # 1
|
|
||||||
"følgeligt": "følgelig", # 1
|
|
||||||
"førne": "førn", # 1
|
|
||||||
"gearskift": "gearskifte", # 2
|
|
||||||
"gladeligt": "gladelig", # 1
|
|
||||||
"glosehefte": "glosehæfte", # 1
|
|
||||||
"glædeløs": "glædesløs", # 1
|
|
||||||
"gonoré": "gonorré", # 1
|
|
||||||
"grangiveligt": "grangivelig", # 1
|
|
||||||
"grundliggende": "grundlæggende", # 2
|
|
||||||
"grønsag": "grøntsag", # 2
|
|
||||||
"gudbenådet": "gudsbenådet", # 1
|
|
||||||
"gudfader": "gudfar", # 1
|
|
||||||
"gudmoder": "gudmor", # 1
|
|
||||||
"gulvmop": "gulvmoppe", # 1
|
|
||||||
"gymnasium": "gymnasie", # 1
|
|
||||||
"hackning": "hacking", # 1
|
|
||||||
"halvbroder": "halvbror", # 1
|
|
||||||
"halvelvetiden": "halvellevetiden", # 1
|
|
||||||
"handelsgymnasium": "handelsgymnasie", # 1
|
|
||||||
"hefte": "hæfte", # 1
|
|
||||||
"hefteklamme": "hæfteklamme", # 1
|
|
||||||
"heftelse": "hæftelse", # 1
|
|
||||||
"heftemaskine": "hæftemaskine", # 1
|
|
||||||
"heftepistol": "hæftepistol", # 1
|
|
||||||
"hefteplaster": "hæfteplaster", # 1
|
|
||||||
"heftestraf": "hæftestraf", # 1
|
|
||||||
"heftning": "hæftning", # 1
|
|
||||||
"helbroder": "helbror", # 1
|
|
||||||
"hjemmeklasse": "hjemklasse", # 1
|
|
||||||
"hjulspin": "hjulspind", # 1
|
|
||||||
"huggevåben": "hugvåben", # 1
|
|
||||||
"hulmurisolering": "hulmursisolering", # 1
|
|
||||||
"hurtiggående": "hurtigtgående", # 2
|
|
||||||
"hurtigttørrende": "hurtigtørrende", # 2
|
|
||||||
"husmoder": "husmor", # 1
|
|
||||||
"hydroxyd": "hydroxid", # 1
|
|
||||||
"håndmikser": "håndmixer", # 1
|
|
||||||
"højtaler": "højttaler", # 2
|
|
||||||
"hønemoder": "hønemor", # 1
|
|
||||||
"ide": "idé", # 2
|
|
||||||
"imperium": "imperie", # 1
|
|
||||||
"imponerthed": "imponerethed", # 1
|
|
||||||
"inbox": "indboks", # 2
|
|
||||||
"indenrigsministerium": "indenrigsministerie", # 1
|
|
||||||
"indhefte": "indhæfte", # 1
|
|
||||||
"indheftning": "indhæftning", # 1
|
|
||||||
"indicium": "indicie", # 1
|
|
||||||
"indkassere": "inkassere", # 2
|
|
||||||
"iota": "jota", # 1
|
|
||||||
"jobskift": "jobskifte", # 1
|
|
||||||
"jogurt": "yoghurt", # 1
|
|
||||||
"jukeboks": "jukebox", # 1
|
|
||||||
"justitsministerium": "justitsministerie", # 1
|
|
||||||
"kalorifere": "kalorifer", # 1
|
|
||||||
"kandidatstipendium": "kandidatstipendie", # 1
|
|
||||||
"kannevas": "kanvas", # 1
|
|
||||||
"kaperssauce": "kaperssovs", # 1
|
|
||||||
"kigge": "kikke", # 2
|
|
||||||
"kirkeministerium": "kirkeministerie", # 1
|
|
||||||
"klapmydse": "klapmyds", # 1
|
|
||||||
"klimakterium": "klimakterie", # 1
|
|
||||||
"klogeligt": "klogelig", # 1
|
|
||||||
"knivblad": "knivsblad", # 1
|
|
||||||
"kollegaer": "kolleger", # 2
|
|
||||||
"kollegium": "kollegie", # 1
|
|
||||||
"kollegiehefte": "kollegiehæfte", # 1
|
|
||||||
"kollokviumx": "kollokvium", # 1
|
|
||||||
"kommissorium": "kommissorie", # 1
|
|
||||||
"kompendium": "kompendie", # 1
|
|
||||||
"komplicerthed": "komplicerethed", # 1
|
|
||||||
"konfederation": "konføderation", # 1
|
|
||||||
"konfedereret": "konfødereret", # 1
|
|
||||||
"konferensstudium": "konferensstudie", # 1
|
|
||||||
"konservatorium": "konservatorie", # 1
|
|
||||||
"konsulere": "konsultere", # 1
|
|
||||||
"kradsbørstig": "krasbørstig", # 2
|
|
||||||
"kravsspecifikation": "kravspecifikation", # 1
|
|
||||||
"krematorium": "krematorie", # 1
|
|
||||||
"krep": "crepe", # 1
|
|
||||||
"krepnylon": "crepenylon", # 1
|
|
||||||
"kreppapir": "crepepapir", # 1
|
|
||||||
"kricket": "cricket", # 2
|
|
||||||
"kriterium": "kriterie", # 1
|
|
||||||
"kroat": "kroater", # 2
|
|
||||||
"kroki": "croquis", # 1
|
|
||||||
"kronprinsepar": "kronprinspar", # 2
|
|
||||||
"kropdoven": "kropsdoven", # 1
|
|
||||||
"kroplus": "kropslus", # 1
|
|
||||||
"krøllefedt": "krølfedt", # 1
|
|
||||||
"kulturministerium": "kulturministerie", # 1
|
|
||||||
"kuponhefte": "kuponhæfte", # 1
|
|
||||||
"kvota": "kvote", # 1
|
|
||||||
"kvotaordning": "kvoteordning", # 1
|
|
||||||
"laboratorium": "laboratorie", # 1
|
|
||||||
"laksfarve": "laksefarve", # 1
|
|
||||||
"laksfarvet": "laksefarvet", # 1
|
|
||||||
"laksrød": "lakserød", # 1
|
|
||||||
"laksyngel": "lakseyngel", # 1
|
|
||||||
"laksørred": "lakseørred", # 1
|
|
||||||
"landbrugsministerium": "landbrugsministerie", # 1
|
|
||||||
"landskampstemning": "landskampsstemning", # 1
|
|
||||||
"langust": "languster", # 1
|
|
||||||
"lappegrejer": "lappegrej", # 1
|
|
||||||
"lavløn": "lavtløn", # 1
|
|
||||||
"lillebroder": "lillebror", # 1
|
|
||||||
"linear": "lineær", # 1
|
|
||||||
"loftlampe": "loftslampe", # 2
|
|
||||||
"log-in": "login", # 1
|
|
||||||
"login": "log-in", # 2
|
|
||||||
"lovmedholdig": "lovmedholdelig", # 1
|
|
||||||
"ludder": "luder", # 2
|
|
||||||
"lysholder": "lyseholder", # 1
|
|
||||||
"lægeskifte": "lægeskift", # 1
|
|
||||||
"lærvillig": "lærevillig", # 1
|
|
||||||
"løgsauce": "løgsovs", # 1
|
|
||||||
"madmoder": "madmor", # 1
|
|
||||||
"majonæse": "mayonnaise", # 1
|
|
||||||
"mareridtagtig": "mareridtsagtig", # 1
|
|
||||||
"margen": "margin", # 2
|
|
||||||
"martyrium": "martyrie", # 1
|
|
||||||
"mellemstatlig": "mellemstatslig", # 1
|
|
||||||
"menneskene": "menneskerne", # 2
|
|
||||||
"metropolis": "metropol", # 1
|
|
||||||
"miks": "mix", # 1
|
|
||||||
"mikse": "mixe", # 1
|
|
||||||
"miksepult": "mixerpult", # 1
|
|
||||||
"mikser": "mixer", # 1
|
|
||||||
"mikserpult": "mixerpult", # 1
|
|
||||||
"mikslån": "mixlån", # 1
|
|
||||||
"miksning": "mixning", # 1
|
|
||||||
"miljøministerium": "miljøministerie", # 1
|
|
||||||
"milliarddel": "milliardtedel", # 1
|
|
||||||
"milliondel": "milliontedel", # 1
|
|
||||||
"ministerium": "ministerie", # 1
|
|
||||||
"mop": "moppe", # 1
|
|
||||||
"moder": "mor", # 2
|
|
||||||
"moratorium": "moratorie", # 1
|
|
||||||
"morbroder": "morbror", # 1
|
|
||||||
"morfader": "morfar", # 1
|
|
||||||
"mormoder": "mormor", # 1
|
|
||||||
"musikkonservatorium": "musikkonservatorie", # 1
|
|
||||||
"muslingskal": "muslingeskal", # 1
|
|
||||||
"mysterium": "mysterie", # 1
|
|
||||||
"naturalieydelse": "naturalydelse", # 1
|
|
||||||
"naturalieøkonomi": "naturaløkonomi", # 1
|
|
||||||
"navnebroder": "navnebror", # 1
|
|
||||||
"nerium": "nerie", # 1
|
|
||||||
"nådeløs": "nådesløs", # 1
|
|
||||||
"nærforestående": "nærtforestående", # 1
|
|
||||||
"nærstående": "nærtstående", # 1
|
|
||||||
"observatorium": "observatorie", # 1
|
|
||||||
"oldefader": "oldefar", # 1
|
|
||||||
"oldemoder": "oldemor", # 1
|
|
||||||
"opgraduere": "opgradere", # 1
|
|
||||||
"opgraduering": "opgradering", # 1
|
|
||||||
"oratorium": "oratorie", # 1
|
|
||||||
"overbookning": "overbooking", # 1
|
|
||||||
"overpræsidium": "overpræsidie", # 1
|
|
||||||
"overstatlig": "overstatslig", # 1
|
|
||||||
"oxyd": "oxid", # 1
|
|
||||||
"oxydere": "oxidere", # 1
|
|
||||||
"oxydering": "oxidering", # 1
|
|
||||||
"pakkenellike": "pakkenelliker", # 1
|
|
||||||
"papirtynd": "papirstynd", # 1
|
|
||||||
"pastoralseminarium": "pastoralseminarie", # 1
|
|
||||||
"peanutsene": "peanuttene", # 2
|
|
||||||
"penalhus": "pennalhus", # 2
|
|
||||||
"pensakrav": "pensumkrav", # 1
|
|
||||||
"pepperoni": "peperoni", # 1
|
|
||||||
"peruaner": "peruvianer", # 1
|
|
||||||
"petrole": "petrol", # 1
|
|
||||||
"piltast": "piletast", # 1
|
|
||||||
"piltaste": "piletast", # 1
|
|
||||||
"planetarium": "planetarie", # 1
|
|
||||||
"plasteret": "plastret", # 2
|
|
||||||
"plastic": "plastik", # 2
|
|
||||||
"play-off-kamp": "playoffkamp", # 1
|
|
||||||
"plejefader": "plejefar", # 1
|
|
||||||
"plejemoder": "plejemor", # 1
|
|
||||||
"podium": "podie", # 2
|
|
||||||
"praha": "prag", # 2
|
|
||||||
"preciøs": "pretiøs", # 2
|
|
||||||
"privilegium": "privilegie", # 1
|
|
||||||
"progredere": "progrediere", # 1
|
|
||||||
"præsidium": "præsidie", # 1
|
|
||||||
"psykodelisk": "psykedelisk", # 1
|
|
||||||
"pudsegrejer": "pudsegrej", # 1
|
|
||||||
"referensgruppe": "referencegruppe", # 1
|
|
||||||
"referensramme": "referenceramme", # 1
|
|
||||||
"refugium": "refugie", # 1
|
|
||||||
"registeret": "registret", # 2
|
|
||||||
"remedium": "remedie", # 1
|
|
||||||
"remiks": "remix", # 1
|
|
||||||
"reservert": "reserveret", # 1
|
|
||||||
"ressortministerium": "ressortministerie", # 1
|
|
||||||
"ressource": "resurse", # 2
|
|
||||||
"resætte": "resette", # 1
|
|
||||||
"rettelig": "retteligt", # 1
|
|
||||||
"rettetaste": "rettetast", # 1
|
|
||||||
"returtaste": "returtast", # 1
|
|
||||||
"risici": "risikoer", # 2
|
|
||||||
"roll-on": "rollon", # 1
|
|
||||||
"rollehefte": "rollehæfte", # 1
|
|
||||||
"rostbøf": "roastbeef", # 1
|
|
||||||
"rygsæksturist": "rygsækturist", # 1
|
|
||||||
"rødstjært": "rødstjert", # 1
|
|
||||||
"saddel": "sadel", # 2
|
|
||||||
"samaritan": "samaritaner", # 2
|
|
||||||
"sanatorium": "sanatorie", # 1
|
|
||||||
"sauce": "sovs", # 1
|
|
||||||
"scanning": "skanning", # 2
|
|
||||||
"sceneskifte": "sceneskift", # 1
|
|
||||||
"scilla": "skilla", # 1
|
|
||||||
"sejflydende": "sejtflydende", # 1
|
|
||||||
"selvstudium": "selvstudie", # 1
|
|
||||||
"seminarium": "seminarie", # 1
|
|
||||||
"sennepssauce": "sennepssovs ", # 1
|
|
||||||
"servitutbeheftet": "servitutbehæftet", # 1
|
|
||||||
"sit-in": "sitin", # 1
|
|
||||||
"skatteministerium": "skatteministerie", # 1
|
|
||||||
"skifer": "skiffer", # 2
|
|
||||||
"skyldsfølelse": "skyldfølelse", # 1
|
|
||||||
"skysauce": "skysovs", # 1
|
|
||||||
"sladdertaske": "sladretaske", # 2
|
|
||||||
"sladdervorn": "sladrevorn", # 2
|
|
||||||
"slagsbroder": "slagsbror", # 1
|
|
||||||
"slettetaste": "slettetast", # 1
|
|
||||||
"smørsauce": "smørsovs", # 1
|
|
||||||
"snitsel": "schnitzel", # 1
|
|
||||||
"snobbeeffekt": "snobeffekt", # 2
|
|
||||||
"socialministerium": "socialministerie", # 1
|
|
||||||
"solarium": "solarie", # 1
|
|
||||||
"soldebroder": "soldebror", # 1
|
|
||||||
"spagetti": "spaghetti", # 1
|
|
||||||
"spagettistrop": "spaghettistrop", # 1
|
|
||||||
"spagettiwestern": "spaghettiwestern", # 1
|
|
||||||
"spin-off": "spinoff", # 1
|
|
||||||
"spinnefiskeri": "spindefiskeri", # 1
|
|
||||||
"spolorm": "spoleorm", # 1
|
|
||||||
"sproglaboratorium": "sproglaboratorie", # 1
|
|
||||||
"spækbræt": "spækkebræt", # 2
|
|
||||||
"stand-in": "standin", # 1
|
|
||||||
"stand-up-comedy": "standupcomedy", # 1
|
|
||||||
"stand-up-komiker": "standupkomiker", # 1
|
|
||||||
"statsministerium": "statsministerie", # 1
|
|
||||||
"stedbroder": "stedbror", # 1
|
|
||||||
"stedfader": "stedfar", # 1
|
|
||||||
"stedmoder": "stedmor", # 1
|
|
||||||
"stilehefte": "stilehæfte", # 1
|
|
||||||
"stipendium": "stipendie", # 1
|
|
||||||
"stjært": "stjert", # 1
|
|
||||||
"stjærthage": "stjerthage", # 1
|
|
||||||
"storebroder": "storebror", # 1
|
|
||||||
"stortå": "storetå", # 1
|
|
||||||
"strabads": "strabadser", # 1
|
|
||||||
"strømlinjet": "strømlinet", # 1
|
|
||||||
"studium": "studie", # 1
|
|
||||||
"stænkelap": "stænklap", # 1
|
|
||||||
"sundhedsministerium": "sundhedsministerie", # 1
|
|
||||||
"suppositorium": "suppositorie", # 1
|
|
||||||
"svejts": "schweiz", # 1
|
|
||||||
"svejtser": "schweizer", # 1
|
|
||||||
"svejtserfranc": "schweizerfranc", # 1
|
|
||||||
"svejtserost": "schweizerost", # 1
|
|
||||||
"svejtsisk": "schweizisk", # 1
|
|
||||||
"svigerfader": "svigerfar", # 1
|
|
||||||
"svigermoder": "svigermor", # 1
|
|
||||||
"svirebroder": "svirebror", # 1
|
|
||||||
"symposium": "symposie", # 1
|
|
||||||
"sælarium": "sælarie", # 1
|
|
||||||
"søreme": "sørme", # 2
|
|
||||||
"søterritorium": "søterritorie", # 1
|
|
||||||
"t-bone-steak": "t-bonesteak", # 1
|
|
||||||
"tabgivende": "tabsgivende", # 1
|
|
||||||
"tabuere": "tabuisere", # 1
|
|
||||||
"tabuering": "tabuisering", # 1
|
|
||||||
"tackle": "takle", # 2
|
|
||||||
"tackling": "takling", # 2
|
|
||||||
"taifun": "tyfon", # 1
|
|
||||||
"take-off": "takeoff", # 1
|
|
||||||
"taknemlig": "taknemmelig", # 2
|
|
||||||
"talehørelærer": "tale-høre-lærer", # 1
|
|
||||||
"talehøreundervisning": "tale-høre-undervisning", # 1
|
|
||||||
"tandstik": "tandstikker", # 1
|
|
||||||
"tao": "dao", # 1
|
|
||||||
"taoisme": "daoisme", # 1
|
|
||||||
"taoist": "daoist", # 1
|
|
||||||
"taoistisk": "daoistisk", # 1
|
|
||||||
"taverne": "taverna", # 1
|
|
||||||
"teateret": "teatret", # 2
|
|
||||||
"tekno": "techno", # 1
|
|
||||||
"temposkifte": "temposkift", # 1
|
|
||||||
"terrarium": "terrarie", # 1
|
|
||||||
"territorium": "territorie", # 1
|
|
||||||
"tesis": "tese", # 1
|
|
||||||
"tidsstudium": "tidsstudie", # 1
|
|
||||||
"tipoldefader": "tipoldefar", # 1
|
|
||||||
"tipoldemoder": "tipoldemor", # 1
|
|
||||||
"tomatsauce": "tomatsovs", # 1
|
|
||||||
"tonart": "toneart", # 1
|
|
||||||
"trafikministerium": "trafikministerie", # 1
|
|
||||||
"tredve": "tredive", # 1
|
|
||||||
"tredver": "trediver", # 1
|
|
||||||
"tredveårig": "trediveårig", # 1
|
|
||||||
"tredveårs": "trediveårs", # 1
|
|
||||||
"tredveårsfødselsdag": "trediveårsfødselsdag", # 1
|
|
||||||
"tredvte": "tredivte", # 1
|
|
||||||
"tredvtedel": "tredivtedel", # 1
|
|
||||||
"troldunge": "troldeunge", # 1
|
|
||||||
"trommestikke": "trommestik", # 1
|
|
||||||
"trubadur": "troubadour", # 2
|
|
||||||
"trøstepræmie": "trøstpræmie", # 2
|
|
||||||
"tummerum": "trummerum", # 1
|
|
||||||
"tumultuarisk": "tumultarisk", # 1
|
|
||||||
"tunghørighed": "tunghørhed", # 1
|
|
||||||
"tus": "tusch", # 2
|
|
||||||
"tusind": "tusinde", # 2
|
|
||||||
"tvillingbroder": "tvillingebror", # 1
|
|
||||||
"tvillingbror": "tvillingebror", # 1
|
|
||||||
"tvillingebroder": "tvillingebror", # 1
|
|
||||||
"ubeheftet": "ubehæftet", # 1
|
|
||||||
"udenrigsministerium": "udenrigsministerie", # 1
|
|
||||||
"udhulning": "udhuling", # 1
|
|
||||||
"udslaggivende": "udslagsgivende", # 1
|
|
||||||
"udspekulert": "udspekuleret", # 1
|
|
||||||
"udviklingsministerium": "udviklingsministerie", # 1
|
|
||||||
"uforpligtigende": "uforpligtende", # 1
|
|
||||||
"uheldvarslende": "uheldsvarslende", # 1
|
|
||||||
"uimponerthed": "uimponerethed", # 1
|
|
||||||
"undervisningsministerium": "undervisningsministerie", # 1
|
|
||||||
"unægtelig": "unægteligt", # 1
|
|
||||||
"urinale": "urinal", # 1
|
|
||||||
"uvederheftig": "uvederhæftig", # 1
|
|
||||||
"vabel": "vable", # 2
|
|
||||||
"vadi": "wadi", # 1
|
|
||||||
"vaklevorn": "vakkelvorn", # 1
|
|
||||||
"vanadin": "vanadium", # 1
|
|
||||||
"vaselin": "vaseline", # 1
|
|
||||||
"vederheftig": "vederhæftig", # 1
|
|
||||||
"vedhefte": "vedhæfte", # 1
|
|
||||||
"velar": "velær", # 1
|
|
||||||
"videndeling": "vidensdeling", # 2
|
|
||||||
"vinkelanførelsestegn": "vinkelanførselstegn", # 1
|
|
||||||
"vipstjært": "vipstjert", # 1
|
|
||||||
"vismut": "bismut", # 1
|
|
||||||
"visvas": "vissevasse", # 1
|
|
||||||
"voksværk": "vokseværk", # 1
|
|
||||||
"værtdyr": "værtsdyr", # 1
|
|
||||||
"værtplante": "værtsplante", # 1
|
|
||||||
"wienersnitsel": "wienerschnitzel", # 1
|
|
||||||
"yderliggående": "yderligtgående", # 2
|
|
||||||
"zombi": "zombie", # 1
|
|
||||||
"ægbakke": "æggebakke", # 1
|
|
||||||
"ægformet": "æggeformet", # 1
|
|
||||||
"ægleder": "æggeleder", # 1
|
|
||||||
"ækvilibrist": "ekvilibrist", # 2
|
|
||||||
"æselsøre": "æseløre", # 1
|
|
||||||
"øjehule": "øjenhule", # 1
|
|
||||||
"øjelåg": "øjenlåg", # 1
|
|
||||||
"øjeåbner": "øjenåbner", # 1
|
|
||||||
"økonomiministerium": "økonomiministerie", # 1
|
|
||||||
"ørenring": "ørering", # 2
|
|
||||||
"øvehefte": "øvehæfte", # 1
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
NORM_EXCEPTIONS = {}
|
|
||||||
|
|
||||||
for string, norm in _exc.items():
|
|
||||||
NORM_EXCEPTIONS[string] = norm
|
|
||||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
|
@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
|
from ...symbols import ORTH, LEMMA, NORM
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -52,7 +52,7 @@ for exc_data in [
|
||||||
{ORTH: "Ons.", LEMMA: "onsdag"},
|
{ORTH: "Ons.", LEMMA: "onsdag"},
|
||||||
{ORTH: "Fre.", LEMMA: "fredag"},
|
{ORTH: "Fre.", LEMMA: "fredag"},
|
||||||
{ORTH: "Lør.", LEMMA: "lørdag"},
|
{ORTH: "Lør.", LEMMA: "lørdag"},
|
||||||
{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
|
{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
|
||||||
]:
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
@ -70,6 +70,7 @@ for orth in [
|
||||||
"A/S",
|
"A/S",
|
||||||
"B.C.",
|
"B.C.",
|
||||||
"BK.",
|
"BK.",
|
||||||
|
"B.T.",
|
||||||
"Dr.",
|
"Dr.",
|
||||||
"Boul.",
|
"Boul.",
|
||||||
"Chr.",
|
"Chr.",
|
||||||
|
@ -79,6 +80,7 @@ for orth in [
|
||||||
"Hf.",
|
"Hf.",
|
||||||
"i/s",
|
"i/s",
|
||||||
"I/S",
|
"I/S",
|
||||||
|
"Inc.",
|
||||||
"Kprs.",
|
"Kprs.",
|
||||||
"L.A.",
|
"L.A.",
|
||||||
"Ll.",
|
"Ll.",
|
||||||
|
@ -149,6 +151,7 @@ for orth in [
|
||||||
"bygn.",
|
"bygn.",
|
||||||
"c/o",
|
"c/o",
|
||||||
"ca.",
|
"ca.",
|
||||||
|
"cm.",
|
||||||
"cand.",
|
"cand.",
|
||||||
"d.d.",
|
"d.d.",
|
||||||
"d.m.",
|
"d.m.",
|
||||||
|
@ -172,10 +175,12 @@ for orth in [
|
||||||
"dl.",
|
"dl.",
|
||||||
"do.",
|
"do.",
|
||||||
"dobb.",
|
"dobb.",
|
||||||
|
"dr.",
|
||||||
"dr.h.c",
|
"dr.h.c",
|
||||||
"dr.phil.",
|
"dr.phil.",
|
||||||
"ds.",
|
"ds.",
|
||||||
"dvs.",
|
"dvs.",
|
||||||
|
"d.v.s.",
|
||||||
"e.b.",
|
"e.b.",
|
||||||
"e.l.",
|
"e.l.",
|
||||||
"e.o.",
|
"e.o.",
|
||||||
|
@ -297,10 +302,14 @@ for orth in [
|
||||||
"kap.",
|
"kap.",
|
||||||
"kbh.",
|
"kbh.",
|
||||||
"kem.",
|
"kem.",
|
||||||
|
"kg.",
|
||||||
|
"kgs.",
|
||||||
"kgl.",
|
"kgl.",
|
||||||
"kl.",
|
"kl.",
|
||||||
"kld.",
|
"kld.",
|
||||||
|
"km.",
|
||||||
"km/t",
|
"km/t",
|
||||||
|
"km/t.",
|
||||||
"knsp.",
|
"knsp.",
|
||||||
"komm.",
|
"komm.",
|
||||||
"kons.",
|
"kons.",
|
||||||
|
@ -311,6 +320,7 @@ for orth in [
|
||||||
"kt.",
|
"kt.",
|
||||||
"ktr.",
|
"ktr.",
|
||||||
"kv.",
|
"kv.",
|
||||||
|
"kvm.",
|
||||||
"kvt.",
|
"kvt.",
|
||||||
"l.c.",
|
"l.c.",
|
||||||
"lab.",
|
"lab.",
|
||||||
|
@ -357,6 +367,7 @@ for orth in [
|
||||||
"nto.",
|
"nto.",
|
||||||
"nuv.",
|
"nuv.",
|
||||||
"o/m",
|
"o/m",
|
||||||
|
"o/m.",
|
||||||
"o.a.",
|
"o.a.",
|
||||||
"o.fl.",
|
"o.fl.",
|
||||||
"o.h.",
|
"o.h.",
|
||||||
|
@ -526,6 +537,7 @@ for orth in [
|
||||||
"vejl.",
|
"vejl.",
|
||||||
"vh.",
|
"vh.",
|
||||||
"vha.",
|
"vha.",
|
||||||
|
"vind.",
|
||||||
"vs.",
|
"vs.",
|
||||||
"vsa.",
|
"vsa.",
|
||||||
"vær.",
|
"vær.",
|
||||||
|
@ -565,7 +577,7 @@ for h in range(1, 31 + 1):
|
||||||
for period in ["."]:
|
for period in ["."]:
|
||||||
_exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
|
_exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
|
||||||
|
|
||||||
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
|
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
|
||||||
_exc.update(_custom_base_exc)
|
_exc.update(_custom_base_exc)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
|
@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class GermanDefaults(Language.Defaults):
|
class GermanDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: "de"
|
lex_attr_getters[LANG] = lambda text: "de"
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# Here we only want to include the absolute most common words. Otherwise,
|
|
||||||
# this list would get impossibly long for German – especially considering the
|
|
||||||
# old vs. new spelling rules, and all possible cases.
|
|
||||||
|
|
||||||
|
|
||||||
_exc = {"daß": "dass"}
|
|
||||||
|
|
||||||
|
|
||||||
NORM_EXCEPTIONS = {}
|
|
||||||
|
|
||||||
for string, norm in _exc.items():
|
|
||||||
NORM_EXCEPTIONS[string] = norm
|
|
||||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
|
@ -2,12 +2,12 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
||||||
from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
|
from ..char_classes import CURRENCY, UNITS, PUNCT
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
from ..punctuation import _prefixes, _suffixes
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
_prefixes = ["``",] + list(_prefixes)
|
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
["''", "/"]
|
["''", "/"]
|
||||||
|
|
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -27,13 +28,17 @@ def noun_chunks(obj):
|
||||||
"og",
|
"og",
|
||||||
"app",
|
"app",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||||
close_app = doc.vocab.strings.add("nk")
|
close_app = doc.vocab.strings.add("nk")
|
||||||
|
|
||||||
rbracket = 0
|
rbracket = 0
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if i < rbracket:
|
if i < rbracket:
|
||||||
continue
|
continue
|
||||||
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
||||||
|
|
|
@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lookups import Lookups
|
from ...lookups import Lookups
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class GreekDefaults(Language.Defaults):
|
class GreekDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: "el"
|
lex_attr_getters[LANG] = lambda text: "el"
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases. Works on both Doc and Span.
|
Detect base noun phrases. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -13,34 +14,34 @@ def noun_chunks(obj):
|
||||||
# obj tag corrects some DEP tagger mistakes.
|
# obj tag corrects some DEP tagger mistakes.
|
||||||
# Further improvement of the models will eliminate the need for this tag.
|
# Further improvement of the models will eliminate the need for this tag.
|
||||||
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
nmod = doc.vocab.strings.add("nmod")
|
nmod = doc.vocab.strings.add("nmod")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
prev_end = -1
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
if word.i in seen:
|
if word.left_edge.i <= prev_end:
|
||||||
continue
|
continue
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
|
||||||
continue
|
|
||||||
flag = False
|
flag = False
|
||||||
if word.pos == NOUN:
|
if word.pos == NOUN:
|
||||||
# check for patterns such as γραμμή παραγωγής
|
# check for patterns such as γραμμή παραγωγής
|
||||||
for potential_nmod in word.rights:
|
for potential_nmod in word.rights:
|
||||||
if potential_nmod.dep == nmod:
|
if potential_nmod.dep == nmod:
|
||||||
seen.update(
|
prev_end = potential_nmod.i
|
||||||
j for j in range(word.left_edge.i, potential_nmod.i + 1)
|
|
||||||
)
|
|
||||||
yield word.left_edge.i, potential_nmod.i + 1, np_label
|
yield word.left_edge.i, potential_nmod.i + 1, np_label
|
||||||
flag = True
|
flag = True
|
||||||
break
|
break
|
||||||
if flag is False:
|
if flag is False:
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
prev_end = word.i
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
# covers the case: έχει όμορφα και έξυπνα παιδιά
|
# covers the case: έχει όμορφα και έξυπνα παιδιά
|
||||||
|
@ -49,9 +50,7 @@ def noun_chunks(obj):
|
||||||
head = head.head
|
head = head.head
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
def _return_en(_):
|
def _return_en(_):
|
||||||
|
@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = _return_en
|
lex_attr_getters[LANG] = _return_en
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -19,21 +20,23 @@ def noun_chunks(obj):
|
||||||
"attr",
|
"attr",
|
||||||
"ROOT",
|
"ROOT",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
prev_end = -1
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
if word.i in seen:
|
if word.left_edge.i <= prev_end:
|
||||||
continue
|
continue
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
head = word.head
|
head = word.head
|
||||||
|
@ -41,9 +44,7 @@ def noun_chunks(obj):
|
||||||
head = head.head
|
head = head.head
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -77,12 +77,12 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
||||||
|
|
||||||
_exc[orth + "'d"] = [
|
_exc[orth + "'d"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
{ORTH: "'d", NORM: "'d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "d"] = [
|
_exc[orth + "d"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
{ORTH: "d", NORM: "'d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
|
@ -195,7 +195,10 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||||
{ORTH: "'d", NORM: "'d"},
|
{ORTH: "'d", NORM: "'d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "d"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}]
|
_exc[orth + "d"] = [
|
||||||
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
|
{ORTH: "d", NORM: "'d"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
|
|
|
@ -6,6 +6,7 @@ from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults):
|
||||||
)
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,15 @@ _num_words = [
|
||||||
"dieciocho",
|
"dieciocho",
|
||||||
"diecinueve",
|
"diecinueve",
|
||||||
"veinte",
|
"veinte",
|
||||||
|
"veintiuno",
|
||||||
|
"veintidós",
|
||||||
|
"veintitrés",
|
||||||
|
"veinticuatro",
|
||||||
|
"veinticinco",
|
||||||
|
"veintiséis",
|
||||||
|
"veintisiete",
|
||||||
|
"veintiocho",
|
||||||
|
"veintinueve",
|
||||||
"treinta",
|
"treinta",
|
||||||
"cuarenta",
|
"cuarenta",
|
||||||
"cincuenta",
|
"cincuenta",
|
||||||
|
|
47
spacy/lang/es/punctuation.py
Normal file
47
spacy/lang/es/punctuation.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||||
|
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
from ..char_classes import merge_chars
|
||||||
|
|
||||||
|
|
||||||
|
_list_units = [u for u in LIST_UNITS if u != "%"]
|
||||||
|
_units = merge_chars(" ".join(_list_units))
|
||||||
|
_concat_quotes = CONCAT_QUOTES + "—–"
|
||||||
|
|
||||||
|
|
||||||
|
_suffixes = (
|
||||||
|
["—", "–"]
|
||||||
|
+ LIST_PUNCT
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ LIST_QUOTES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])\+",
|
||||||
|
r"(?<=°[FfCcKk])\.",
|
||||||
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
|
r"(?<=[0-9])(?:{u})".format(u=_units),
|
||||||
|
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
||||||
|
al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT
|
||||||
|
),
|
||||||
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
|
@ -2,10 +2,15 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
doc = obj.doc
|
doc = doclike.doc
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
if not len(doc):
|
if not len(doc):
|
||||||
return
|
return
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
|
@ -16,7 +21,7 @@ def noun_chunks(obj):
|
||||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
||||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
||||||
token = doc[0]
|
token = doc[0]
|
||||||
while token and token.i < len(doc):
|
while token and token.i < len(doclike):
|
||||||
if token.pos in [PROPN, NOUN, PRON]:
|
if token.pos in [PROPN, NOUN, PRON]:
|
||||||
left, right = noun_bounds(
|
left, right = noun_bounds(
|
||||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
|
|
|
@ -43,14 +43,16 @@ for orth in [
|
||||||
"Av.",
|
"Av.",
|
||||||
"Avda.",
|
"Avda.",
|
||||||
"Cía.",
|
"Cía.",
|
||||||
|
"EE.UU.",
|
||||||
"etc.",
|
"etc.",
|
||||||
|
"fig.",
|
||||||
"Gob.",
|
"Gob.",
|
||||||
"Gral.",
|
"Gral.",
|
||||||
"Ing.",
|
"Ing.",
|
||||||
"J.C.",
|
"J.C.",
|
||||||
|
"km/h",
|
||||||
"Lic.",
|
"Lic.",
|
||||||
"m.n.",
|
"m.n.",
|
||||||
"no.",
|
|
||||||
"núm.",
|
"núm.",
|
||||||
"P.D.",
|
"P.D.",
|
||||||
"Prof.",
|
"Prof.",
|
||||||
|
|
|
@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
|
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
|
||||||
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
|
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
|
||||||
]
|
]
|
||||||
|
|
|
@ -59,7 +59,6 @@ behin
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
|
|
|
@ -10,6 +10,7 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class PersianDefaults(Language.Defaults):
|
class PersianDefaults(Language.Defaults):
|
||||||
|
@ -24,6 +25,7 @@ class PersianDefaults(Language.Defaults):
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||||
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class Persian(Language):
|
class Persian(Language):
|
||||||
|
|
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -19,21 +20,23 @@ def noun_chunks(obj):
|
||||||
"attr",
|
"attr",
|
||||||
"ROOT",
|
"ROOT",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
prev_end = -1
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
if word.i in seen:
|
if word.left_edge.i <= prev_end:
|
||||||
continue
|
continue
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
head = word.head
|
head = word.head
|
||||||
|
@ -41,9 +44,7 @@ def noun_chunks(obj):
|
||||||
head = head.head
|
head = head.head
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
|
|
|
@ -1,15 +1,26 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
from ..char_classes import merge_chars
|
||||||
|
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = "' ’".replace(" ", "")
|
||||||
HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "")
|
HYPHENS = r"- – — ‐ ‑".replace(" ", "")
|
||||||
|
_prefixes_elision = "d l n"
|
||||||
|
_prefixes_elision += " " + _prefixes_elision.upper()
|
||||||
|
_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
|
||||||
|
_hyphen_suffixes += " " + _hyphen_suffixes.upper()
|
||||||
|
|
||||||
|
|
||||||
|
_prefixes = TOKENIZER_PREFIXES + [
|
||||||
|
r"(?:({pe})[{el}])(?=[{a}])".format(
|
||||||
|
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
+ LIST_ELLIPSES
|
+ LIST_ELLIPSES
|
||||||
|
@ -17,7 +28,6 @@ _suffixes = (
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.", # °C. -> ["°C", "."]
|
r"(?<=°[FfCcKk])\.", # °C. -> ["°C", "."]
|
||||||
r"(?<=[0-9])°[FfCcKk]", # 4°C -> ["4", "°C"]
|
|
||||||
r"(?<=[0-9])%", # 4% -> ["4", "%"]
|
r"(?<=[0-9])%", # 4% -> ["4", "%"]
|
||||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
|
@ -25,14 +35,17 @@ _suffixes = (
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
||||||
),
|
),
|
||||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}])[{h}]({hs})".format(
|
||||||
|
a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
|
||||||
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
_infixes = TOKENIZER_INFIXES + [
|
_infixes = TOKENIZER_INFIXES + [
|
||||||
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
|
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -18,21 +19,23 @@ def noun_chunks(obj):
|
||||||
"nmod",
|
"nmod",
|
||||||
"nmod:poss",
|
"nmod:poss",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
prev_end = -1
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
if word.i in seen:
|
if word.left_edge.i <= prev_end:
|
||||||
continue
|
continue
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.right_edge.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
|
||||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
head = word.head
|
head = word.head
|
||||||
|
@ -40,9 +43,7 @@ def noun_chunks(obj):
|
||||||
head = head.head
|
head = head.head
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.right_edge.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
|
||||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ import re
|
||||||
from .punctuation import ELISION, HYPHENS
|
from .punctuation import ELISION, HYPHENS
|
||||||
from ..tokenizer_exceptions import URL_PATTERN
|
from ..tokenizer_exceptions import URL_PATTERN
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA
|
from ..char_classes import ALPHA_LOWER, ALPHA
|
||||||
from ...symbols import ORTH, LEMMA, TAG
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
||||||
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
||||||
|
@ -56,7 +56,28 @@ for exc_data in [
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
for orth in ["etc."]:
|
for orth in [
|
||||||
|
"après-midi",
|
||||||
|
"au-delà",
|
||||||
|
"au-dessus",
|
||||||
|
"celle-ci",
|
||||||
|
"celles-ci",
|
||||||
|
"celui-ci",
|
||||||
|
"cf.",
|
||||||
|
"ci-dessous",
|
||||||
|
"elle-même",
|
||||||
|
"en-dessous",
|
||||||
|
"etc.",
|
||||||
|
"jusque-là",
|
||||||
|
"lui-même",
|
||||||
|
"MM.",
|
||||||
|
"No.",
|
||||||
|
"peut-être",
|
||||||
|
"pp.",
|
||||||
|
"quelques-uns",
|
||||||
|
"rendez-vous",
|
||||||
|
"Vol.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,7 +93,7 @@ for verb, verb_lemma in [
|
||||||
for pronoun in ["elle", "il", "on"]:
|
for pronoun in ["elle", "il", "on"]:
|
||||||
token = "{}-t-{}".format(orth, pronoun)
|
token = "{}-t-{}".format(orth, pronoun)
|
||||||
_exc[token] = [
|
_exc[token] = [
|
||||||
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
||||||
{LEMMA: "t", ORTH: "-t"},
|
{LEMMA: "t", ORTH: "-t"},
|
||||||
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
]
|
]
|
||||||
|
@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
|
||||||
for orth in [verb, verb.title()]:
|
for orth in [verb, verb.title()]:
|
||||||
token = "{}-ce".format(orth)
|
token = "{}-ce".format(orth)
|
||||||
_exc[token] = [
|
_exc[token] = [
|
||||||
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
||||||
{LEMMA: "ce", ORTH: "-ce"},
|
{LEMMA: "ce", ORTH: "-ce"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]:
|
||||||
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
|
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
|
||||||
for orth in [pre, pre.title()]:
|
for orth in [pre, pre.title()]:
|
||||||
_exc["%sest-ce" % orth] = [
|
_exc["%sest-ce" % orth] = [
|
||||||
{LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
|
{LEMMA: pre_lemma, ORTH: orth},
|
||||||
{LEMMA: "être", ORTH: "est", TAG: "VERB"},
|
{LEMMA: "être", ORTH: "est"},
|
||||||
{LEMMA: "ce", ORTH: "-ce"},
|
{LEMMA: "ce", ORTH: "-ce"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
for verb, pronoun in [("est", "il"), ("EST", "IL")]:
|
||||||
|
token = "{}-{}".format(verb, pronoun)
|
||||||
|
_exc[token] = [
|
||||||
|
{LEMMA: "être", ORTH: verb},
|
||||||
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
|
||||||
|
token = "{}'{}-{}".format(s, verb, pronoun)
|
||||||
|
_exc[token] = [
|
||||||
|
{LEMMA: "se", ORTH: s + "'"},
|
||||||
|
{LEMMA: "être", ORTH: verb},
|
||||||
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
_infixes_exc = []
|
_infixes_exc = []
|
||||||
orig_elision = "'"
|
orig_elision = "'"
|
||||||
orig_hyphen = "-"
|
orig_hyphen = "-"
|
||||||
|
@ -423,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
TOKEN_MATCH = re.compile(
|
TOKEN_MATCH = re.compile(
|
||||||
"|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
|
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
||||||
).match
|
).match
|
||||||
|
|
18
spacy/lang/gu/__init__.py
Normal file
18
spacy/lang/gu/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
class GujaratiDefaults(Language.Defaults):
    """Language defaults for Gujarati.

    Only the stop-word set is overridden; every other default is inherited
    unchanged from ``Language.Defaults``.
    """

    stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Gujarati(Language):
    """Gujarati language class, registered under the code ``"gu"``."""

    lang = "gu"
    # Attach the Gujarati-specific defaults declared above in this module.
    Defaults = GujaratiDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Gujarati"]
|
22
spacy/lang/gu/examples.py
Normal file
22
spacy/lang/gu/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.gu.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.",
|
||||||
|
"તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું",
|
||||||
|
"કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો",
|
||||||
|
"તેજપાળને બે પત્ની હતી",
|
||||||
|
"ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો",
|
||||||
|
"આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.",
|
||||||
|
"અહિયાં શું જોડાય છે?",
|
||||||
|
"મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.",
|
||||||
|
]
|
91
spacy/lang/gu/stop_words.py
Normal file
91
spacy/lang/gu/stop_words.py
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# Gujarati stop words, one per line. The literal is split on whitespace and
# wrapped in a set, so entries that appear more than once below (e.g. "ને",
# "થાય", "થઈ", "કરે") simply collapse into a single member.
STOP_WORDS = set(
    """
એમ
આ
એ
રહી
છે
છો
હતા
હતું
હતી
હોય
હતો
શકે
તે
તેના
તેનું
તેને
તેની
તેઓ
તેમને
તેમના
તેમણે
તેમનું
તેમાં
અને
અહીં
થી
થઈ
થાય
જે
 ને
કે
ના
ની
નો
ને
નું
શું
માં
પણ
પર
જેવા
જેવું
જાય
જેમ
જેથી
માત્ર
માટે
પરથી
આવ્યું
એવી
આવી
રીતે
સુધી
થાય
થઈ
સાથે
લાગે
હોવા
છતાં
રહેલા
કરી
કરે
કેટલા
કોઈ
કેમ
કર્યો
કર્યુ
કરે
સૌથી
ત્યારબાદ
તથા
દ્વારા
જુઓ
જાઓ
જ્યારે
ત્યારે
શકો
નથી
હવે
અથવા
થતો
દર
એટલો
પરંતુ
""".split()
)
|
26
spacy/lang/hy/__init__.py
Normal file
26
spacy/lang/hy/__init__.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
class ArmenianDefaults(Language.Defaults):
    """Language defaults for Armenian.

    Overrides the lexical attribute getters, the stop-word set and the tag
    map; everything else is inherited from ``Language.Defaults``.
    """

    # Work on a copy so the updates below do not mutate the dict object held
    # by the base ``Language.Defaults``.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # The LANG getter ignores its argument and always reports "hy".
    lex_attr_getters[LANG] = lambda text: "hy"

    # Layer the Armenian-specific getters (from .lex_attrs) over the copied
    # base getters.
    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
|
||||||
|
|
||||||
|
|
||||||
|
class Armenian(Language):
    """Armenian language class, registered under the code ``"hy"``."""

    lang = "hy"
    # Attach the Armenian-specific defaults declared above in this module.
    Defaults = ArmenianDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Armenian"]
|
16
spacy/lang/hy/examples.py
Normal file
16
spacy/lang/hy/examples.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
>>> from spacy.lang.hy.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
|
||||||
|
"Ո՞վ է Ֆրանսիայի նախագահը։",
|
||||||
|
"Որն է Միացյալ Նահանգների մայրաքաղաքը։",
|
||||||
|
"Ե՞րբ է ծնվել Բարաք Օբաման։",
|
||||||
|
]
|
59
spacy/lang/hy/lex_attrs.py
Normal file
59
spacy/lang/hy/lex_attrs.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"զրօ",
|
||||||
|
"մէկ",
|
||||||
|
"երկու",
|
||||||
|
"երեք",
|
||||||
|
"չորս",
|
||||||
|
"հինգ",
|
||||||
|
"վեց",
|
||||||
|
"յոթ",
|
||||||
|
"ութ",
|
||||||
|
"ինը",
|
||||||
|
"տասը",
|
||||||
|
"տասնմեկ",
|
||||||
|
"տասներկու",
|
||||||
|
"տասներեք",
|
||||||
|
"տասնչորս",
|
||||||
|
"տասնհինգ",
|
||||||
|
"տասնվեց",
|
||||||
|
"տասնյոթ",
|
||||||
|
"տասնութ",
|
||||||
|
"տասնինը",
|
||||||
|
"քսան" "երեսուն",
|
||||||
|
"քառասուն",
|
||||||
|
"հիսուն",
|
||||||
|
"վաթցսուն",
|
||||||
|
"յոթանասուն",
|
||||||
|
"ութսուն",
|
||||||
|
"ինիսուն",
|
||||||
|
"հարյուր",
|
||||||
|
"հազար",
|
||||||
|
"միլիոն",
|
||||||
|
"միլիարդ",
|
||||||
|
"տրիլիոն",
|
||||||
|
"քվինտիլիոն",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
    """Return True if ``text`` looks like a number.

    A token counts as number-like when, after dropping one leading
    ``+``, ``-``, ``±`` or ``~`` and removing every ``,`` and ``.``, it is
    either all digits, a ``digits/digits`` fraction with exactly one slash,
    or (lower-cased) one of the words in ``_num_words``.
    """
    # Strip at most one leading sign / approximation marker.
    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    # Thousands and decimal separators are irrelevant for the check.
    cleaned = unsigned.replace(",", "").replace(".", "")
    if cleaned.isdigit():
        return True
    # ``partition`` splits on the first slash; requiring that no slash
    # remains in the tail means the token had exactly one.
    numerator, slash, denominator = cleaned.partition("/")
    if (
        slash
        and "/" not in denominator
        and numerator.isdigit()
        and denominator.isdigit()
    ):
        return True
    return cleaned.lower() in _num_words
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
110
spacy/lang/hy/stop_words.py
Normal file
110
spacy/lang/hy/stop_words.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# Armenian stop words, one per line. The literal is split on *any* whitespace
# and wrapped in a set, which has two consequences worth knowing:
#   * repeated entries (e.g. "նա", "մի", "այն") collapse into one member;
#   * the two-word lines "ամեն ոք" and "ամեն ինչ" are not stored as phrases;
#     they contribute the separate members "ամեն", "ոք" and "ինչ".
STOP_WORDS = set(
    """
նա
ողջը
այստեղ
ենք
նա
էիր
որպես
ուրիշ
բոլորը
այն
այլ
նույնչափ
էի
մի
և
ողջ
ես
ոմն
հետ
նրանք
ամենքը
ըստ
ինչ-ինչ
այսպես
համայն
մի
նաև
նույնքան
դա
ովևէ
համար
այնտեղ
էին
որոնք
սույն
ինչ-որ
ամենը
նույնպիսի
ու
իր
որոշ
միևնույն
ի
այնպիսի
մենք
ամեն ոք
նույն
երբևէ
այն
որևէ
ին
այդպես
նրա
որը
վրա
դու
էինք
այդպիսի
էիք
յուրաքանչյուրը
եմ
պիտի
այդ
ամբողջը
հետո
եք
ամեն
այլ
կամ
այսքան
որ
այնպես
այսինչ
բոլոր
է
մեկնումեկը
այդչափ
այնքան
ամբողջ
երբևիցե
այնչափ
ամենայն
մյուս
այնինչ
իսկ
այդտեղ
այս
սա
են
ամեն ինչ
որևիցե
ում
մեկը
այդ
դուք
այսչափ
այդքան
այսպիսի
էր
յուրաքանչյուր
այս
մեջ
թ
""".split()
)
|
2478
spacy/lang/hy/tag_map.py
Normal file
2478
spacy/lang/hy/tag_map.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -4,25 +4,20 @@ from __future__ import unicode_literals
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class IndonesianDefaults(Language.Defaults):
|
class IndonesianDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: "id"
|
lex_attr_getters[LANG] = lambda text: "id"
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
|
|
@ -1,532 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# Daftar kosakata yang sering salah dieja
|
|
||||||
# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
|
|
||||||
_exc = {
|
|
||||||
# Slang and abbreviations
|
|
||||||
"silahkan": "silakan",
|
|
||||||
"yg": "yang",
|
|
||||||
"kalo": "kalau",
|
|
||||||
"cawu": "caturwulan",
|
|
||||||
"ok": "oke",
|
|
||||||
"gak": "tidak",
|
|
||||||
"enggak": "tidak",
|
|
||||||
"nggak": "tidak",
|
|
||||||
"ndak": "tidak",
|
|
||||||
"ngga": "tidak",
|
|
||||||
"dgn": "dengan",
|
|
||||||
"tdk": "tidak",
|
|
||||||
"jg": "juga",
|
|
||||||
"klo": "kalau",
|
|
||||||
"denger": "dengar",
|
|
||||||
"pinter": "pintar",
|
|
||||||
"krn": "karena",
|
|
||||||
"nemuin": "menemukan",
|
|
||||||
"jgn": "jangan",
|
|
||||||
"udah": "sudah",
|
|
||||||
"sy": "saya",
|
|
||||||
"udh": "sudah",
|
|
||||||
"dapetin": "mendapatkan",
|
|
||||||
"ngelakuin": "melakukan",
|
|
||||||
"ngebuat": "membuat",
|
|
||||||
"membikin": "membuat",
|
|
||||||
"bikin": "buat",
|
|
||||||
# Daftar kosakata yang sering salah dieja
|
|
||||||
"malpraktik": "malapraktik",
|
|
||||||
"malfungsi": "malafungsi",
|
|
||||||
"malserap": "malaserap",
|
|
||||||
"maladaptasi": "malaadaptasi",
|
|
||||||
"malsuai": "malasuai",
|
|
||||||
"maldistribusi": "maladistribusi",
|
|
||||||
"malgizi": "malagizi",
|
|
||||||
"malsikap": "malasikap",
|
|
||||||
"memperhatikan": "memerhatikan",
|
|
||||||
"akte": "akta",
|
|
||||||
"cemilan": "camilan",
|
|
||||||
"esei": "esai",
|
|
||||||
"frase": "frasa",
|
|
||||||
"kafeteria": "kafetaria",
|
|
||||||
"ketapel": "katapel",
|
|
||||||
"kenderaan": "kendaraan",
|
|
||||||
"menejemen": "manajemen",
|
|
||||||
"menejer": "manajer",
|
|
||||||
"mesjid": "masjid",
|
|
||||||
"rebo": "rabu",
|
|
||||||
"seksama": "saksama",
|
|
||||||
"senggama": "sanggama",
|
|
||||||
"sekedar": "sekadar",
|
|
||||||
"seprei": "seprai",
|
|
||||||
"semedi": "semadi",
|
|
||||||
"samadi": "semadi",
|
|
||||||
"amandemen": "amendemen",
|
|
||||||
"algoritma": "algoritme",
|
|
||||||
"aritmatika": "aritmetika",
|
|
||||||
"metoda": "metode",
|
|
||||||
"materai": "meterai",
|
|
||||||
"meterei": "meterai",
|
|
||||||
"kalendar": "kalender",
|
|
||||||
"kadaluwarsa": "kedaluwarsa",
|
|
||||||
"katagori": "kategori",
|
|
||||||
"parlamen": "parlemen",
|
|
||||||
"sekular": "sekuler",
|
|
||||||
"selular": "seluler",
|
|
||||||
"sirkular": "sirkuler",
|
|
||||||
"survai": "survei",
|
|
||||||
"survey": "survei",
|
|
||||||
"aktuil": "aktual",
|
|
||||||
"formil": "formal",
|
|
||||||
"trotoir": "trotoar",
|
|
||||||
"komersiil": "komersial",
|
|
||||||
"komersil": "komersial",
|
|
||||||
"tradisionil": "tradisionial",
|
|
||||||
"orisinil": "orisinal",
|
|
||||||
"orijinil": "orisinal",
|
|
||||||
"afdol": "afdal",
|
|
||||||
"antri": "antre",
|
|
||||||
"apotik": "apotek",
|
|
||||||
"atlit": "atlet",
|
|
||||||
"atmosfir": "atmosfer",
|
|
||||||
"cidera": "cedera",
|
|
||||||
"cendikiawan": "cendekiawan",
|
|
||||||
"cepet": "cepat",
|
|
||||||
"cinderamata": "cenderamata",
|
|
||||||
"debet": "debit",
|
|
||||||
"difinisi": "definisi",
|
|
||||||
"dekrit": "dekret",
|
|
||||||
"disain": "desain",
|
|
||||||
"diskripsi": "deskripsi",
|
|
||||||
"diskotik": "diskotek",
|
|
||||||
"eksim": "eksem",
|
|
||||||
"exim": "eksem",
|
|
||||||
"faidah": "faedah",
|
|
||||||
"ekstrim": "ekstrem",
|
|
||||||
"ekstrimis": "ekstremis",
|
|
||||||
"komplit": "komplet",
|
|
||||||
"konkrit": "konkret",
|
|
||||||
"kongkrit": "konkret",
|
|
||||||
"kongkret": "konkret",
|
|
||||||
"kridit": "kredit",
|
|
||||||
"musium": "museum",
|
|
||||||
"pinalti": "penalti",
|
|
||||||
"piranti": "peranti",
|
|
||||||
"pinsil": "pensil",
|
|
||||||
"personil": "personel",
|
|
||||||
"sistim": "sistem",
|
|
||||||
"teoritis": "teoretis",
|
|
||||||
"vidio": "video",
|
|
||||||
"cengkeh": "cengkih",
|
|
||||||
"desertasi": "disertasi",
|
|
||||||
"hakekat": "hakikat",
|
|
||||||
"intelejen": "intelijen",
|
|
||||||
"kaedah": "kaidah",
|
|
||||||
"kempes": "kempis",
|
|
||||||
"kementrian": "kementerian",
|
|
||||||
"ledeng": "leding",
|
|
||||||
"nasehat": "nasihat",
|
|
||||||
"penasehat": "penasihat",
|
|
||||||
"praktek": "praktik",
|
|
||||||
"praktekum": "praktikum",
|
|
||||||
"resiko": "risiko",
|
|
||||||
"retsleting": "ritsleting",
|
|
||||||
"senen": "senin",
|
|
||||||
"amuba": "ameba",
|
|
||||||
"punggawa": "penggawa",
|
|
||||||
"surban": "serban",
|
|
||||||
"nomer": "nomor",
|
|
||||||
"sorban": "serban",
|
|
||||||
"bis": "bus",
|
|
||||||
"agribisnis": "agrobisnis",
|
|
||||||
"kantung": "kantong",
|
|
||||||
"khutbah": "khotbah",
|
|
||||||
"mandur": "mandor",
|
|
||||||
"rubuh": "roboh",
|
|
||||||
"pastur": "pastor",
|
|
||||||
"supir": "sopir",
|
|
||||||
"goncang": "guncang",
|
|
||||||
"goa": "gua",
|
|
||||||
"kaos": "kaus",
|
|
||||||
"kokoh": "kukuh",
|
|
||||||
"komulatif": "kumulatif",
|
|
||||||
"kolomnis": "kolumnis",
|
|
||||||
"korma": "kurma",
|
|
||||||
"lobang": "lubang",
|
|
||||||
"limo": "limusin",
|
|
||||||
"limosin": "limusin",
|
|
||||||
"mangkok": "mangkuk",
|
|
||||||
"saos": "saus",
|
|
||||||
"sop": "sup",
|
|
||||||
"sorga": "surga",
|
|
||||||
"tegor": "tegur",
|
|
||||||
"telor": "telur",
|
|
||||||
"obrak-abrik": "ubrak-abrik",
|
|
||||||
"ekwivalen": "ekuivalen",
|
|
||||||
"frekwensi": "frekuensi",
|
|
||||||
"konsekwensi": "konsekuensi",
|
|
||||||
"kwadran": "kuadran",
|
|
||||||
"kwadrat": "kuadrat",
|
|
||||||
"kwalifikasi": "kualifikasi",
|
|
||||||
"kwalitas": "kualitas",
|
|
||||||
"kwalitet": "kualitas",
|
|
||||||
"kwalitatif": "kualitatif",
|
|
||||||
"kwantitas": "kuantitas",
|
|
||||||
"kwantitatif": "kuantitatif",
|
|
||||||
"kwantum": "kuantum",
|
|
||||||
"kwartal": "kuartal",
|
|
||||||
"kwintal": "kuintal",
|
|
||||||
"kwitansi": "kuitansi",
|
|
||||||
"kwatir": "khawatir",
|
|
||||||
"kuatir": "khawatir",
|
|
||||||
"jadual": "jadwal",
|
|
||||||
"hirarki": "hierarki",
|
|
||||||
"karir": "karier",
|
|
||||||
"aktip": "aktif",
|
|
||||||
"daptar": "daftar",
|
|
||||||
"efektip": "efektif",
|
|
||||||
"epektif": "efektif",
|
|
||||||
"epektip": "efektif",
|
|
||||||
"Pebruari": "Februari",
|
|
||||||
"pisik": "fisik",
|
|
||||||
"pondasi": "fondasi",
|
|
||||||
"photo": "foto",
|
|
||||||
"photokopi": "fotokopi",
|
|
||||||
"hapal": "hafal",
|
|
||||||
"insap": "insaf",
|
|
||||||
"insyaf": "insaf",
|
|
||||||
"konperensi": "konferensi",
|
|
||||||
"kreatip": "kreatif",
|
|
||||||
"kreativ": "kreatif",
|
|
||||||
"maap": "maaf",
|
|
||||||
"napsu": "nafsu",
|
|
||||||
"negatip": "negatif",
|
|
||||||
"negativ": "negatif",
|
|
||||||
"objektip": "objektif",
|
|
||||||
"obyektip": "objektif",
|
|
||||||
"obyektif": "objektif",
|
|
||||||
"pasip": "pasif",
|
|
||||||
"pasiv": "pasif",
|
|
||||||
"positip": "positif",
|
|
||||||
"positiv": "positif",
|
|
||||||
"produktip": "produktif",
|
|
||||||
"produktiv": "produktif",
|
|
||||||
"sarap": "saraf",
|
|
||||||
"sertipikat": "sertifikat",
|
|
||||||
"subjektip": "subjektif",
|
|
||||||
"subyektip": "subjektif",
|
|
||||||
"subyektif": "subjektif",
|
|
||||||
"tarip": "tarif",
|
|
||||||
"transitip": "transitif",
|
|
||||||
"transitiv": "transitif",
|
|
||||||
"faham": "paham",
|
|
||||||
"fikir": "pikir",
|
|
||||||
"berfikir": "berpikir",
|
|
||||||
"telefon": "telepon",
|
|
||||||
"telfon": "telepon",
|
|
||||||
"telpon": "telepon",
|
|
||||||
"tilpon": "telepon",
|
|
||||||
"nafas": "napas",
|
|
||||||
"bernafas": "bernapas",
|
|
||||||
"pernafasan": "pernapasan",
|
|
||||||
"vermak": "permak",
|
|
||||||
"vulpen": "pulpen",
|
|
||||||
"aktifis": "aktivis",
|
|
||||||
"konfeksi": "konveksi",
|
|
||||||
"motifasi": "motivasi",
|
|
||||||
"Nopember": "November",
|
|
||||||
"propinsi": "provinsi",
|
|
||||||
"babtis": "baptis",
|
|
||||||
"jerembab": "jerembap",
|
|
||||||
"lembab": "lembap",
|
|
||||||
"sembab": "sembap",
|
|
||||||
"saptu": "sabtu",
|
|
||||||
"tekat": "tekad",
|
|
||||||
"bejad": "bejat",
|
|
||||||
"nekad": "nekat",
|
|
||||||
"otoped": "otopet",
|
|
||||||
"skuad": "skuat",
|
|
||||||
"jenius": "genius",
|
|
||||||
"marjin": "margin",
|
|
||||||
"marjinal": "marginal",
|
|
||||||
"obyek": "objek",
|
|
||||||
"subyek": "subjek",
|
|
||||||
"projek": "proyek",
|
|
||||||
"azas": "asas",
|
|
||||||
"ijasah": "ijazah",
|
|
||||||
"jenasah": "jenazah",
|
|
||||||
"plasa": "plaza",
|
|
||||||
"bathin": "batin",
|
|
||||||
"Katholik": "Katolik",
|
|
||||||
"orthografi": "ortografi",
|
|
||||||
"pathogen": "patogen",
|
|
||||||
"theologi": "teologi",
|
|
||||||
"ijin": "izin",
|
|
||||||
"rejeki": "rezeki",
|
|
||||||
"rejim": "rezim",
|
|
||||||
"jaman": "zaman",
|
|
||||||
"jamrud": "zamrud",
|
|
||||||
"jinah": "zina",
|
|
||||||
"perjinahan": "perzinaan",
|
|
||||||
"anugrah": "anugerah",
|
|
||||||
"cendrawasih": "cenderawasih",
|
|
||||||
"jendral": "jenderal",
|
|
||||||
"kripik": "keripik",
|
|
||||||
"krupuk": "kerupuk",
|
|
||||||
"ksatria": "kesatria",
|
|
||||||
"mentri": "menteri",
|
|
||||||
"negri": "negeri",
|
|
||||||
"Prancis": "Perancis",
|
|
||||||
"sebrang": "seberang",
|
|
||||||
"menyebrang": "menyeberang",
|
|
||||||
"Sumatra": "Sumatera",
|
|
||||||
"trampil": "terampil",
|
|
||||||
"isteri": "istri",
|
|
||||||
"justeru": "justru",
|
|
||||||
"perajurit": "prajurit",
|
|
||||||
"putera": "putra",
|
|
||||||
"puteri": "putri",
|
|
||||||
"samudera": "samudra",
|
|
||||||
"sastera": "sastra",
|
|
||||||
"sutera": "sutra",
|
|
||||||
"terompet": "trompet",
|
|
||||||
"iklas": "ikhlas",
|
|
||||||
"iktisar": "ikhtisar",
|
|
||||||
"kafilah": "khafilah",
|
|
||||||
"kawatir": "khawatir",
|
|
||||||
"kotbah": "khotbah",
|
|
||||||
"kusyuk": "khusyuk",
|
|
||||||
"makluk": "makhluk",
|
|
||||||
"mahluk": "makhluk",
|
|
||||||
"mahkluk": "makhluk",
|
|
||||||
"nahkoda": "nakhoda",
|
|
||||||
"nakoda": "nakhoda",
|
|
||||||
"tahta": "takhta",
|
|
||||||
"takhyul": "takhayul",
|
|
||||||
"tahyul": "takhayul",
|
|
||||||
"tahayul": "takhayul",
|
|
||||||
"akhli": "ahli",
|
|
||||||
"anarkhi": "anarki",
|
|
||||||
"kharisma": "karisma",
|
|
||||||
"kharismatik": "karismatik",
|
|
||||||
"mahsud": "maksud",
|
|
||||||
"makhsud": "maksud",
|
|
||||||
"rakhmat": "rahmat",
|
|
||||||
"tekhnik": "teknik",
|
|
||||||
"tehnik": "teknik",
|
|
||||||
"tehnologi": "teknologi",
|
|
||||||
"ikhwal": "ihwal",
|
|
||||||
"expor": "ekspor",
|
|
||||||
"extra": "ekstra",
|
|
||||||
"komplex": "komplek",
|
|
||||||
"sex": "seks",
|
|
||||||
"taxi": "taksi",
|
|
||||||
"extasi": "ekstasi",
|
|
||||||
"syaraf": "saraf",
|
|
||||||
"syurga": "surga",
|
|
||||||
"mashur": "masyhur",
|
|
||||||
"masyur": "masyhur",
|
|
||||||
"mahsyur": "masyhur",
|
|
||||||
"mashyur": "masyhur",
|
|
||||||
"muadzin": "muazin",
|
|
||||||
"adzan": "azan",
|
|
||||||
"ustadz": "ustaz",
|
|
||||||
"ustad": "ustaz",
|
|
||||||
"ustadzah": "ustaz",
|
|
||||||
"dzikir": "zikir",
|
|
||||||
"dzuhur": "zuhur",
|
|
||||||
"dhuhur": "zuhur",
|
|
||||||
"zhuhur": "zuhur",
|
|
||||||
"analisa": "analisis",
|
|
||||||
"diagnosa": "diagnosis",
|
|
||||||
"hipotesa": "hipotesis",
|
|
||||||
"sintesa": "sintesis",
|
|
||||||
"aktiviti": "aktivitas",
|
|
||||||
"aktifitas": "aktivitas",
|
|
||||||
"efektifitas": "efektivitas",
|
|
||||||
"komuniti": "komunitas",
|
|
||||||
"kreatifitas": "kreativitas",
|
|
||||||
"produktifitas": "produktivitas",
|
|
||||||
"realiti": "realitas",
|
|
||||||
"realita": "realitas",
|
|
||||||
"selebriti": "selebritas",
|
|
||||||
"spotifitas": "sportivitas",
|
|
||||||
"universiti": "universitas",
|
|
||||||
"utiliti": "utilitas",
|
|
||||||
"validiti": "validitas",
|
|
||||||
"dilokalisir": "dilokalisasi",
|
|
||||||
"didramatisir": "didramatisasi",
|
|
||||||
"dipolitisir": "dipolitisasi",
|
|
||||||
"dinetralisir": "dinetralisasi",
|
|
||||||
"dikonfrontir": "dikonfrontasi",
|
|
||||||
"mendominir": "mendominasi",
|
|
||||||
"koordinir": "koordinasi",
|
|
||||||
"proklamir": "proklamasi",
|
|
||||||
"terorganisir": "terorganisasi",
|
|
||||||
"terealisir": "terealisasi",
|
|
||||||
"robah": "ubah",
|
|
||||||
"dirubah": "diubah",
|
|
||||||
"merubah": "mengubah",
|
|
||||||
"terlanjur": "telanjur",
|
|
||||||
"terlantar": "telantar",
|
|
||||||
"penglepasan": "pelepasan",
|
|
||||||
"pelihatan": "penglihatan",
|
|
||||||
"pemukiman": "permukiman",
|
|
||||||
"pengrumahan": "perumahan",
|
|
||||||
"penyewaan": "persewaan",
|
|
||||||
"menyintai": "mencintai",
|
|
||||||
"menyolok": "mencolok",
|
|
||||||
"contek": "sontek",
|
|
||||||
"mencontek": "menyontek",
|
|
||||||
"pungkir": "mungkir",
|
|
||||||
"dipungkiri": "dimungkiri",
|
|
||||||
"kupungkiri": "kumungkiri",
|
|
||||||
"kaupungkiri": "kaumungkiri",
|
|
||||||
"nampak": "tampak",
|
|
||||||
"nampaknya": "tampaknya",
|
|
||||||
"nongkrong": "tongkrong",
|
|
||||||
"berternak": "beternak",
|
|
||||||
"berterbangan": "beterbangan",
|
|
||||||
"berserta": "beserta",
|
|
||||||
"berperkara": "beperkara",
|
|
||||||
"berpergian": "bepergian",
|
|
||||||
"berkerja": "bekerja",
|
|
||||||
"berberapa": "beberapa",
|
|
||||||
"terbersit": "tebersit",
|
|
||||||
"terpercaya": "tepercaya",
|
|
||||||
"terperdaya": "teperdaya",
|
|
||||||
"terpercik": "tepercik",
|
|
||||||
"terpergok": "tepergok",
|
|
||||||
"aksesoris": "aksesori",
|
|
||||||
"handal": "andal",
|
|
||||||
"hantar": "antar",
|
|
||||||
"panutan": "anutan",
|
|
||||||
"atsiri": "asiri",
|
|
||||||
"bhakti": "bakti",
|
|
||||||
"china": "cina",
|
|
||||||
"dharma": "darma",
|
|
||||||
"diktaktor": "diktator",
|
|
||||||
"eksport": "ekspor",
|
|
||||||
"hembus": "embus",
|
|
||||||
"hadits": "hadis",
|
|
||||||
"hadist": "hadits",
|
|
||||||
"harafiah": "harfiah",
|
|
||||||
"himbau": "imbau",
|
|
||||||
"import": "impor",
|
|
||||||
"inget": "ingat",
|
|
||||||
"hisap": "isap",
|
|
||||||
"interprestasi": "interpretasi",
|
|
||||||
"kangker": "kanker",
|
|
||||||
"konggres": "kongres",
|
|
||||||
"lansekap": "lanskap",
|
|
||||||
"maghrib": "magrib",
|
|
||||||
"emak": "mak",
|
|
||||||
"moderen": "modern",
|
|
||||||
"pasport": "paspor",
|
|
||||||
"perduli": "peduli",
|
|
||||||
"ramadhan": "ramadan",
|
|
||||||
"rapih": "rapi",
|
|
||||||
"Sansekerta": "Sanskerta",
|
|
||||||
"shalat": "salat",
|
|
||||||
"sholat": "salat",
|
|
||||||
"silahkan": "silakan",
|
|
||||||
"standard": "standar",
|
|
||||||
"hutang": "utang",
|
|
||||||
"zinah": "zina",
|
|
||||||
"ambulan": "ambulans",
|
|
||||||
"antartika": "sntarktika",
|
|
||||||
"arteri": "arteria",
|
|
||||||
"asik": "asyik",
|
|
||||||
"australi": "australia",
|
|
||||||
"denga": "dengan",
|
|
||||||
"depo": "depot",
|
|
||||||
"detil": "detail",
|
|
||||||
"ensiklopedi": "ensiklopedia",
|
|
||||||
"elit": "elite",
|
|
||||||
"frustasi": "frustrasi",
|
|
||||||
"gladi": "geladi",
|
|
||||||
"greget": "gereget",
|
|
||||||
"itali": "italia",
|
|
||||||
"karna": "karena",
|
|
||||||
"klenteng": "kelenteng",
|
|
||||||
"erling": "kerling",
|
|
||||||
"kontruksi": "konstruksi",
|
|
||||||
"masal": "massal",
|
|
||||||
"merk": "merek",
|
|
||||||
"respon": "respons",
|
|
||||||
"diresponi": "direspons",
|
|
||||||
"skak": "sekak",
|
|
||||||
"stir": "setir",
|
|
||||||
"singapur": "singapura",
|
|
||||||
"standarisasi": "standardisasi",
|
|
||||||
"varitas": "varietas",
|
|
||||||
"amphibi": "amfibi",
|
|
||||||
"anjlog": "anjlok",
|
|
||||||
"alpukat": "avokad",
|
|
||||||
"alpokat": "avokad",
|
|
||||||
"bolpen": "pulpen",
|
|
||||||
"cabe": "cabai",
|
|
||||||
"cabay": "cabai",
|
|
||||||
"ceret": "cerek",
|
|
||||||
"differensial": "diferensial",
|
|
||||||
"duren": "durian",
|
|
||||||
"faksimili": "faksimile",
|
|
||||||
"faksimil": "faksimile",
|
|
||||||
"graha": "gerha",
|
|
||||||
"goblog": "goblok",
|
|
||||||
"gombrong": "gombroh",
|
|
||||||
"horden": "gorden",
|
|
||||||
"korden": "gorden",
|
|
||||||
"gubug": "gubuk",
|
|
||||||
"imaginasi": "imajinasi",
|
|
||||||
"jerigen": "jeriken",
|
|
||||||
"jirigen": "jeriken",
|
|
||||||
"carut-marut": "karut-marut",
|
|
||||||
"kwota": "kuota",
|
|
||||||
"mahzab": "mazhab",
|
|
||||||
"mempesona": "memesona",
|
|
||||||
"milyar": "miliar",
|
|
||||||
"missi": "misi",
|
|
||||||
"nenas": "nanas",
|
|
||||||
"negoisasi": "negosiasi",
|
|
||||||
"automotif": "otomotif",
|
|
||||||
"pararel": "paralel",
|
|
||||||
"paska": "pasca",
|
|
||||||
"prosen": "persen",
|
|
||||||
"pete": "petai",
|
|
||||||
"petay": "petai",
|
|
||||||
"proffesor": "profesor",
|
|
||||||
"rame": "ramai",
|
|
||||||
"rapot": "rapor",
|
|
||||||
"rileks": "relaks",
|
|
||||||
"rileksasi": "relaksasi",
|
|
||||||
"renumerasi": "remunerasi",
|
|
||||||
"seketaris": "sekretaris",
|
|
||||||
"sekertaris": "sekretaris",
|
|
||||||
"sensorik": "sensoris",
|
|
||||||
"sentausa": "sentosa",
|
|
||||||
"strawberi": "stroberi",
|
|
||||||
"strawbery": "stroberi",
|
|
||||||
"taqwa": "takwa",
|
|
||||||
"tauco": "taoco",
|
|
||||||
"tauge": "taoge",
|
|
||||||
"toge": "taoge",
|
|
||||||
"tauladan": "teladan",
|
|
||||||
"taubat": "tobat",
|
|
||||||
"trilyun": "triliun",
|
|
||||||
"vissi": "visi",
|
|
||||||
"coklat": "cokelat",
|
|
||||||
"narkotika": "narkotik",
|
|
||||||
"oase": "oasis",
|
|
||||||
"politisi": "politikus",
|
|
||||||
"terong": "terung",
|
|
||||||
"wool": "wol",
|
|
||||||
"himpit": "impit",
|
|
||||||
"mujizat": "mukjizat",
|
|
||||||
"mujijat": "mukjizat",
|
|
||||||
"yag": "yang",
|
|
||||||
}
|
|
||||||
|
|
||||||
NORM_EXCEPTIONS = {}
|
|
||||||
|
|
||||||
for string, norm in _exc.items():
|
|
||||||
NORM_EXCEPTIONS[string] = norm
|
|
||||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -18,21 +19,23 @@ def noun_chunks(obj):
|
||||||
"nmod",
|
"nmod",
|
||||||
"nmod:poss",
|
"nmod:poss",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
|
if not doc.is_parsed:
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
prev_end = -1
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
if word.i in seen:
|
if word.left_edge.i <= prev_end:
|
||||||
continue
|
continue
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.right_edge.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
|
||||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
head = word.head
|
head = word.head
|
||||||
|
@ -40,9 +43,7 @@ def noun_chunks(obj):
|
||||||
head = head.head
|
head = head.head
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
prev_end = word.right_edge.i
|
||||||
continue
|
|
||||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
|
||||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,32 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
from ..char_classes import ALPHA
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||||
|
from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
|
||||||
|
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "")
|
ELISION = "'’"
|
||||||
|
|
||||||
|
|
||||||
_infixes = TOKENIZER_INFIXES + [
|
_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES
|
||||||
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
|
||||||
|
r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
|
||||||
]
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
|
|
@ -2,6 +2,56 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from ...symbols import ORTH, LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
|
_exc = {
|
||||||
|
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
|
||||||
|
"dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
|
||||||
|
"dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
|
||||||
|
"L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
|
||||||
|
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
|
||||||
|
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
|
||||||
|
"po'": [{ORTH: "po'", LEMMA: "poco"}],
|
||||||
|
"sett..": [{ORTH: "sett."}, {ORTH: "."}],
|
||||||
|
}
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"..",
|
||||||
|
"....",
|
||||||
|
"al.",
|
||||||
|
"all-path",
|
||||||
|
"art.",
|
||||||
|
"Art.",
|
||||||
|
"artt.",
|
||||||
|
"att.",
|
||||||
|
"by-pass",
|
||||||
|
"c.d.",
|
||||||
|
"centro-sinistra",
|
||||||
|
"check-up",
|
||||||
|
"Civ.",
|
||||||
|
"cm.",
|
||||||
|
"Cod.",
|
||||||
|
"col.",
|
||||||
|
"Cost.",
|
||||||
|
"d.C.",
|
||||||
|
'de"',
|
||||||
|
"distr.",
|
||||||
|
"E'",
|
||||||
|
"ecc.",
|
||||||
|
"e-mail",
|
||||||
|
"e/o",
|
||||||
|
"etc.",
|
||||||
|
"Jr.",
|
||||||
|
"n°",
|
||||||
|
"nord-est",
|
||||||
|
"pag.",
|
||||||
|
"Proc.",
|
||||||
|
"prof.",
|
||||||
|
"sett.",
|
||||||
|
"s.p.a.",
|
||||||
|
"ss.",
|
||||||
|
"St.",
|
||||||
|
"tel.",
|
||||||
|
"week-end",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
22
spacy/lang/kn/examples.py
Normal file
22
spacy/lang/kn/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.kn.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
|
||||||
|
"ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",
|
||||||
|
"ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ಪರಿಗಣಿಸುತ್ತದೆ.",
|
||||||
|
"ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್ಡಂನ ದೊಡ್ಡ ನಗರ.",
|
||||||
|
"ನೀನು ಎಲ್ಲಿದಿಯಾ?",
|
||||||
|
"ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?",
|
||||||
|
"ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?",
|
||||||
|
"ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?",
|
||||||
|
]
|
|
@ -9,8 +9,8 @@ Example sentences to test spaCy and its language models.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
|
"애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.",
|
||||||
"자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
|
"자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다",
|
||||||
"자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
|
"샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.",
|
||||||
"런던은 영국의 수도이자 가장 큰 도시입니다.",
|
"런던은 영국의 수도이자 가장 큰 도시입니다.",
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,26 +2,21 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class LuxembourgishDefaults(Language.Defaults):
|
class LuxembourgishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: "lb"
|
lex_attr_getters[LANG] = lambda text: "lb"
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
# norm exceptions: find a possibility to deal with the zillions of spelling
|
|
||||||
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
|
|
||||||
# here one could include the most common spelling mistakes
|
|
||||||
|
|
||||||
_exc = {"dass": "datt", "viläicht": "vläicht"}
|
|
||||||
|
|
||||||
|
|
||||||
NORM_EXCEPTIONS = {}
|
|
||||||
|
|
||||||
for string, norm in _exc.items():
|
|
||||||
NORM_EXCEPTIONS[string] = norm
|
|
||||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
|
@ -186,10 +186,6 @@ def suffix(string):
|
||||||
return string[-3:]
|
return string[-3:]
|
||||||
|
|
||||||
|
|
||||||
def cluster(string):
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
def is_alpha(string):
|
def is_alpha(string):
|
||||||
return string.isalpha()
|
return string.isalpha()
|
||||||
|
|
||||||
|
@ -218,20 +214,11 @@ def is_stop(string, stops=set()):
|
||||||
return string.lower() in stops
|
return string.lower() in stops
|
||||||
|
|
||||||
|
|
||||||
def is_oov(string):
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def get_prob(string):
|
|
||||||
return -20.0
|
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {
|
||||||
attrs.LOWER: lower,
|
attrs.LOWER: lower,
|
||||||
attrs.NORM: lower,
|
attrs.NORM: lower,
|
||||||
attrs.PREFIX: prefix,
|
attrs.PREFIX: prefix,
|
||||||
attrs.SUFFIX: suffix,
|
attrs.SUFFIX: suffix,
|
||||||
attrs.CLUSTER: cluster,
|
|
||||||
attrs.IS_ALPHA: is_alpha,
|
attrs.IS_ALPHA: is_alpha,
|
||||||
attrs.IS_DIGIT: is_digit,
|
attrs.IS_DIGIT: is_digit,
|
||||||
attrs.IS_LOWER: is_lower,
|
attrs.IS_LOWER: is_lower,
|
||||||
|
@ -239,8 +226,6 @@ LEX_ATTRS = {
|
||||||
attrs.IS_TITLE: is_title,
|
attrs.IS_TITLE: is_title,
|
||||||
attrs.IS_UPPER: is_upper,
|
attrs.IS_UPPER: is_upper,
|
||||||
attrs.IS_STOP: is_stop,
|
attrs.IS_STOP: is_stop,
|
||||||
attrs.IS_OOV: is_oov,
|
|
||||||
attrs.PROB: get_prob,
|
|
||||||
attrs.LIKE_EMAIL: like_email,
|
attrs.LIKE_EMAIL: like_email,
|
||||||
attrs.LIKE_NUM: like_num,
|
attrs.LIKE_NUM: like_num,
|
||||||
attrs.IS_PUNCT: is_punct,
|
attrs.IS_PUNCT: is_punct,
|
||||||
|
|
31
spacy/lang/lij/__init__.py
Normal file
31
spacy/lang/lij/__init__.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
from ...language import Language
|
||||||
|
from ...attrs import LANG, NORM
|
||||||
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
|
||||||
|
class LigurianDefaults(Language.Defaults):
|
||||||
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters[LANG] = lambda text: "lij"
|
||||||
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
|
class Ligurian(Language):
|
||||||
|
lang = "lij"
|
||||||
|
Defaults = LigurianDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Ligurian"]
|
18
spacy/lang/lij/examples.py
Normal file
18
spacy/lang/lij/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.lij.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Sciusciâ e sciorbî no se peu.",
|
||||||
|
"Graçie di çetroin, che me son arrivæ.",
|
||||||
|
"Vegnime apreuvo, che ve fasso pescâ di òmmi.",
|
||||||
|
"Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
|
||||||
|
]
|
15
spacy/lang/lij/punctuation.py
Normal file
15
spacy/lang/lij/punctuation.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..punctuation import TOKENIZER_INFIXES
|
||||||
|
from ..char_classes import ALPHA
|
||||||
|
|
||||||
|
|
||||||
|
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = TOKENIZER_INFIXES + [
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
43
spacy/lang/lij/stop_words.py
Normal file
43
spacy/lang/lij/stop_words.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
|
||||||
|
|
||||||
|
bella belle belli bello ben
|
||||||
|
|
||||||
|
ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
|
||||||
|
|
||||||
|
d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
|
||||||
|
|
||||||
|
é e ê ea ean emmo en ëse
|
||||||
|
|
||||||
|
fin fiña
|
||||||
|
|
||||||
|
gh' ghe guæei
|
||||||
|
|
||||||
|
i î in insemme int' inta inte inti into
|
||||||
|
|
||||||
|
l' lê lì lô
|
||||||
|
|
||||||
|
m' ma manco me megio meno mezo mi
|
||||||
|
|
||||||
|
na n' ne ni ninte nisciun nisciuña no
|
||||||
|
|
||||||
|
o ò ô oua
|
||||||
|
|
||||||
|
parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
|
||||||
|
|
||||||
|
quæ quand' quande quarche quella quelle quelli quello
|
||||||
|
|
||||||
|
s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
|
||||||
|
|
||||||
|
tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
|
||||||
|
|
||||||
|
un uña unn' unna
|
||||||
|
|
||||||
|
za zu
|
||||||
|
""".split()
|
||||||
|
)
|
52
spacy/lang/lij/tokenizer_exceptions.py
Normal file
52
spacy/lang/lij/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for raw, lemma in [
|
||||||
|
("a-a", "a-o"),
|
||||||
|
("a-e", "a-o"),
|
||||||
|
("a-o", "a-o"),
|
||||||
|
("a-i", "a-o"),
|
||||||
|
("co-a", "co-o"),
|
||||||
|
("co-e", "co-o"),
|
||||||
|
("co-i", "co-o"),
|
||||||
|
("co-o", "co-o"),
|
||||||
|
("da-a", "da-o"),
|
||||||
|
("da-e", "da-o"),
|
||||||
|
("da-i", "da-o"),
|
||||||
|
("da-o", "da-o"),
|
||||||
|
("pe-a", "pe-o"),
|
||||||
|
("pe-e", "pe-o"),
|
||||||
|
("pe-i", "pe-o"),
|
||||||
|
("pe-o", "pe-o"),
|
||||||
|
]:
|
||||||
|
for orth in [raw, raw.capitalize()]:
|
||||||
|
_exc[orth] = [{ORTH: orth, LEMMA: lemma}]
|
||||||
|
|
||||||
|
# Prefix + prepositions with à (e.g. "sott'a-o")
|
||||||
|
|
||||||
|
for prep, prep_lemma in [
|
||||||
|
("a-a", "a-o"),
|
||||||
|
("a-e", "a-o"),
|
||||||
|
("a-o", "a-o"),
|
||||||
|
("a-i", "a-o"),
|
||||||
|
]:
|
||||||
|
for prefix, prefix_lemma in [
|
||||||
|
("sott'", "sotta"),
|
||||||
|
("sott’", "sotta"),
|
||||||
|
("contr'", "contra"),
|
||||||
|
("contr’", "contra"),
|
||||||
|
("ch'", "che"),
|
||||||
|
("ch’", "che"),
|
||||||
|
("s'", "se"),
|
||||||
|
("s’", "se"),
|
||||||
|
]:
|
||||||
|
for prefix_orth in [prefix, prefix.capitalize()]:
|
||||||
|
_exc[prefix_orth + prep] = [
|
||||||
|
{ORTH: prefix_orth, LEMMA: prefix_lemma},
|
||||||
|
{ORTH: prep, LEMMA: prep_lemma},
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = _exc
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -26,7 +27,13 @@ class LithuanianDefaults(Language.Defaults):
|
||||||
)
|
)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
mod_base_exceptions = {
|
||||||
|
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||||
|
}
|
||||||
|
del mod_base_exceptions["8)"]
|
||||||
|
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
|
|
29
spacy/lang/lt/punctuation.py
Normal file
29
spacy/lang/lt/punctuation.py
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..char_classes import LIST_ICONS, LIST_ELLIPSES
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
from ..char_classes import HYPHENS
|
||||||
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_suffixes = ["\."] + list(TOKENIZER_SUFFIXES)
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user