Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)

Merge branch 'master' into develop

Commit 158b98a3ef
106  .github/contributors/GuiGel.md  vendored  Normal file
@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Guillaume Gelabert |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2019-11-15 |
| GitHub username | GuiGel |
| Website (optional) | |
106  .github/contributors/Olamyy.md  vendored  Normal file
@@ -0,0 +1,106 @@
# spaCy contributor agreement

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Olamilekan Wahab |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 8/11/2019 |
| GitHub username | Olamyy |
| Website (optional) | |

106  .github/contributors/aajanki.md  vendored  Normal file
@@ -0,0 +1,106 @@
# spaCy contributor agreement

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Antti Ajanki |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2019-11-30 |
| GitHub username | aajanki |
| Website (optional) | |

106  .github/contributors/erip.md  vendored  Normal file
@@ -0,0 +1,106 @@
# spaCy contributor agreement

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Elijah Rippeth |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2019-11-16 |
| GitHub username | erip |
| Website (optional) | |

106  .github/contributors/mmaybeno.md  vendored  Normal file
@@ -0,0 +1,106 @@
# spaCy contributor agreement

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Matt Maybeno |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2019-11-19 |
| GitHub username | mmaybeno |
| Website (optional) | |

87  .github/contributors/mr-bjerre.md  vendored  Normal file
@@ -0,0 +1,87 @@
## Contributor Agreement

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Nicolai Bjerre Pedersen |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2019-12-06 |
| GitHub username | mr_bjerre |
| Website (optional) | |

106  .github/contributors/questoph.md  vendored  Normal file
@@ -0,0 +1,106 @@
# spaCy contributor agreement

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Christoph Purschke |
| Company name (if applicable) | University of Luxembourg |
| Title or role (if applicable) | |
| Date | 14/11/2019 |
| GitHub username | questoph |
| Website (optional) | https://purschke.info |

@@ -35,24 +35,12 @@ jobs:
dependsOn: 'Validate'
strategy:
matrix:
# Python 2.7 currently doesn't work because it seems to be a narrow
# unicode build, which causes problems with the regular expressions

# Python27Linux:
# imageName: 'ubuntu-16.04'
# python.version: '2.7'
# Python27Mac:
# imageName: 'macos-10.13'
# python.version: '2.7'
Python35Linux:
imageName: 'ubuntu-16.04'
python.version: '3.5'
Python35Windows:
imageName: 'vs2017-win2016'
python.version: '3.5'
Python35Mac:
imageName: 'macos-10.13'
python.version: '3.5'
Python36Linux:
imageName: 'ubuntu-16.04'
python.version: '3.6'
@@ -62,15 +50,25 @@ jobs:
Python36Mac:
imageName: 'macos-10.13'
python.version: '3.6'
Python37Linux:
# Don't test on 3.7 for now to speed up builds
# Python37Linux:
# imageName: 'ubuntu-16.04'
# python.version: '3.7'
# Python37Windows:
# imageName: 'vs2017-win2016'
# python.version: '3.7'
# Python37Mac:
# imageName: 'macos-10.13'
# python.version: '3.7'
Python38Linux:
imageName: 'ubuntu-16.04'
python.version: '3.7'
Python37Windows:
python.version: '3.8'
Python38Windows:
imageName: 'vs2017-win2016'
python.version: '3.7'
Python37Mac:
python.version: '3.8'
Python38Mac:
imageName: 'macos-10.13'
python.version: '3.7'
python.version: '3.8'
maxParallel: 4
pool:
vmImage: $(imageName)
@@ -81,10 +79,8 @@ jobs:
versionSpec: '$(python.version)'
architecture: 'x64'

# Downgrading pip is necessary to prevent a wheel version incompatibility.
# Might be fixed in the future or some other way, so investigate again.
- script: |
python -m pip install -U pip==18.1 setuptools
python -m pip install -U setuptools
pip install -r requirements.txt
displayName: 'Install dependencies'

@@ -8,6 +8,7 @@ import plac
from pathlib import Path
import re
import json
import tqdm

import spacy
import spacy.util
@@ -225,6 +226,13 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):


def write_conllu(docs, file_):
    if not Token.has_extension("get_conllu_lines"):
        Token.set_extension("get_conllu_lines", method=get_token_conllu)
    if not Token.has_extension("begins_fused"):
        Token.set_extension("begins_fused", default=False)
    if not Token.has_extension("inside_fused"):
        Token.set_extension("inside_fused", default=False)

    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
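As a quick aside on the pattern used in the new `write_conllu` above: registering the same extension twice raises an error in spaCy, so the hunk guards each call with `Token.has_extension`. A minimal, self-contained sketch of that guard follows; the `get_token_conllu` helper here is only a hypothetical stand-in for the one defined in this script.

```python
from spacy.tokens import Token


def get_token_conllu(token, i):
    # Hypothetical stand-in for the real helper defined in this script.
    return "{}\t{}".format(i, token.text)


# Guarded registration: safe to call repeatedly. Elsewhere in this commit the
# same effect is achieved with force=True instead of has_extension checks.
if not Token.has_extension("get_conllu_lines"):
    Token.set_extension("get_conllu_lines", method=get_token_conllu)
if not Token.has_extension("begins_fused"):
    Token.set_extension("begins_fused", default=False)
if not Token.has_extension("inside_fused"):
    Token.set_extension("inside_fused", default=False)
```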
@@ -483,8 +491,9 @@ def main(
    vectors_dir=None,
    use_oracle_segments=False,
):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm
    Token.set_extension("get_conllu_lines", method=get_token_conllu)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)

    Token.set_extension("get_conllu_lines", method=get_token_conllu)
    Token.set_extension("begins_fused", default=False)
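Several hunks in this commit drop the same workaround, a function-local import of tqdm referencing explosion/spaCy#4200, and rely on a plain module-level import again. A small illustrative sketch of the two variants (not taken from the repository):

```python
import tqdm  # module-level import: the style this commit restores


def count_module_level(items):
    # tqdm is imported once, when the module is loaded.
    return sum(1 for _ in tqdm.tqdm(items))


def count_with_deferred_import(items):
    # The temporary workaround being removed: import inside the function so
    # the module itself can be imported even if tqdm causes trouble at
    # import time.
    import tqdm
    return sum(1 for _ in tqdm.tqdm(items))


print(count_module_level(range(10)), count_with_deferred_import(range(10)))
```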
@@ -1,6 +1,7 @@
import logging
import random

from tqdm import tqdm
from collections import defaultdict

logger = logging.getLogger(__name__)
@@ -119,8 +120,6 @@ def get_eval_results(data, el_pipe=None):
    Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
    If the docs in the data require further processing with an entity linker, set el_pipe.
    """
    from tqdm import tqdm

    docs = []
    golds = []
    for d, g in tqdm(data, leave=False):
@@ -6,6 +6,7 @@ import bz2
import logging
import random
import json
from tqdm import tqdm

from functools import partial

@@ -457,9 +458,6 @@ def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None):
    """ This method provides training examples that correspond to the entity annotations found by the nlp object.
    For training, it will include both positive and negative examples by using the candidate generator from the kb.
    For testing (kb=None), it will include all positive examples only."""

    from tqdm import tqdm

    if not labels_discard:
        labels_discard = []

@@ -7,6 +7,7 @@ import attr
from pathlib import Path
import re
import json
import tqdm

import spacy
import spacy.util
@@ -291,11 +292,6 @@ def get_token_conllu(token, i):
    return "\n".join(lines)


Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
Token.set_extension("begins_fused", default=False, force=True)
Token.set_extension("inside_fused", default=False, force=True)


##################
# Initialization #
##################
@@ -394,8 +390,9 @@ class TreebankPaths(object):
    limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm
    Token.set_extension("get_conllu_lines", method=get_token_conllu)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)

    Token.set_extension("get_conllu_lines", method=get_token_conllu)
    Token.set_extension("begins_fused", default=False)
@@ -426,10 +423,7 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
    for batch in batches:
        pbar.update(sum(len(ex.doc) for ex in batch))
        nlp.update(
            examples=batch,
            sgd=optimizer,
            drop=config.dropout,
            losses=losses,
            examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
        )

    out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
@@ -8,8 +8,8 @@ For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

Compatible with: spaCy v2.2
Last tested with: v2.2
Compatible with: spaCy v2.2.3
Last tested with: v2.2.3
"""
from __future__ import unicode_literals, print_function

@@ -14,6 +14,7 @@ pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
"""
import plac
import tqdm
import random
import spacy
import thinc.extra.datasets
@@ -106,9 +107,6 @@ def create_pipeline(width, embed_size, vectors_model):


def train_tensorizer(nlp, texts, dropout, n_iter):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
@@ -122,9 +120,6 @@ def train_tensorizer(nlp, texts, dropout, n_iter):


def train_textcat(nlp, n_texts, n_iter=10):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
@@ -8,8 +8,8 @@ For more details, see the documentation:
* Training: https://spacy.io/usage/training
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

Compatible with: spaCy v2.2
Last tested with: v2.2
Compatible with: spaCy v2.2.3
Last tested with: v2.2.3
"""
from __future__ import unicode_literals, print_function

@@ -22,6 +22,7 @@ from spacy.vocab import Vocab

import spacy
from spacy.kb import KnowledgeBase
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.util import minibatch, compounding

@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
        nlp.vocab.vectors.name = "spacy_pretrained_vectors"
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        entity_linker = nlp.create_pipe("entity_linker")
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)
    else:
        entity_linker = nlp.get_pipe("entity_linker")
        kb = entity_linker.kb

    # make sure the annotated examples correspond to known identifiers in the knowledge base
    kb_ids = kb.get_entity_strings()
    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
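To try the new pipeline-construction steps above in isolation, here is a small sketch that exercises only the EntityRuler part (no knowledge base or entity linker required); the pattern and label are the ones used by the example, while the sample sentence is made up.

```python
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")

# Rule-based recognition of "Russ Cochran", as added in the hunk above.
ruler = EntityRuler(nlp)
ruler.add_patterns(
    [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
)
nlp.add_pipe(ruler)

doc = nlp("Russ Cochran published the comics.")
print([(ent.text, ent.label_) for ent in doc.ents])
# Expected: [('Russ Cochran', 'PERSON')]
```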
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                    print(
                        "Removed", kb_id, "from training because it is not in the KB."
                    )
            annotation["links"][offset] = new_dict
            annotation_clean["links"][offset] = new_dict
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                nlp.update(
                    batch,
@@ -136,16 +151,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):

def _apply_model(nlp):
    for text, annotation in TRAIN_DATA:
        doc = nlp.tokenizer(text)

        # set entities so the evaluation is independent of the NER step
        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
        rc_ent = Span(doc, 0, 2, label=PERSON)
        doc.ents = [rc_ent]

        # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
        doc = nlp.get_pipe("entity_linker")(doc)

        doc = nlp(text)
        print()
        print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])
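The training loop above batches TRAIN_DOCS with minibatch and a compounding size schedule. A standalone sketch of how that schedule behaves, using the same numbers as the hunk and a dummy list in place of the training docs:

```python
from spacy.util import minibatch, compounding

items = list(range(100))  # dummy stand-in for TRAIN_DOCS

# compounding(4.0, 32.0, 1.001) yields a slowly growing size, capped at 32,
# so early batches stay small while later ones get larger.
sizes = [len(batch) for batch in minibatch(items, size=compounding(4.0, 32.0, 1.001))]
print(sizes)
```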
@@ -8,6 +8,7 @@ from __future__ import unicode_literals

from os import path

import tqdm
import math
import numpy
import plac
@@ -35,9 +36,6 @@ from tensorflow.contrib.tensorboard.plugins.projector import (
    ),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    meta_file = "{}.tsv".format(name)
    out_meta_file = path.join(out_loc, meta_file)

@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
@@ -12,6 +12,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
pathlib==1.0.1; python_version < "3.4"
tqdm>=4.38.0,<5.0.0
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
# Development dependencies
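The pins above tighten thinc to a specific dev build and make tqdm a hard requirement. Purely as an illustration (not part of the diff), an installed environment can be checked against such pins with pkg_resources, which ships with setuptools:

```python
import pkg_resources

for requirement in ["thinc==7.4.0.dev0", "tqdm>=4.38.0,<5.0.0"]:
    try:
        pkg_resources.require(requirement)
        print(requirement, "satisfied")
    except (pkg_resources.DistributionNotFound, pkg_resources.VersionConflict) as err:
        print(requirement, "not satisfied:", err)
```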
@@ -38,13 +38,13 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    thinc>=7.3.0,<7.4.0
    thinc==7.4.0.dev0
install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=7.3.0,<7.4.0
    thinc==7.4.0.dev0
    blis>=0.4.0,<0.5.0
    wasabi>=0.4.0,<1.1.0
    srsly>=0.1.0,<1.1.0
@@ -73,7 +73,7 @@ cuda100 =
    cupy-cuda100>=5.0.0b4
# Language tokenizers with external dependencies
ja =
    mecab-python3==0.7
    fugashi>=0.1.3
ko =
    natto-py==0.9.0
th =
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "2.2.2"
__version__ = "2.2.3"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@ -7,8 +7,9 @@ from spacy.gold import Example
|
|||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
|
||||
ner_map=None, **_):
|
||||
def conllu2json(
|
||||
input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_
|
||||
):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
|
@ -29,13 +30,19 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
|
|||
has_ner_tags = False
|
||||
for i, example in enumerate(conll_data):
|
||||
if not checked_for_ner:
|
||||
has_ner_tags = is_ner(example.token_annotation.entities[0],
|
||||
MISC_NER_PATTERN)
|
||||
has_ner_tags = is_ner(
|
||||
example.token_annotation.entities[0], MISC_NER_PATTERN
|
||||
)
|
||||
checked_for_ner = True
|
||||
raw += example.text
|
||||
sentences.append(generate_sentence(example.token_annotation,
|
||||
has_ner_tags, MISC_NER_PATTERN,
|
||||
ner_map=ner_map))
|
||||
sentences.append(
|
||||
generate_sentence(
|
||||
example.token_annotation,
|
||||
has_ner_tags,
|
||||
MISC_NER_PATTERN,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
)
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
if len(sentences) % n_sents == 0:
|
||||
|
@ -105,8 +112,9 @@ def read_conllx(input_data, use_morphology=False, n=0):
|
|||
if space:
|
||||
raw += " "
|
||||
example = Example(doc=raw)
|
||||
example.set_token_annotation(ids=ids, words=words, tags=tags,
|
||||
heads=heads, deps=deps, entities=ents)
|
||||
example.set_token_annotation(
|
||||
ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents
|
||||
)
|
||||
yield example
|
||||
i += 1
|
||||
if 1 <= n <= i:
|
||||
|
@ -143,13 +151,11 @@ def extract_tags(iob, tag_pattern, ner_map=None):
|
|||
return new_iob
|
||||
|
||||
|
||||
def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
|
||||
ner_map=None):
|
||||
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
|
||||
sentence = {}
|
||||
tokens = []
|
||||
if has_ner_tags:
|
||||
iob = extract_tags(token_annotation.entities, tag_pattern,
|
||||
ner_map=ner_map)
|
||||
iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map)
|
||||
biluo = iob_to_biluo(iob)
|
||||
for i, id in enumerate(token_annotation.ids):
|
||||
token = {}
|
||||
|
|
|
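Since the converter's keyword arguments are spelled out above, a brief usage sketch may help; this is hedged — the import path and the input file name are assumptions, and the ner_map contents are purely illustrative:

from spacy.cli.converters import conllu2json  # assumed import path for spaCy v2.2

with open("train.conllu", encoding="utf8") as f:  # hypothetical input file
    input_data = f.read()

# n_sents groups sentences into pseudo-documents; ner_map renames
# corpus-specific entity labels before conversion (illustrative mapping).
json_docs = conllu2json(input_data, n_sents=10, use_morphology=False,
                        ner_map={"PER": "PERSON"})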
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
import plac
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
import numpy
|
||||
from ast import literal_eval
|
||||
from pathlib import Path
|
||||
|
@ -116,9 +117,6 @@ def open_file(loc):
|
|||
|
||||
|
||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
from tqdm import tqdm
|
||||
|
||||
if freqs_loc is not None:
|
||||
with msg.loading("Counting frequencies..."):
|
||||
probs, _ = read_freqs(freqs_loc)
|
||||
|
@ -201,9 +199,6 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
|||
|
||||
|
||||
def read_vectors(vectors_loc):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
from tqdm import tqdm
|
||||
|
||||
f = open_file(vectors_loc)
|
||||
shape = tuple(int(size) for size in next(f).split())
|
||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||
|
@ -220,9 +215,6 @@ def read_vectors(vectors_loc):
|
|||
|
||||
|
||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
from tqdm import tqdm
|
||||
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
with freqs_loc.open() as f:
|
||||
|
@ -252,9 +244,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
|
||||
|
||||
def read_clusters(clusters_loc):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
from tqdm import tqdm
|
||||
|
||||
clusters = {}
|
||||
if ftfy is None:
|
||||
user_warning(Warnings.W004)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
import cProfile
|
||||
|
@ -46,9 +47,6 @@ def profile(model, inputs=None, n_texts=10000):
|
|||
|
||||
|
||||
def parse_texts(nlp, texts):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
import tqdm
|
||||
|
||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||
pass
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals, division, print_function
|
|||
|
||||
import plac
|
||||
import os
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
from thinc.neural._classes.model import Model
|
||||
from timeit import default_timer as timer
|
||||
|
@ -88,10 +89,6 @@ def train(
|
|||
JSON format. To convert data from other formats, use the `spacy convert`
|
||||
command.
|
||||
"""
|
||||
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
import tqdm
|
||||
|
||||
util.fix_random_seed()
|
||||
util.set_env_log(verbose)
|
||||
|
||||
|
@ -524,9 +521,6 @@ def _score_for_model(meta):
|
|||
|
||||
@contextlib.contextmanager
|
||||
def _create_progress_bar(total):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
import tqdm
|
||||
|
||||
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
||||
yield
|
||||
else:
|
||||
|
|
|
@@ -53,7 +53,9 @@ class Warnings(object):
W009 = ("Custom factory '{name}' provided by entry points of another "
"package overwrites built-in factory.")
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
"limit anymore, so the max_length argument is now deprecated.")
"limit anymore, so the max_length argument is now deprecated. "
"If you did not specify this parameter, make sure you call the "
"constructor with named arguments instead of positional ones.")
W011 = ("It looks like you're calling displacy.serve from within a "
"Jupyter notebook or a similar environment. This likely means "
"you're already running a local web server, so there's no need to "
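The expanded W010 text asks users to pass options by keyword; a minimal sketch of the recommended call, assuming a loaded pipeline bound to nlp:

from spacy.matcher import PhraseMatcher

# With keyword arguments, the removed max_length slot cannot silently
# shift the meaning of later positional arguments.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("OBAMA", None, nlp("Barack Obama"))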
@ -72,7 +74,7 @@ class Warnings(object):
|
|||
"instead.")
|
||||
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
|
||||
"methods is and should be replaced with `exclude`. This makes it "
|
||||
"consistent with the other objects serializable.")
|
||||
"consistent with the other serializable objects.")
|
||||
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
|
||||
"being serialized or deserialized is deprecated. Please use the "
|
||||
"`exclude` argument instead. For example: exclude=['{arg}'].")
|
||||
|
@ -81,7 +83,8 @@ class Warnings(object):
|
|||
"Future versions may introduce a `n_process` argument for "
|
||||
"parallel inference via multiprocessing.")
|
||||
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
|
||||
"ignoring the duplicate entry.")
|
||||
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
|
||||
"previously loaded vectors. See Issue #3853.")
|
||||
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
|
||||
|
@ -101,6 +104,7 @@ class Warnings(object):
|
|||
"the Knowledge Base.")
|
||||
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
||||
"previous components in the pipeline declare that they assign it.")
|
||||
W026 = ("Unable to set all sentence boundaries from dependency parses.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -529,17 +533,19 @@ class Errors(object):
|
|||
E185 = ("Received invalid attribute in component attribute declaration: "
|
||||
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
|
||||
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
|
||||
E187 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
E187 = ("Only unicode strings are supported as labels.")
|
||||
E188 = ("Could not match the gold entity links to entities in the doc - "
|
||||
"make sure the gold EL data refers to valid results of the "
|
||||
"named entity recognizer in the `nlp` pipeline.")
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||
"'{token_attrs}'.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E998 = ("Can only create GoldParse's from Example's without a Doc, "
|
||||
"if get_gold_parses() is called with a Vocab object.")
|
||||
E999 = ("Encountered an unexpected format for the dictionary holding "
|
||||
"gold annotations: {gold_dict}")
|
||||
|
||||
|
||||
@add_codes
|
||||
class TempErrors(object):
|
||||
T003 = ("Resizing pretrained Tagger models is not currently supported.")
|
||||
|
|
|
@@ -1121,7 +1121,7 @@ cdef class GoldParse:
return not nonproj.is_nonproj_tree(self.heads)


def docs_to_json(docs, id=0):
def docs_to_json(docs, id=0, ner_missing_tag="O"):
"""Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command.
@@ -1139,7 +1139,7 @@ def docs_to_json(docs, id=0):
json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
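A short, hedged sketch of the new ner_missing_tag argument in use, assuming a loaded pipeline bound to nlp and a hypothetical output path:

import srsly
from spacy.gold import docs_to_json

docs = [nlp("Berlin is a city in Germany.")]
# Tokens outside every entity span get ner_missing_tag ("O" by default);
# passing "-" would mark them as missing instead.
srsly.write_json("train.json", [docs_to_json(docs, id=0, ner_missing_tag="O")])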
spacy/kb.pyx
@@ -136,19 +136,24 @@ cdef class KnowledgeBase:
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)

nr_entities = len(entity_list)
nr_entities = len(set(entity_list))
self._entry_index = PreshMap(nr_entities+1)
self._entries = entry_vec(nr_entities+1)

i = 0
cdef KBEntryC entry
cdef hash_t entity_hash
while i < nr_entities:
while i < len(entity_list):
# only process this entity if its unique ID hadn't been added before
entity_hash = self.vocab.strings.add(entity_list[i])
if entity_hash in self._entry_index:
user_warning(Warnings.W018.format(entity=entity_list[i]))

else:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))

entity_hash = self.vocab.strings.add(entity_list[i])
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
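A minimal sketch of what the deduplication change means for set_entities, assuming the v2.2 KnowledgeBase constructor and a throwaway Vocab:

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)
# "Q1" appears twice: the duplicate now triggers warning W018 and is
# skipped instead of inflating the entry count.
kb.set_entities(
    entity_list=["Q1", "Q2", "Q1"],
    freq_list=[10, 5, 10],
    vector_list=[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]],
)
assert kb.get_size_entities() == 2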
@ -31,6 +31,10 @@ _latin_u_supplement = r"\u00C0-\u00D6\u00D8-\u00DE"
|
|||
_latin_l_supplement = r"\u00DF-\u00F6\u00F8-\u00FF"
|
||||
_latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF"
|
||||
|
||||
_hangul_syllables = r"\uAC00-\uD7AF"
|
||||
_hangul_jamo = r"\u1100-\u11FF"
|
||||
_hangul = _hangul_syllables + _hangul_jamo
|
||||
|
||||
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
|
||||
_latin_u_extendedA = (
|
||||
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
|
||||
|
@ -202,7 +206,15 @@ _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian
|
|||
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
|
||||
|
||||
_uncased = (
|
||||
_bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
|
||||
_bengali
|
||||
+ _hebrew
|
||||
+ _persian
|
||||
+ _sinhala
|
||||
+ _hindi
|
||||
+ _kannada
|
||||
+ _tamil
|
||||
+ _telugu
|
||||
+ _hangul
|
||||
)
|
||||
|
||||
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import NOUN, PROPN, PART, INTJ, PRON
|
||||
from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
|
@ -4249,4 +4249,20 @@ TAG_MAP = {
|
|||
"Voice": "Act",
|
||||
"Case": "Nom|Gen|Dat|Acc|Voc",
|
||||
},
|
||||
'ADJ': {POS: ADJ},
|
||||
'ADP': {POS: ADP},
|
||||
'ADV': {POS: ADV},
|
||||
'AtDf': {POS: DET},
|
||||
'AUX': {POS: AUX},
|
||||
'CCONJ': {POS: CCONJ},
|
||||
'DET': {POS: DET},
|
||||
'NOUN': {POS: NOUN},
|
||||
'NUM': {POS: NUM},
|
||||
'PART': {POS: PART},
|
||||
'PRON': {POS: PRON},
|
||||
'PROPN': {POS: PROPN},
|
||||
'SCONJ': {POS: SCONJ},
|
||||
'SYM': {POS: SYM},
|
||||
'VERB': {POS: VERB},
|
||||
'X': {POS: X},
|
||||
}
|
||||
|
|
|
@ -305,6 +305,9 @@ TAG_MAP = {
|
|||
"VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", POS: VERB},
|
||||
"VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", POS: VERB},
|
||||
"X___": {"morph": "_", POS: X},
|
||||
"___PunctType=Quot": {POS: PUNCT},
|
||||
"___VerbForm=Inf": {POS: VERB},
|
||||
"___Number=Sing|Person=2|PronType=Prs": {POS: PRON},
|
||||
"_SP": {"morph": "_", POS: SPACE},
|
||||
}
|
||||
# fmt: on
|
||||
|
|
|
@ -3,6 +3,8 @@ from __future__ import unicode_literals
|
|||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
|
@ -13,10 +15,13 @@ from ...util import update_exc, add_lookups
|
|||
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "fi"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
|
@ -18,7 +18,8 @@ _num_words = [
|
|||
"kymmenen",
|
||||
"yksitoista",
|
||||
"kaksitoista",
|
||||
"kolmetoista" "neljätoista",
|
||||
"kolmetoista",
|
||||
"neljätoista",
|
||||
"viisitoista",
|
||||
"kuusitoista",
|
||||
"seitsemäntoista",
|
||||
|
|
spacy/lang/fi/punctuation.py (new file)
@@ -0,0 +1,33 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES


_quotes = CONCAT_QUOTES.replace("'", "")

_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

_suffixes = [
suffix
for suffix in TOKENIZER_SUFFIXES
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
]


TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
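To illustrate what one of the Finnish infix patterns above does, here is a standalone sketch; ALPHA is approximated with ASCII letters, whereas the real pattern is built from spaCy's Unicode character classes:

import re

ALPHA = "A-Za-z"  # simplified stand-in for spaCy's ALPHA class
infix = r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA)

# The infix rule lets the tokenizer split "x<y" into "x", "<", "y".
print(re.split("({})".format(infix), "x<y"))  # ['x', '<', 'y']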
@@ -5,7 +5,7 @@ from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA


ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
ELISION = " ' ’ ".strip().replace(" ", "")


_infixes = TOKENIZER_INFIXES + [
@ -12,21 +12,23 @@ from ...tokens import Doc
|
|||
from ...compat import copy_reg
|
||||
from ...util import DummyTokenizer
|
||||
|
||||
# Handling for multiple spaces in a row is somewhat awkward, this simplifies
|
||||
# the flow by creating a dummy with the same interface.
|
||||
DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
|
||||
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
|
||||
DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
|
||||
|
||||
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
|
||||
|
||||
|
||||
def try_mecab_import():
|
||||
"""Mecab is required for Japanese support, so check for it.
|
||||
def try_fugashi_import():
|
||||
"""Fugashi is required for Japanese support, so check for it.
|
||||
If it's not available, blow up and explain how to fix it."""
|
||||
try:
|
||||
import MeCab
|
||||
import fugashi
|
||||
|
||||
return MeCab
|
||||
return fugashi
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Japanese support requires MeCab: "
|
||||
"https://github.com/SamuraiT/mecab-python3"
|
||||
"Japanese support requires Fugashi: "
|
||||
"https://github.com/polm/fugashi"
|
||||
)
|
||||
|
||||
|
||||
|
@ -39,7 +41,7 @@ def resolve_pos(token):
|
|||
"""
|
||||
|
||||
# this is only used for consecutive ascii spaces
|
||||
if token.pos == "空白":
|
||||
if token.surface == " ":
|
||||
return "空白"
|
||||
|
||||
# TODO: This is a first take. The rules here are crude approximations.
|
||||
|
@ -53,55 +55,45 @@ def resolve_pos(token):
|
|||
return token.pos + ",ADJ"
|
||||
return token.pos
|
||||
|
||||
def get_words_and_spaces(tokenizer, text):
|
||||
"""Get the individual tokens that make up the sentence and handle white space.
|
||||
|
||||
Japanese doesn't usually use white space, and MeCab's handling of it for
|
||||
multiple spaces in a row is somewhat awkward.
|
||||
"""
|
||||
|
||||
tokens = tokenizer.parseToNodeList(text)
|
||||
|
||||
def detailed_tokens(tokenizer, text):
|
||||
"""Format Mecab output into a nice data structure, based on Janome."""
|
||||
node = tokenizer.parseToNode(text)
|
||||
node = node.next # first node is beginning of sentence and empty, skip it
|
||||
words = []
|
||||
spaces = []
|
||||
while node.posid != 0:
|
||||
surface = node.surface
|
||||
base = surface # a default value. Updated if available later.
|
||||
parts = node.feature.split(",")
|
||||
pos = ",".join(parts[0:4])
|
||||
if len(parts) > 7:
|
||||
# this information is only available for words in the tokenizer
|
||||
# dictionary
|
||||
base = parts[7]
|
||||
words.append(ShortUnitWord(surface, base, pos))
|
||||
|
||||
# The way MeCab stores spaces is that the rlength of the next token is
|
||||
# the length of that token plus any preceding whitespace, **in bytes**.
|
||||
# also note that this is only for half-width / ascii spaces. Full width
|
||||
# spaces just become tokens.
|
||||
scount = node.next.rlength - node.next.length
|
||||
spaces.append(bool(scount))
|
||||
while scount > 1:
|
||||
words.append(ShortUnitWord(" ", " ", "空白"))
|
||||
for token in tokens:
|
||||
# If there's more than one space, spaces after the first become tokens
|
||||
for ii in range(len(token.white_space) - 1):
|
||||
words.append(DummySpace)
|
||||
spaces.append(False)
|
||||
scount -= 1
|
||||
|
||||
node = node.next
|
||||
words.append(token)
|
||||
spaces.append(bool(token.white_space))
|
||||
return words, spaces
|
||||
|
||||
|
||||
class JapaneseTokenizer(DummyTokenizer):
|
||||
def __init__(self, cls, nlp=None):
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
self.tokenizer = try_mecab_import().Tagger()
|
||||
self.tokenizer.parseToNode("") # see #2901
|
||||
self.tokenizer = try_fugashi_import().Tagger()
|
||||
self.tokenizer.parseToNodeList("") # see #2901
|
||||
|
||||
def __call__(self, text):
|
||||
dtokens, spaces = detailed_tokens(self.tokenizer, text)
|
||||
dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
|
||||
words = [x.surface for x in dtokens]
|
||||
doc = Doc(self.vocab, words=words, spaces=spaces)
|
||||
mecab_tags = []
|
||||
unidic_tags = []
|
||||
for token, dtoken in zip(doc, dtokens):
|
||||
mecab_tags.append(dtoken.pos)
|
||||
unidic_tags.append(dtoken.pos)
|
||||
token.tag_ = resolve_pos(dtoken)
|
||||
token.lemma_ = dtoken.lemma
|
||||
doc.user_data["mecab_tags"] = mecab_tags
|
||||
|
||||
# if there's no lemma info (it's an unk) just use the surface
|
||||
token.lemma_ = dtoken.feature.lemma or dtoken.surface
|
||||
doc.user_data["unidic_tags"] = unidic_tags
|
||||
return doc
|
||||
|
||||
|
||||
|
@ -131,5 +123,4 @@ def pickle_japanese(instance):
|
|||
|
||||
copy_reg.pickle(Japanese, pickle_japanese)
|
||||
|
||||
|
||||
__all__ = ["Japanese"]
|
||||
|
|
spacy/lang/ko/lex_attrs.py (new file)
@ -0,0 +1,67 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"영",
|
||||
"공",
|
||||
# Native Korean number system
|
||||
"하나",
|
||||
"둘",
|
||||
"셋",
|
||||
"넷",
|
||||
"다섯",
|
||||
"여섯",
|
||||
"일곱",
|
||||
"여덟",
|
||||
"아홉",
|
||||
"열",
|
||||
"스물",
|
||||
"서른",
|
||||
"마흔",
|
||||
"쉰",
|
||||
"예순",
|
||||
"일흔",
|
||||
"여든",
|
||||
"아흔",
|
||||
# Sino-Korean number system
|
||||
"일",
|
||||
"이",
|
||||
"삼",
|
||||
"사",
|
||||
"오",
|
||||
"육",
|
||||
"칠",
|
||||
"팔",
|
||||
"구",
|
||||
"십",
|
||||
"백",
|
||||
"천",
|
||||
"만",
|
||||
"십만",
|
||||
"백만",
|
||||
"천만",
|
||||
"일억",
|
||||
"십억",
|
||||
"백억",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if any(char.lower() in _num_words for char in text):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
|
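A few hedged examples of what the Korean like_num above accepts, assuming the module path shown in the file header:

from spacy.lang.ko.lex_attrs import like_num

assert like_num("2,000")    # digits with separators
assert like_num("1/2")      # simple fractions
assert like_num("십만")      # contains a Sino-Korean numeral
assert not like_num("서울")  # ordinary word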
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -24,6 +25,7 @@ class LuxembourgishDefaults(Language.Defaults):
|
|||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Luxembourgish(Language):
|
||||
|
|
|
@@ -6,7 +6,7 @@ from __future__ import unicode_literals
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes

_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
_exc = {"dass": "datt", "viläicht": "vläicht"}


NORM_EXCEPTIONS = {}
spacy/lang/lb/punctuation.py (new file)
@@ -0,0 +1,23 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

ELISION = " ' ’ ".strip().replace(" ", "")

_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[0-9])-(?=[0-9])",
]
)

TOKENIZER_INFIXES = _infixes
@ -2,33 +2,17 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ..punctuation import TOKENIZER_PREFIXES
|
||||
|
||||
# TODO
|
||||
# tokenize cliticised definite article "d'" as token of its own: d'Kanner > [d'] [Kanner]
|
||||
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
|
||||
|
||||
# how to write the tokenisation exeption for the articles d' / D' ? This one is not working.
|
||||
_prefixes = [
|
||||
prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’"]
|
||||
]
|
||||
|
||||
|
||||
_exc = {
|
||||
"d'mannst": [
|
||||
{ORTH: "d'", LEMMA: "d'"},
|
||||
{ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
|
||||
],
|
||||
"d'éischt": [
|
||||
{ORTH: "d'", LEMMA: "d'"},
|
||||
{ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
|
||||
],
|
||||
}
|
||||
_exc = {}
|
||||
|
||||
# translate / delete what is not necessary
|
||||
# what does PRON_LEMMA mean?
|
||||
for exc_data in [
|
||||
{ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
|
||||
{ORTH: "'t", LEMMA: "et", NORM: "et"},
|
||||
{ORTH: "'T", LEMMA: "et", NORM: "et"},
|
||||
{ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
|
||||
{ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
|
||||
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
|
||||
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
|
||||
|
@ -36,7 +20,7 @@ for exc_data in [
|
|||
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
|
||||
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
|
||||
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
@ -64,6 +48,4 @@ for orth in [
|
|||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X
|
||||
from ...symbols import VERB, NOUN, PROPN, PART, INTJ, PRON, AUX
|
||||
|
||||
|
||||
# Tags are a combination of POS and morphological features from a yet
|
||||
# unpublished dataset developed by Schibsted, Nasjonalbiblioteket and LTG. The
|
||||
# Tags are a combination of POS and morphological features from a
|
||||
# https://github.com/ltgoslo/norne developed by Schibsted, Nasjonalbiblioteket and LTG. The
|
||||
# data format is .conllu and follows the Universal Dependencies annotation.
|
||||
# (There are some annotation differences compared to this dataset:
|
||||
# https://github.com/UniversalDependencies/UD_Norwegian-Bokmaal
|
||||
|
@ -467,4 +467,97 @@ TAG_MAP = {
|
|||
"VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB},
|
||||
"VERB___": {"morph": "_", POS: VERB},
|
||||
"X___": {"morph": "_", POS: X},
|
||||
'CCONJ___': {"morph": "_", POS: CCONJ},
|
||||
"ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ},
|
||||
"ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ},
|
||||
"ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
|
||||
"ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
|
||||
"ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ},
|
||||
"ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ},
|
||||
"ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ},
|
||||
"ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ},
|
||||
"ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ},
|
||||
"ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP},
|
||||
"ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV},
|
||||
"DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET},
|
||||
"DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET},
|
||||
"DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET},
|
||||
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
|
||||
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET},
|
||||
"DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET},
|
||||
"DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
|
||||
"DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET},
|
||||
"DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET},
|
||||
"DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET},
|
||||
"DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
|
||||
"DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET},
|
||||
"DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET},
|
||||
"DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET},
|
||||
"DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET},
|
||||
"DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET},
|
||||
"DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET},
|
||||
"DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET},
|
||||
"DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET},
|
||||
"DET__Number=Plur|PronType=Tot": {"morph": "Number=Plur|PronType=Tot", POS: DET},
|
||||
"DET__PronType=Ind": {"morph": "PronType=Ind", POS: DET},
|
||||
"DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET},
|
||||
"NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN},
|
||||
"NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN},
|
||||
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN},
|
||||
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN},
|
||||
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN},
|
||||
"NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN},
|
||||
"NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM},
|
||||
"NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM},
|
||||
"NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM},
|
||||
"NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM},
|
||||
"NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM},
|
||||
"NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM},
|
||||
"NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM},
|
||||
"NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM},
|
||||
"NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM},
|
||||
"PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART},
|
||||
"PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON},
|
||||
"PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON},
|
||||
"PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON},
|
||||
"PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON},
|
||||
"PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON},
|
||||
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
|
||||
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
|
||||
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON},
|
||||
"PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
|
||||
"PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
|
||||
"PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
|
||||
"PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
|
||||
"PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
|
||||
"PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON},
|
||||
"PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON},
|
||||
"PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON},
|
||||
"PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON},
|
||||
"PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON},
|
||||
"PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON},
|
||||
"PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON},
|
||||
"PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN},
|
||||
"PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN},
|
||||
"VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB},
|
||||
"VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB},
|
||||
}
|
||||
|
|
|
@ -5039,5 +5039,19 @@ TAG_MAP = {
|
|||
"punc": {POS: PUNCT},
|
||||
"v-pcp|M|P": {POS: VERB},
|
||||
"v-pcp|M|S": {POS: VERB},
|
||||
"ADJ": {POS: ADJ},
|
||||
"AUX": {POS: AUX},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"DET": {POS: DET},
|
||||
"INTJ": {POS: INTJ},
|
||||
"NUM": {POS: NUM},
|
||||
"PART": {POS: PART},
|
||||
"PRON": {POS: PRON},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"SYM": {POS: SYM},
|
||||
"VERB": {POS: VERB},
|
||||
"X": {POS: X},
|
||||
"adv": {POS: ADV},
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
spacy/lang/yo/__init__.py (new file)
@ -0,0 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
|
||||
|
||||
class YorubaDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "yo"
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
|
||||
|
||||
class Yoruba(Language):
|
||||
lang = "yo"
|
||||
Defaults = YorubaDefaults
|
||||
|
||||
|
||||
__all__ = ["Yoruba"]
|
spacy/lang/yo/examples.py (new file)
@ -0,0 +1,26 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.yo.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
# 1. https://yo.wikipedia.org/wiki/Wikipedia:%C3%80y%E1%BB%8Dk%C3%A0_p%C3%A0t%C3%A0k%C3%AC
|
||||
# 2.https://yo.wikipedia.org/wiki/Oj%C3%BAew%C3%A9_%C3%80k%E1%BB%8D%CC%81k%E1%BB%8D%CC%81
|
||||
# 3. https://www.bbc.com/yoruba
|
||||
|
||||
sentences = [
|
||||
"Ìjọba Tanzania fi Ajìjàgbara Ọmọ Orílẹ̀-èdèe Uganda sí àtìmọ́lé",
|
||||
"Olúṣẹ́gun Ọbásanjọ́, tí ó jẹ́ Ààrẹ ìjọba ológun àná (láti ọdún 1976 sí 1979), tí ó sì tún ṣe Ààrẹ ìjọba alágbádá tí ìbò gbé wọlé (ní ọdún 1999 sí 2007), kúndùn láti máa bu ẹnu àtẹ́ lu àwọn "
|
||||
"ètò ìjọba Ààrẹ orílẹ̀-èdè Nàìjíríà tí ó jẹ tẹ̀lé e.",
|
||||
"Akin Alabi rán ẹnu mọ́ agbárá Adárí Òsìsẹ̀, àwọn ọmọ Nàìjíríà dẹnu bò ó",
|
||||
"Ta ló leè dúró s'ẹ́gbẹ̀ẹ́ Okunnu láì rẹ́rìín?",
|
||||
"Dídarapọ̀ mọ́n ìpolongo",
|
||||
"Bi a se n so, omobinrin ni oruko ni ojo kejo bee naa ni omokunrin ni oruko ni ojo kesan.",
|
||||
"Oríṣìíríṣìí nǹkan ló le yọrí sí orúkọ tí a sọ ọmọ",
|
||||
"Gbogbo won ni won ni oriki ti won",
|
||||
]
|
spacy/lang/yo/lex_attrs.py (new file)
@ -0,0 +1,115 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import unicodedata
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"ení",
|
||||
"oókàn",
|
||||
"ọ̀kanlá",
|
||||
"ẹ́ẹdọ́gbọ̀n",
|
||||
"àádọ́fà",
|
||||
"ẹ̀walélúɡba",
|
||||
"egbèje",
|
||||
"ẹgbàárin",
|
||||
"èjì",
|
||||
"eéjì",
|
||||
"èjìlá",
|
||||
"ọgbọ̀n,",
|
||||
"ọgọ́fà",
|
||||
"ọ̀ọ́dúrún",
|
||||
"ẹgbẹ̀jọ",
|
||||
"ẹ̀ẹ́dẹ́ɡbàárùn",
|
||||
"ẹ̀ta",
|
||||
"ẹẹ́ta",
|
||||
"ẹ̀talá",
|
||||
"aárùndílogójì",
|
||||
"àádóje",
|
||||
"irinwó",
|
||||
"ẹgbẹ̀sàn",
|
||||
"ẹgbàárùn",
|
||||
"ẹ̀rin",
|
||||
"ẹẹ́rin",
|
||||
"ẹ̀rinlá",
|
||||
"ogójì",
|
||||
"ogóje",
|
||||
"ẹ̀ẹ́dẹ́gbẹ̀ta",
|
||||
"ẹgbàá",
|
||||
"ẹgbàájọ",
|
||||
"àrún",
|
||||
"aárùn",
|
||||
"ẹ́ẹdógún",
|
||||
"àádọ́ta",
|
||||
"àádọ́jọ",
|
||||
"ẹgbẹ̀ta",
|
||||
"ẹgboókànlá",
|
||||
"ẹgbàawǎ",
|
||||
"ẹ̀fà",
|
||||
"ẹẹ́fà",
|
||||
"ẹẹ́rìndílógún",
|
||||
"ọgọ́ta",
|
||||
"ọgọ́jọ",
|
||||
"ọ̀ọ́dẹ́gbẹ̀rin",
|
||||
"ẹgbẹ́ẹdógún",
|
||||
"ọkẹ́marun",
|
||||
"èje",
|
||||
"etàdílógún",
|
||||
"àádọ́rin",
|
||||
"àádọ́sán",
|
||||
"ẹgbẹ̀rin",
|
||||
"ẹgbàajì",
|
||||
"ẹgbẹ̀ẹgbẹ̀rún",
|
||||
"ẹ̀jọ",
|
||||
"ẹẹ́jọ",
|
||||
"eéjìdílógún",
|
||||
"ọgọ́rin",
|
||||
"ọgọsàn",
|
||||
"ẹ̀ẹ́dẹ́gbẹ̀rún",
|
||||
"ẹgbẹ́ẹdọ́gbọ̀n",
|
||||
"ọgọ́rùn ọkẹ́",
|
||||
"ẹ̀sán",
|
||||
"ẹẹ́sàn",
|
||||
"oókàndílógún",
|
||||
"àádọ́rùn",
|
||||
"ẹ̀wadilúɡba",
|
||||
"ẹgbẹ̀rún",
|
||||
"ẹgbàáta",
|
||||
"ẹ̀wá",
|
||||
"ẹẹ́wàá",
|
||||
"ogún",
|
||||
"ọgọ́rùn",
|
||||
"igba",
|
||||
"ẹgbẹ̀fà",
|
||||
"ẹ̀ẹ́dẹ́ɡbarin",
|
||||
]
|
||||
|
||||
|
||||
def strip_accents_text(text):
|
||||
"""
|
||||
Converts the string to NFD, separates & returns only the base characters
|
||||
:param text:
|
||||
:return: input string without diacritic adornments on base characters
|
||||
"""
|
||||
return "".join(
|
||||
c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn"
|
||||
)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
num_markers = ["dí", "dọ", "lé", "dín", "di", "din", "le", "do"]
|
||||
if any(mark in text for mark in num_markers):
|
||||
return True
|
||||
text = strip_accents_text(text)
|
||||
_num_words_stripped = [strip_accents_text(num) for num in _num_words]
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text in _num_words_stripped or text.lower() in _num_words_stripped:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
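A brief sketch of how strip_accents_text feeds into the Yoruba like_num check above, assuming the module path shown in the file header:

from spacy.lang.yo.lex_attrs import like_num, strip_accents_text

# NFD-decompose and drop combining marks, so tone-marked and bare
# spellings compare equal.
assert strip_accents_text("ogójì") == "ogoji"
assert like_num("ogójì")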
spacy/lang/yo/stop_words.py (new file)
@ -0,0 +1,12 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# stop words as whitespace-separated list.
|
||||
# Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt
|
||||
|
||||
STOP_WORDS = set(
|
||||
"a an b bá bí bẹ̀rẹ̀ d e f fún fẹ́ g gbogbo i inú j jù jẹ jẹ́ k kan kì kí kò "
|
||||
"l láti lè lọ m mi mo máa mọ̀ n ni náà ní nígbà nítorí nǹkan o p padà pé "
|
||||
"púpọ̀ pẹ̀lú r rẹ̀ s sì sí sínú t ti tí u w wà wá wọn wọ́n y yìí à àti àwọn á "
|
||||
"è é ì í ò òun ó ù ú ń ńlá ǹ ̀ ́ ̣ ṣ ṣe ṣé ṣùgbọ́n ẹ ẹmọ́ ọ ọjọ́ ọ̀pọ̀lọpọ̀".split()
|
||||
)
|
|
@ -4,19 +4,95 @@ from __future__ import unicode_literals
|
|||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
|
||||
|
||||
def try_jieba_import(use_jieba):
|
||||
try:
|
||||
import jieba
|
||||
|
||||
return jieba
|
||||
except ImportError:
|
||||
if use_jieba:
|
||||
msg = (
|
||||
"Jieba not installed. Either set Chinese.use_jieba = False, "
|
||||
"or install it https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg)
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(self, cls, nlp=None):
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
self.use_jieba = cls.use_jieba
|
||||
self.jieba_seg = try_jieba_import(self.use_jieba)
|
||||
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
|
||||
|
||||
def __call__(self, text):
|
||||
# use jieba
|
||||
if self.use_jieba:
|
||||
jieba_words = list(
|
||||
[x for x in self.jieba_seg.cut(text, cut_all=False) if x]
|
||||
)
|
||||
words = [jieba_words[0]]
|
||||
spaces = [False]
|
||||
for i in range(1, len(jieba_words)):
|
||||
word = jieba_words[i]
|
||||
if word.isspace():
|
||||
# second token in adjacent whitespace following a
|
||||
# non-space token
|
||||
if spaces[-1]:
|
||||
words.append(word)
|
||||
spaces.append(False)
|
||||
# first space token following non-space token
|
||||
elif word == " " and not words[-1].isspace():
|
||||
spaces[-1] = True
|
||||
# token is non-space whitespace or any whitespace following
|
||||
# a whitespace token
|
||||
else:
|
||||
# extend previous whitespace token with more whitespace
|
||||
if words[-1].isspace():
|
||||
words[-1] += word
|
||||
# otherwise it's a new whitespace token
|
||||
else:
|
||||
words.append(word)
|
||||
spaces.append(False)
|
||||
else:
|
||||
words.append(word)
|
||||
spaces.append(False)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
# split into individual characters
|
||||
words = []
|
||||
spaces = []
|
||||
for token in self.tokenizer(text):
|
||||
if token.text.isspace():
|
||||
words.append(token.text)
|
||||
spaces.append(False)
|
||||
else:
|
||||
words.extend(list(token.text))
|
||||
spaces.extend([False] * len(token.text))
|
||||
spaces[-1] = bool(token.whitespace_)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class ChineseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "zh"
|
||||
use_jieba = True
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
use_jieba = True
|
||||
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None):
|
||||
return ChineseTokenizer(cls, nlp)
|
||||
|
||||
|
||||
class Chinese(Language):
|
||||
|
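The whitespace handling in ChineseTokenizer.__call__ above has to reconcile jieba's segments with spaCy's words/spaces representation; a standalone sketch of the same merging rule (hypothetical helper name, toy input):

def align_words_and_spaces(segments):
    # First segment seeds the lists; a single ASCII space after a non-space
    # token becomes spaces=True, any other whitespace stays a token of its own.
    words, spaces = [segments[0]], [False]
    for word in segments[1:]:
        if word.isspace():
            if spaces[-1]:
                words.append(word)
                spaces.append(False)
            elif word == " " and not words[-1].isspace():
                spaces[-1] = True
            elif words[-1].isspace():
                words[-1] += word
            else:
                words.append(word)
                spaces.append(False)
        else:
            words.append(word)
            spaces.append(False)
    return words, spaces


print(align_words_and_spaces(["我", " ", "喜欢", "  ", "你"]))
# -> (['我', '喜欢', '  ', '你'], [True, False, False, False])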
@ -24,26 +100,7 @@ class Chinese(Language):
|
|||
Defaults = ChineseDefaults # override defaults
|
||||
|
||||
def make_doc(self, text):
|
||||
if self.Defaults.use_jieba:
|
||||
try:
|
||||
import jieba
|
||||
except ImportError:
|
||||
msg = (
|
||||
"Jieba not installed. Either set Chinese.use_jieba = False, "
|
||||
"or install it https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg)
|
||||
words = list(jieba.cut(text, cut_all=False))
|
||||
words = [x for x in words if x]
|
||||
return Doc(self.vocab, words=words, spaces=[False] * len(words))
|
||||
else:
|
||||
words = []
|
||||
spaces = []
|
||||
for token in self.tokenizer(text):
|
||||
words.extend(list(token.text))
|
||||
spaces.extend([False] * len(token.text))
|
||||
spaces[-1] = bool(token.whitespace_)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
return self.tokenizer(text)
|
||||
|
||||
|
||||
__all__ = ["Chinese"]
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import NOUN, PART, INTJ, PRON
|
||||
from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
|
||||
from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
|
||||
|
||||
# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
|
||||
# We also map the tags to the simpler Google Universal POS tag set.
|
||||
# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
|
||||
# Treebank tag set. We also map the tags to the simpler Universal Dependencies
|
||||
# v2 tag set.
|
||||
|
||||
TAG_MAP = {
|
||||
"AS": {POS: PART},
|
||||
|
@ -38,10 +39,11 @@ TAG_MAP = {
|
|||
"OD": {POS: NUM},
|
||||
"DT": {POS: DET},
|
||||
"CC": {POS: CCONJ},
|
||||
"CS": {POS: CONJ},
|
||||
"CS": {POS: SCONJ},
|
||||
"AD": {POS: ADV},
|
||||
"JJ": {POS: ADJ},
|
||||
"P": {POS: ADP},
|
||||
"PN": {POS: PRON},
|
||||
"PU": {POS: PUNCT},
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
|
@@ -650,7 +650,7 @@ class Language(object):
kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size)
if not hasattr(pipe, "pipe"):
examples = _pipe(pipe, examples, kwargs)
examples = _pipe(examples, pipe, kwargs)
else:
examples = pipe.pipe(examples, as_example=True, **kwargs)
for ex in examples:
|
@@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store):
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
elif isinstance(value, (dict, int)):
elif isinstance(value, int):
pass
elif isinstance(value, dict):
continue
else:
raise ValueError(Errors.E153.format(vtype=type(value).__name__))
|
@ -292,13 +292,14 @@ class EntityRuler(object):
|
|||
self.add_patterns(patterns)
|
||||
else:
|
||||
cfg = {}
|
||||
deserializers = {
|
||||
deserializers_patterns = {
|
||||
"patterns": lambda p: self.add_patterns(
|
||||
srsly.read_jsonl(p.with_suffix(".jsonl"))
|
||||
),
|
||||
"cfg": lambda p: cfg.update(srsly.read_json(p)),
|
||||
)}
|
||||
deserializers_cfg = {
|
||||
"cfg": lambda p: cfg.update(srsly.read_json(p))
|
||||
}
|
||||
from_disk(path, deserializers, {})
|
||||
from_disk(path, deserializers_cfg, {})
|
||||
self.overwrite = cfg.get("overwrite", False)
|
||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
|
||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||
|
@ -307,6 +308,7 @@ class EntityRuler(object):
|
|||
self.phrase_matcher = PhraseMatcher(
|
||||
self.nlp.vocab, attr=self.phrase_matcher_attr
|
||||
)
|
||||
from_disk(path, deserializers_patterns, {})
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **kwargs):
|
||||
|
|
|
@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
|
|||
from thinc.neural.util import to_categorical
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
from spacy.gold import Example
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..syntax.nn_parser cimport Parser
|
||||
from ..syntax.ner cimport BiluoPushDown
|
||||
|
@ -24,6 +23,8 @@ from ..vocab cimport Vocab
|
|||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
from ..gold import Example
|
||||
from ..compat import basestring_
|
||||
from ..attrs import POS, ID
|
||||
from ..parts_of_speech import X
|
||||
from ..kb import KnowledgeBase
|
||||
|
@ -593,6 +594,8 @@ class Tagger(Pipe):
|
|||
return build_tagger_model(n_tags, **cfg)
|
||||
|
||||
def add_label(self, label, values=None):
|
||||
if not isinstance(label, basestring_):
|
||||
raise ValueError(Errors.E187)
|
||||
if label in self.labels:
|
||||
return 0
|
||||
if self.model not in (True, False, None):
|
||||
|
@ -1238,6 +1241,8 @@ class TextCategorizer(Pipe):
|
|||
return float(mean_square_error), d_scores
|
||||
|
||||
def add_label(self, label):
|
||||
if not isinstance(label, basestring_):
|
||||
raise ValueError(Errors.E187)
|
||||
if label in self.labels:
|
||||
return 0
|
||||
if self.model not in (None, True, False):
|
||||
|
@ -1358,7 +1363,7 @@ cdef class EntityRecognizer(Parser):
|
|||
|
||||
@component(
|
||||
"entity_linker",
|
||||
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||
assigns=["token.ent_kb_id"]
|
||||
)
|
||||
class EntityLinker(Pipe):
|
||||
|
@ -1429,13 +1434,20 @@ class EntityLinker(Pipe):
|
|||
for entity, kb_dict in gold.links.items():
|
||||
start, end = entity
|
||||
mention = doc.text[start:end]
|
||||
|
||||
# the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
|
||||
if not (start, end) in ents_by_offset:
|
||||
raise RuntimeError(Errors.E188)
|
||||
ent = ents_by_offset[(start, end)]
|
||||
|
||||
for kb_id, value in kb_dict.items():
|
||||
# Currently only training on the positive instances - we assume there is at least 1 per doc/gold
|
||||
if value:
|
||||
try:
|
||||
sentence_docs.append(ent.sent.as_doc())
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030)
|
||||
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
|
||||
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
|
||||
|
@ -1523,7 +1535,7 @@ class EntityLinker(Pipe):
|
|||
if len(doc) > 0:
|
||||
# Looping through each sentence and each entity
|
||||
# This may go wrong if there are entities across sentences - because they might not get a KB ID
|
||||
for sent in doc.ents:
|
||||
for sent in doc.sents:
|
||||
sent_doc = sent.as_doc()
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
sentence_encoding = self.model([sent_doc])[0]
|
||||
|
@ -1704,6 +1716,55 @@ class Sentencizer(Pipe):
|
|||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
docs = list(docs)
|
||||
tag_ids = self.predict(docs)
|
||||
self.set_annotations(docs, tag_ids)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
"""Apply the pipeline's model to a batch of docs, without
|
||||
modifying them.
|
||||
"""
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
guesses = [[] for doc in docs]
|
||||
return guesses
|
||||
guesses = []
|
||||
for doc in docs:
|
||||
start = 0
|
||||
seen_period = False
|
||||
doc_guesses = [False] * len(doc)
|
||||
doc_guesses[0] = True
|
||||
for i, token in enumerate(doc):
|
||||
is_in_punct_chars = token.text in self.punct_chars
|
||||
if seen_period and not token.is_punct and not is_in_punct_chars:
|
||||
doc_guesses[start] = True
|
||||
start = token.i
|
||||
seen_period = False
|
||||
elif is_in_punct_chars:
|
||||
seen_period = True
|
||||
if start < len(doc):
|
||||
doc_guesses[start] = True
|
||||
guesses.append(doc_guesses)
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids, tensors=None):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef int idx = 0
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
# Don't clobber existing sentence boundaries
|
||||
if doc.c[j].sent_start == 0:
|
||||
if tag_id:
|
||||
doc.c[j].sent_start = 1
|
||||
else:
|
||||
doc.c[j].sent_start = -1
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
"""Serialize the sentencizer to a bytestring.
|
||||
|
||||
|
|
|
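The Sentencizer.predict logic above reduces to a simple rule; a pure-Python sketch of it on a list of token strings, omitting the token.is_punct check for brevity:

def guess_sentence_starts(tokens, punct_chars=(".", "!", "?")):
    # The first token starts a sentence, and so does the first token that
    # follows a run of sentence-final punctuation.
    guesses = [False] * len(tokens)
    guesses[0] = True
    seen_period = False
    start = 0
    for i, token in enumerate(tokens):
        if seen_period and token not in punct_chars:
            guesses[start] = True
            start = i
            seen_period = False
        elif token in punct_chars:
            seen_period = True
    if start < len(tokens):
        guesses[start] = True
    return guesses


print(guess_sentence_starts(["Hello", "world", ".", "This", "works", "!"]))
# -> [True, False, False, True, False, False]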
@ -269,7 +269,9 @@ class Scorer(object):
|
|||
gold_tags = set()
|
||||
gold_sent_starts = set()
|
||||
gold_ents = set(tags_to_entities(orig.entities))
|
||||
for id_, tag, head, dep, sent_start in zip(orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts):
|
||||
for id_, tag, head, dep, sent_start in zip(
|
||||
orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts
|
||||
):
|
||||
gold_tags.add((id_, tag))
|
||||
if sent_start:
|
||||
gold_sent_starts.add(id_)
|
||||
|
@ -308,8 +310,10 @@ class Scorer(object):
|
|||
self.labelled_per_dep[token.dep_.lower()] = PRFScore()
|
||||
if token.dep_.lower() not in cand_deps_per_dep:
|
||||
cand_deps_per_dep[token.dep_.lower()] = set()
|
||||
cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
|
||||
if "-" not in orig.entities:
|
||||
cand_deps_per_dep[token.dep_.lower()].add(
|
||||
(gold_i, gold_head, token.dep_.lower())
|
||||
)
|
||||
if "-" not in [token[-1] for token in gold.orig_annot]:
|
||||
# Find all NER labels in gold and doc
|
||||
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
|
||||
# Set up all labels for per type scoring and prepare gold per type
|
||||
|
@ -342,7 +346,9 @@ class Scorer(object):
|
|||
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
|
||||
self.labelled.score_set(cand_deps, gold_deps)
|
||||
for dep in self.labelled_per_dep:
|
||||
self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
|
||||
self.labelled_per_dep[dep].score_set(
|
||||
cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
|
||||
)
|
||||
self.unlabelled.score_set(
|
||||
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
|
||||
)
|
||||
|
|
|
@@ -69,7 +69,8 @@ cdef class ParserBeam(object):
cdef StateC* st
for state in states:
beam = Beam(self.moves.n_moves, width, min_density=density)
beam.initialize(self.moves.init_beam_state, state.c.length,
beam.initialize(self.moves.init_beam_state,
self.moves.del_beam_state, state.c.length,
state.c._sent)
for i in range(beam.width):
st = <StateC*>beam.at(i)
|
|
@ -42,11 +42,17 @@ cdef WeightsC get_c_weights(model) except *:
|
|||
cdef precompute_hiddens state2vec = model.state2vec
|
||||
output.feat_weights = state2vec.get_feat_weights()
|
||||
output.feat_bias = <const float*>state2vec.bias.data
|
||||
cdef np.ndarray vec2scores_W = model.vec2scores.W
|
||||
cdef np.ndarray vec2scores_b = model.vec2scores.b
|
||||
cdef np.ndarray class_mask = model._class_mask
|
||||
cdef np.ndarray vec2scores_W
|
||||
cdef np.ndarray vec2scores_b
|
||||
if model.vec2scores is None:
|
||||
output.hidden_weights = NULL
|
||||
output.hidden_bias = NULL
|
||||
else:
|
||||
vec2scores_W = model.vec2scores.W
|
||||
vec2scores_b = model.vec2scores.b
|
||||
output.hidden_weights = <const float*>vec2scores_W.data
|
||||
output.hidden_bias = <const float*>vec2scores_b.data
|
||||
cdef np.ndarray class_mask = model._class_mask
|
||||
output.seen_classes = <const float*>class_mask.data
|
||||
return output
|
||||
|
||||
|
@ -54,6 +60,9 @@ cdef WeightsC get_c_weights(model) except *:
|
|||
cdef SizesC get_c_sizes(model, int batch_size) except *:
|
||||
cdef SizesC output
|
||||
output.states = batch_size
|
||||
if model.vec2scores is None:
|
||||
output.classes = model.state2vec.nO
|
||||
else:
|
||||
output.classes = model.vec2scores.nO
|
||||
output.hiddens = model.state2vec.nO
|
||||
output.pieces = model.state2vec.nP
|
||||
|
@ -105,11 +114,12 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
|||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
cdef double one = 1.0
|
||||
resize_activations(A, n)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
sum_state_features(A.unmaxed,
|
||||
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
|
||||
for i in range(n.states):
|
||||
|
@ -120,7 +130,9 @@ cdef void predict_states(ActivationsC* A, StateC** states,
|
|||
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
|
||||
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
|
||||
memset(A.scores, 0, n.states * n.classes * sizeof(float))
|
||||
cdef double one = 1.0
|
||||
if W.hidden_weights == NULL:
|
||||
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
|
||||
n.states, n.classes, n.hiddens, one,
|
||||
|
@ -219,7 +231,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
|||
class ParserModel(Model):
|
||||
def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
|
||||
Model.__init__(self)
|
||||
self._layers = [tok2vec, lower_model, upper_model]
|
||||
self._layers = [tok2vec, lower_model]
|
||||
if upper_model is not None:
|
||||
self._layers.append(upper_model)
|
||||
self.unseen_classes = set()
|
||||
if unseen_classes:
|
||||
for class_ in unseen_classes:
|
||||
|
@ -234,6 +248,8 @@ class ParserModel(Model):
|
|||
return step_model, finish_parser_update
|
||||
|
||||
def resize_output(self, new_output):
|
||||
if len(self._layers) == 2:
|
||||
return
|
||||
if new_output == self.upper.nO:
|
||||
return
|
||||
smaller = self.upper
|
||||
|
@ -275,11 +291,23 @@ class ParserModel(Model):
|
|||
class ParserStepModel(Model):
|
||||
def __init__(self, docs, layers, unseen_classes=None, drop=0.):
|
||||
self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
|
||||
if layers[1].nP >= 2:
|
||||
activation = "maxout"
|
||||
elif len(layers) == 2:
|
||||
activation = None
|
||||
else:
|
||||
activation = "relu"
|
||||
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
|
||||
drop=drop)
|
||||
activation=activation, drop=drop)
|
||||
if len(layers) == 3:
|
||||
self.vec2scores = layers[-1]
|
||||
self.cuda_stream = util.get_cuda_stream()
|
||||
else:
|
||||
self.vec2scores = None
|
||||
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
|
||||
self.backprops = []
|
||||
if self.vec2scores is None:
|
||||
self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
|
||||
else:
|
||||
self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
|
||||
self._class_mask.fill(1)
|
||||
if unseen_classes is not None:
|
||||
|
@ -302,10 +330,15 @@ class ParserStepModel(Model):
|
|||
def begin_update(self, states, drop=0.):
|
||||
token_ids = self.get_token_ids(states)
|
||||
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
|
||||
if self.vec2scores is not None:
|
||||
mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
|
||||
if mask is not None:
|
||||
vector *= mask
|
||||
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
|
||||
else:
|
||||
scores = NumpyOps().asarray(vector)
|
||||
get_d_vector = lambda d_scores, sgd=None: d_scores
|
||||
mask = None
|
||||
# If the class is unseen, make sure its score is minimum
|
||||
scores[:, self._class_mask == 0] = numpy.nanmin(scores)
|
||||
|
||||
|
@ -342,12 +375,12 @@ class ParserStepModel(Model):
|
|||
return ids
|
||||
|
||||
def make_updates(self, sgd):
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if self.cuda_stream is not None:
|
||||
self.cuda_stream.synchronize()
|
||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||
# values don't affect the real gradient.
|
||||
d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if self.cuda_stream is not None:
|
||||
self.cuda_stream.synchronize()
|
||||
for ids, d_vector, bp_vector in self.backprops:
|
||||
d_state_features = bp_vector((d_vector, ids), sgd=sgd)
|
||||
ids = ids.flatten()
|
||||
|
@ -385,9 +418,10 @@ cdef class precompute_hiddens:
|
|||
cdef np.ndarray bias
|
||||
cdef object _cuda_stream
|
||||
cdef object _bp_hiddens
|
||||
cdef object activation
|
||||
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||
drop=0.):
|
||||
activation="maxout", drop=0.):
|
||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||
cdef np.ndarray cached
|
||||
if not isinstance(gpu_cached, numpy.ndarray):
|
||||
|
@ -405,6 +439,8 @@ cdef class precompute_hiddens:
|
|||
self.nP = getattr(lower_model, 'nP', 1)
|
||||
self.nO = cached.shape[2]
|
||||
self.ops = lower_model.ops
|
||||
assert activation in (None, "relu", "maxout")
|
||||
self.activation = activation
|
||||
self._is_synchronized = False
|
||||
self._cuda_stream = cuda_stream
|
||||
self._cached = cached
|
||||
|
@ -417,7 +453,7 @@ cdef class precompute_hiddens:
|
|||
return <float*>self._cached.data
|
||||
|
||||
def __call__(self, X):
|
||||
return self.begin_update(X)[0]
|
||||
return self.begin_update(X, drop=None)[0]
|
||||
|
||||
def begin_update(self, token_ids, drop=0.):
|
||||
cdef np.ndarray state_vector = numpy.zeros(
|
||||
|
@ -450,28 +486,35 @@ cdef class precompute_hiddens:
|
|||
else:
|
||||
ops = CupyOps()
|
||||
|
||||
if self.nP == 1:
|
||||
if self.activation == "maxout":
|
||||
state_vector, mask = ops.maxout(state_vector)
|
||||
else:
|
||||
state_vector = state_vector.reshape(state_vector.shape[:-1])
|
||||
if self.activation == "relu":
|
||||
mask = state_vector >= 0.
|
||||
state_vector *= mask
|
||||
else:
|
||||
state_vector, mask = ops.maxout(state_vector)
|
||||
mask = None
|
||||
|
||||
def backprop_nonlinearity(d_best, sgd=None):
|
||||
if isinstance(d_best, numpy.ndarray):
|
||||
ops = NumpyOps()
|
||||
else:
|
||||
ops = CupyOps()
|
||||
if mask is not None:
|
||||
mask_ = ops.asarray(mask)
|
||||
|
||||
# This will usually be on GPU
|
||||
d_best = ops.asarray(d_best)
|
||||
# Fix nans (which can occur from unseen classes.)
|
||||
d_best[ops.xp.isnan(d_best)] = 0.
|
||||
if self.nP == 1:
|
||||
if self.activation == "maxout":
|
||||
mask_ = ops.asarray(mask)
|
||||
return ops.backprop_maxout(d_best, mask_, self.nP)
|
||||
elif self.activation == "relu":
|
||||
mask_ = ops.asarray(mask)
|
||||
d_best *= mask_
|
||||
d_best = d_best.reshape((d_best.shape + (1,)))
|
||||
return d_best
|
||||
else:
|
||||
return ops.backprop_maxout(d_best, mask_, self.nP)
|
||||
return d_best.reshape((d_best.shape + (1,)))
|
||||
return state_vector, backprop_nonlinearity
|
||||
|
|
|
@ -100,10 +100,30 @@ cdef cppclass StateC:
|
|||
free(this.shifted - PADDING)
|
||||
|
||||
void set_context_tokens(int* ids, int n) nogil:
|
||||
if n == 2:
|
||||
if n == 1:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
else:
|
||||
ids[0] = -1
|
||||
elif n == 2:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.S(0)
|
||||
if n == 8:
|
||||
elif n == 3:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
else:
|
||||
ids[0] = -1
|
||||
# First word of entity, if any
|
||||
if this.entity_is_open():
|
||||
ids[1] = this.E(0)
|
||||
else:
|
||||
ids[1] = -1
|
||||
# Last word of entity, if within entity
|
||||
if ids[0] == -1 or ids[1] == -1:
|
||||
ids[2] = -1
|
||||
else:
|
||||
ids[2] = ids[0] - 1
|
||||
elif n == 8:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.B(1)
|
||||
ids[2] = this.S(0)
|
||||
|
|
|
@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|||
return <void*>st
|
||||
|
||||
|
||||
cdef int _del_state(Pool mem, void* state, void* x) except -1:
|
||||
cdef StateC* st = <StateC*>state
|
||||
del st
|
||||
|
||||
|
||||
cdef class ArcEager(TransitionSystem):
|
||||
def __init__(self, *args, **kwargs):
|
||||
TransitionSystem.__init__(self, *args, **kwargs)
|
||||
self.init_beam_state = _init_state
|
||||
self.del_beam_state = _del_state
|
||||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
|
|
|
@ -22,7 +22,7 @@ from thinc.extra.search cimport Beam
|
|||
from thinc.api import chain, clone
|
||||
from thinc.v2v import Model, Maxout, Affine
|
||||
from thinc.misc import LayerNorm
|
||||
from thinc.neural.ops import CupyOps
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
import srsly
|
||||
|
@ -62,13 +62,17 @@ cdef class Parser:
|
|||
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
|
||||
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
|
||||
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
|
||||
if depth != 1:
|
||||
nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
|
||||
if depth not in (0, 1):
|
||||
raise ValueError(TempErrors.T004.format(value=depth))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||
cfg.get('maxout_pieces', 2))
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
cfg.get('token_vector_width', 96))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
|
||||
if depth == 0:
|
||||
hidden_width = nr_class
|
||||
parser_maxout_pieces = 1
|
||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
|
||||
pretrained_vectors = cfg.get('pretrained_vectors', None)
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||
|
@ -81,16 +85,19 @@ cdef class Parser:
|
|||
tok2vec = chain(tok2vec, flatten)
|
||||
tok2vec.nO = token_vector_width
|
||||
lower = PrecomputableAffine(hidden_width,
|
||||
nF=cls.nr_feature, nI=token_vector_width,
|
||||
nF=nr_feature_tokens, nI=token_vector_width,
|
||||
nP=parser_maxout_pieces)
|
||||
lower.nP = parser_maxout_pieces
|
||||
|
||||
if depth == 1:
|
||||
with Model.use_device('cpu'):
|
||||
upper = Affine(nr_class, hidden_width, drop_factor=0.0)
|
||||
upper.W *= 0
|
||||
else:
|
||||
upper = None
|
||||
|
||||
cfg = {
|
||||
'nr_class': nr_class,
|
||||
'nr_feature_tokens': nr_feature_tokens,
|
||||
'hidden_depth': depth,
|
||||
'token_vector_width': token_vector_width,
|
||||
'hidden_width': hidden_width,
|
||||
|
@ -134,6 +141,7 @@ cdef class Parser:
|
|||
if 'beam_update_prob' not in cfg:
|
||||
cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
|
||||
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||
cfg.setdefault("nr_feature_tokens", self.nr_feature)
|
||||
self.cfg = cfg
|
||||
self.model = model
|
||||
self._multitasks = []
|
||||
|
@ -308,7 +316,7 @@ cdef class Parser:
|
|||
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
||||
dtype='i', order='C')
|
||||
cdef int* c_ids
|
||||
cdef int nr_feature = self.nr_feature
|
||||
cdef int nr_feature = self.cfg["nr_feature_tokens"]
|
||||
cdef int n_states
|
||||
model = self.model(docs)
|
||||
todo = [beam for beam in beams if not beam.is_done]
|
||||
|
@ -512,7 +520,7 @@ cdef class Parser:
|
|||
new_golds.append(gold)
|
||||
model, finish_update = self.model.begin_update(docs, drop=drop)
|
||||
states_d_scores, backprops, beams = _beam_utils.update_beam(
|
||||
self.moves, self.nr_feature, 10000, states, new_golds, model.state2vec,
|
||||
self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
|
||||
model.vec2scores, width, drop=drop, losses=losses,
|
||||
beam_density=beam_density)
|
||||
for i, d_scores in enumerate(states_d_scores):
|
||||
|
|
|
@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
|||
|
||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||
|
||||
ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
|
||||
|
||||
cdef class TransitionSystem:
|
||||
cdef Pool mem
|
||||
cdef StringStore strings
|
||||
|
@ -42,6 +44,7 @@ cdef class TransitionSystem:
|
|||
cdef public attr_t root_label
|
||||
cdef public freqs
|
||||
cdef init_state_t init_beam_state
|
||||
cdef del_state_t del_beam_state
|
||||
cdef public object labels
|
||||
|
||||
cdef int initialize_state(self, StateC* state) nogil
|
||||
|
|
|
@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|||
return <void*>st
|
||||
|
||||
|
||||
cdef int _del_state(Pool mem, void* state, void* x) except -1:
|
||||
cdef StateC* st = <StateC*>state
|
||||
del st
|
||||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
|
||||
self.mem = Pool()
|
||||
|
@ -44,6 +49,7 @@ cdef class TransitionSystem:
|
|||
self.initialize_actions(labels_by_action, min_freq=min_freq)
|
||||
self.root_label = self.strings.add('ROOT')
|
||||
self.init_beam_state = _init_state
|
||||
self.del_beam_state = _del_state
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.strings, self.labels), None, None)
|
||||
|
@ -72,7 +78,8 @@ cdef class TransitionSystem:
|
|||
|
||||
for doc in docs:
|
||||
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
|
||||
beam.initialize(self.init_beam_state, doc.length, doc.c)
|
||||
beam.initialize(self.init_beam_state, self.del_beam_state,
|
||||
doc.length, doc.c)
|
||||
for i in range(beam.width):
|
||||
state = <StateC*>beam.at(i)
|
||||
state.offset = offset
|
||||
|
|
|
@ -125,7 +125,7 @@ def it_tokenizer():
|
|||
|
||||
@pytest.fixture(scope="session")
|
||||
def ja_tokenizer():
|
||||
pytest.importorskip("MeCab")
|
||||
pytest.importorskip("fugashi")
|
||||
return get_lang_class("ja").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
|
@ -218,3 +218,15 @@ def uk_tokenizer():
|
|||
@pytest.fixture(scope="session")
|
||||
def ur_tokenizer():
|
||||
return get_lang_class("ur").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def yo_tokenizer():
|
||||
return get_lang_class("yo").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def zh_tokenizer():
|
||||
pytest.importorskip("jieba")
|
||||
return get_lang_class("zh").Defaults.create_tokenizer()
|
||||
|
||||
|
|
|
@ -183,3 +183,18 @@ def test_doc_retokenizer_split_lex_attrs(en_vocab):
|
|||
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
|
||||
assert doc[0].is_stop
|
||||
assert not doc[1].is_stop
|
||||
|
||||
|
||||
def test_doc_retokenizer_realloc(en_vocab):
|
||||
"""#4604: realloc correctly when new tokens outnumber original tokens"""
|
||||
text = "Hyperglycemic adverse events following antipsychotic drug administration in the"
|
||||
doc = Doc(en_vocab, words=text.split()[:-1])
|
||||
with doc.retokenize() as retokenizer:
|
||||
token = doc[0]
|
||||
heads = [(token, 0)] * len(token)
|
||||
retokenizer.split(doc[token.i], list(token.text), heads=heads)
|
||||
doc = Doc(en_vocab, words=text.split())
|
||||
with doc.retokenize() as retokenizer:
|
||||
token = doc[0]
|
||||
heads = [(token, 0)] * len(token)
|
||||
retokenizer.split(doc[token.i], list(token.text), heads=heads)
|
||||
|
|
|
@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
|
|||
return doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"i_sent,i,j,text",
|
||||
[
|
||||
(0, 0, len("This is a"), "This is a"),
|
||||
(1, 0, len("This is another"), "This is another"),
|
||||
(2, len("And "), len("And ") + len("a third"), "a third"),
|
||||
(0, 1, 2, None),
|
||||
],
|
||||
)
|
||||
def test_char_span(doc, i_sent, i, j, text):
|
||||
sents = list(doc.sents)
|
||||
span = sents[i_sent].char_span(i, j)
|
||||
if not text:
|
||||
assert not span
|
||||
else:
|
||||
assert span.text == text
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
import re
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.util import compile_prefix_regex, compile_suffix_regex
|
||||
|
@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
|
|||
r"[\[\]!&:,()\*—–\/-]",
|
||||
]
|
||||
infix_re = compile_infix_regex(custom_infixes)
|
||||
token_match_re = re.compile("a-b")
|
||||
return Tokenizer(
|
||||
en_vocab,
|
||||
English.Defaults.tokenizer_exceptions,
|
||||
prefix_re.search,
|
||||
suffix_re.search,
|
||||
infix_re.finditer,
|
||||
token_match=None,
|
||||
token_match=token_match_re.match,
|
||||
)
|
||||
|
||||
|
||||
|
@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
|
|||
"Megaregion",
|
||||
".",
|
||||
]
|
||||
|
||||
|
||||
def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
|
||||
sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
|
||||
context = [word.text for word in custom_en_tokenizer(sentence)]
|
||||
assert context == [
|
||||
"The",
|
||||
"8",
|
||||
"and",
|
||||
"10",
|
||||
"-",
|
||||
"county",
|
||||
"definitions",
|
||||
"a-b",
|
||||
"not",
|
||||
"used",
|
||||
"for",
|
||||
"the",
|
||||
"greater",
|
||||
"Southern",
|
||||
"California",
|
||||
"Megaregion",
|
||||
".",
|
||||
]
|
||||
|
||||
|
||||
def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
|
||||
sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
|
||||
context = [word.text for word in custom_en_tokenizer(sentence)]
|
||||
assert context == [
|
||||
"The",
|
||||
"8",
|
||||
"and",
|
||||
"10",
|
||||
"-",
|
||||
"county",
|
||||
"definitions",
|
||||
"are",
|
||||
"not",
|
||||
"used",
|
||||
"for",
|
||||
"the",
|
||||
"greater",
|
||||
"Southern",
|
||||
"California",
|
||||
"Megaregion",
|
||||
".",
|
||||
":)",
|
||||
]
|
||||
|
||||
|
||||
def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
|
||||
sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
|
||||
rules = custom_en_tokenizer.rules
|
||||
del rules[":)"]
|
||||
custom_en_tokenizer.rules = rules
|
||||
context = [word.text for word in custom_en_tokenizer(sentence)]
|
||||
assert context == [
|
||||
"The",
|
||||
"8",
|
||||
"and",
|
||||
"10",
|
||||
"-",
|
||||
"county",
|
||||
"definitions",
|
||||
"are",
|
||||
"not",
|
||||
"used",
|
||||
"for",
|
||||
"the",
|
||||
"greater",
|
||||
"Southern",
|
||||
"California",
|
||||
"Megaregion",
|
||||
".",
|
||||
":",
|
||||
")",
|
||||
]
|
||||
|
|
spacy/tests/lang/fi/test_text.py (new file, 27 lines)
@@ -0,0 +1,27 @@
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("10000", True),
|
||||
("10,00", True),
|
||||
("-999,0", True),
|
||||
("yksi", True),
|
||||
("kolmetoista", True),
|
||||
("viisikymmentä", True),
|
||||
("tuhat", True),
|
||||
("1/2", True),
|
||||
("hevonen", False),
|
||||
(",", False),
|
||||
],
|
||||
)
|
||||
def test_fi_lex_attrs_like_number(fi_tokenizer, text, match):
|
||||
tokens = fi_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
|
@ -12,9 +12,23 @@ ABBREVIATION_TESTS = [
|
|||
("Paino on n. 2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]),
|
||||
]
|
||||
|
||||
HYPHENATED_TESTS = [
|
||||
(
|
||||
"1700-luvulle sijoittuva taide-elokuva",
|
||||
["1700-luvulle", "sijoittuva", "taide-elokuva"]
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
|
||||
def test_fi_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
|
||||
def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
|
||||
tokens = fi_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", HYPHENATED_TESTS)
|
||||
def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
|
||||
tokens = fi_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
||||
|
|
|
@ -3,8 +3,24 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["z.B.", "Jan."])
|
||||
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
|
||||
tokens = lb_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
|
||||
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
|
||||
tokens = lb_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
|
||||
text = "Mee 't ass net evident, d'Liewen."
|
||||
tokens = lb_tokenizer(text)
|
||||
assert len(tokens) == 9
|
||||
assert tokens[1].text == "'t"
|
||||
assert tokens[1].lemma_ == "et"
|
||||
|
||||
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
|
||||
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
|
||||
tokens = lb_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -5,18 +5,10 @@ import pytest
|
|||
|
||||
|
||||
def test_lb_tokenizer_handles_long_text(lb_tokenizer):
|
||||
text = """Den Nordwand an d'Sonn
|
||||
|
||||
An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum. Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
|
||||
|
||||
Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet. Um Enn huet den Nordwand säi Kampf opginn.
|
||||
|
||||
Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.
|
||||
|
||||
Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier."""
|
||||
text = """Den Nordwand an d'Sonn An der Zäit hunn sech den Nordwand an d'Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum. Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen. Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet. Um Enn huet den Nordwand säi Kampf opginn. Dunn huet d'Sonn d'Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen. Do huet den Nordwand missen zouginn, dass d'Sonn vun hinnen zwee de Stäerkste wier."""
|
||||
|
||||
tokens = lb_tokenizer(text)
|
||||
assert len(tokens) == 143
|
||||
assert len(tokens) == 142
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -24,6 +16,7 @@ Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste
|
|||
[
|
||||
("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
|
||||
("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
|
||||
("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
|
||||
],
|
||||
)
|
||||
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):
|
||||
|
|
|
@ -11,7 +11,7 @@ from spacy.util import get_lang_class
|
|||
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
|
||||
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
|
||||
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
|
||||
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
|
||||
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
|
spacy/tests/lang/yo/__init__.py (new file, empty)
spacy/tests/lang/yo/test_text.py (new file, 32 lines)
@@ -0,0 +1,32 @@
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.yo.lex_attrs import like_num
|
||||
|
||||
|
||||
def test_yo_tokenizer_handles_long_text(yo_tokenizer):
|
||||
text = """Àwọn ọmọ ìlú tí wọ́n ń ṣàmúlò ayélujára ti bẹ̀rẹ̀ ìkọkúkọ sórí àwòrán ààrẹ Nkurunziza nínú ìfẹ̀hónúhàn pẹ̀lú àmì ìdámọ̀: Nkurunziza àti Burundi:
|
||||
Ọmọ ilé ẹ̀kọ́ gíga ní ẹ̀wọ̀n fún kíkọ ìkọkúkọ sí orí àwòrán Ààrẹ .
|
||||
Bí mo bá ṣe èyí ní Burundi , ó ṣe é ṣe kí a fi mí sí àtìmọ́lé
|
||||
Ìjọba Burundi fi akẹ́kọ̀ọ́bìnrin sí àtìmọ́lé látàrí ẹ̀sùn ìkọkúkọ sí orí àwòrán ààrẹ. A túwíìtì àwòrán ìkọkúkọ wa ní ìbánikẹ́dùn ìṣẹ̀lẹ̀ náà.
|
||||
Wọ́n ní kí a dán an wò, kí a kọ nǹkan sí orí àwòrán ààrẹ mo sì ṣe bẹ́ẹ̀. Mo ní ìgbóyà wípé ẹnikẹ́ni kò ní mú mi níbí.
|
||||
Ìfòfinlíle mú àtakò"""
|
||||
tokens = yo_tokenizer(text)
|
||||
assert len(tokens) == 121
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[("ení", True), ("ogun", True), ("mewadinlogun", True), ("ten", False)],
|
||||
)
|
||||
def test_lex_attrs_like_number(yo_tokenizer, text, match):
|
||||
tokens = yo_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize("word", ["eji", "ejila", "ogun", "aárùn"])
|
||||
def test_yo_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
spacy/tests/lang/zh/__init__.py (new file, empty)
spacy/tests/lang/zh/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("999.0", True),
|
||||
("一", True),
|
||||
("二", True),
|
||||
("〇", True),
|
||||
("十一", True),
|
||||
("狗", False),
|
||||
(",", False),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_number(zh_tokenizer, text, match):
|
||||
tokens = zh_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
spacy/tests/lang/zh/test_tokenizer.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
TOKENIZER_TESTS = [
|
||||
("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
|
||||
['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
|
||||
'的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
|
||||
'为', '母语', '。']),
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
|
||||
def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
|
||||
zh_tokenizer.use_jieba = False
|
||||
tokens = [token.text for token in zh_tokenizer(text)]
|
||||
assert tokens == list(text)
|
||||
|
||||
zh_tokenizer.use_jieba = True
|
||||
tokens = [token.text for token in zh_tokenizer(text)]
|
||||
assert tokens == expected_tokens
|
||||
|
||||
|
||||
def test_extra_spaces(zh_tokenizer):
|
||||
# note: three spaces after "I"
|
||||
tokens = zh_tokenizer("I like cheese.")
|
||||
assert tokens[1].orth_ == " "
|
|
@ -259,6 +259,27 @@ def test_block_ner():
|
|||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
def test_change_number_features():
|
||||
# Test the default number features
|
||||
nlp = English()
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
ner.add_label("PERSON")
|
||||
nlp.begin_training()
|
||||
assert ner.model.lower.nF == ner.nr_feature
|
||||
# Test we can change it
|
||||
nlp = English()
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
ner.add_label("PERSON")
|
||||
nlp.begin_training(
|
||||
component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
|
||||
)
|
||||
assert ner.model.lower.nF == 3
|
||||
# Test the model runs
|
||||
nlp("hello world")
|
||||
|
||||
|
||||
class BlockerComponent1(object):
|
||||
name = "my_blocker"
|
||||
|
||||
|
|
|
@ -148,3 +148,20 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
|||
assert tokens[4].left_edge.i == 0
|
||||
assert tokens[4].right_edge.i == 4
|
||||
assert tokens[4].head.i == 4
|
||||
|
||||
|
||||
def test_parser_set_sent_starts(en_vocab):
|
||||
words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
|
||||
heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
|
||||
deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
|
||||
doc = get_doc(
|
||||
en_vocab, words=words, deps=deps, heads=heads
|
||||
)
|
||||
for i in range(len(words)):
|
||||
if i == 0 or i == 3:
|
||||
assert doc[i].is_sent_start == True
|
||||
else:
|
||||
assert doc[i].is_sent_start == None
|
||||
for sent in doc.sents:
|
||||
for token in sent:
|
||||
assert token.head in sent
|
||||
|
|
|
@ -5,6 +5,7 @@ import pytest
|
|||
import spacy
|
||||
from spacy.pipeline import Sentencizer
|
||||
from spacy.tokens import Doc
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
def test_sentencizer(en_vocab):
|
||||
|
@ -17,6 +18,17 @@ def test_sentencizer(en_vocab):
|
|||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
def test_sentencizer_pipe():
|
||||
texts = ["Hello! This is a test.", "Hi! This is a test."]
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
for doc in nlp.pipe(texts):
|
||||
assert doc.is_sentenced
|
||||
sent_starts = [t.is_sent_start for t in doc]
|
||||
assert sent_starts == [True, False, True, False, False, False, False]
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"words,sent_starts,n_sents",
|
||||
[
|
||||
|
|
spacy/tests/pipeline/test_tagger.py (new file, 14 lines)
@@ -0,0 +1,14 @@
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.language import Language
|
||||
from spacy.pipeline import Tagger
|
||||
|
||||
|
||||
def test_label_types():
|
||||
nlp = Language()
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
nlp.get_pipe("tagger").add_label("A")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.get_pipe("tagger").add_label(9)
|
|
@ -62,3 +62,11 @@ def test_textcat_learns_multilabel():
|
|||
assert score < 0.5
|
||||
else:
|
||||
assert score > 0.5
|
||||
|
||||
|
||||
def test_label_types():
|
||||
nlp = Language()
|
||||
nlp.add_pipe(nlp.create_pipe("textcat"))
|
||||
nlp.get_pipe("textcat").add_label("answer")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.get_pipe("textcat").add_label(9)
|
||||
|
|
|
@ -177,7 +177,6 @@ def test_issue3328(en_vocab):
|
|||
assert matched_texts == ["Hello", "how", "you", "doing"]
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue3331(en_vocab):
|
||||
"""Test that duplicate patterns for different rules result in multiple
|
||||
matches, one per rule.
|
||||
|
@ -328,6 +327,7 @@ def test_issue3449():
|
|||
assert t3[5].text == "I"
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3456():
|
||||
# this crashed because of a padding error in layer.ops.unflatten in thinc
|
||||
nlp = English()
|
||||
|
|
|
@ -2,8 +2,10 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3880():
|
||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||
|
||||
|
|
|
@ -3,8 +3,10 @@ from __future__ import unicode_literals
|
|||
|
||||
from spacy.lang.en import English
|
||||
from spacy.util import minibatch, compounding
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4348():
|
||||
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||
|
||||
|
|
|
@ -3,9 +3,9 @@ from __future__ import unicode_literals
|
|||
|
||||
import srsly
|
||||
from spacy.gold import GoldCorpus
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4402():
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from mock import Mock
|
||||
from spacy.matcher import DependencyMatcher
|
||||
from ..util import get_doc
|
||||
|
@ -11,8 +10,14 @@ def test_issue4590(en_vocab):
|
|||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||
pattern = [
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
|
||||
{"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
]
|
||||
|
||||
on_match = Mock()
|
||||
|
@ -31,4 +36,3 @@ def test_issue4590(en_vocab):
|
|||
on_match_args = on_match.call_args
|
||||
|
||||
assert on_match_args[0][3] == matches
|
||||
|
||||
|
|
spacy/tests/regression/test_issue4651.py (new file, 65 lines)
@@ -0,0 +1,65 @@
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4651_with_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
|
||||
assert res == res_reloaded
|
||||
|
||||
|
||||
def test_issue4651_without_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
not specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
|
||||
assert res == res_reloaded
|
spacy/tests/regression/test_issue4674.py (new file, 34 lines)
@@ -0,0 +1,34 @@
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.util import ensure_path
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4674():
|
||||
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
||||
nlp = English()
|
||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||
|
||||
vector1 = [0.9, 1.1, 1.01]
|
||||
vector2 = [1.8, 2.25, 2.01]
|
||||
kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
||||
|
||||
assert kb.get_size_entities() == 1
|
||||
|
||||
# dumping to file & loading back in
|
||||
with make_tempdir() as d:
|
||||
dir_path = ensure_path(d)
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir()
|
||||
file_path = dir_path / "kb"
|
||||
kb.dump(str(file_path))
|
||||
|
||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
||||
kb2.load_bulk(str(file_path))
|
||||
|
||||
assert kb2.get_size_entities() == 1
|
||||
|
spacy/tests/regression/test_issue4707.py (new file, 23 lines)
@@ -0,0 +1,23 @@
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.util import load_model_from_path
|
||||
from spacy.lang.en import English
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4707():
|
||||
"""Tests that disabled component names are also excluded from nlp.from_disk
|
||||
by default when loading a model.
|
||||
"""
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
||||
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
||||
exclude = ["tokenizer", "sentencizer"]
|
||||
with make_tempdir() as tmpdir:
|
||||
nlp.to_disk(tmpdir, exclude=exclude)
|
||||
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
||||
assert "sentencizer" not in new_nlp.pipe_names
|
||||
assert "entity_ruler" in new_nlp.pipe_names
|
|
@ -24,6 +24,7 @@ def test_serialize_empty_doc(en_vocab):
|
|||
|
||||
def test_serialize_doc_roundtrip_bytes(en_vocab):
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
doc.cats = {"A": 0.5}
|
||||
doc_b = doc.to_bytes()
|
||||
new_doc = Doc(en_vocab).from_bytes(doc_b)
|
||||
assert new_doc.to_bytes() == doc_b
|
||||
|
@ -66,12 +67,17 @@ def test_serialize_doc_exclude(en_vocab):
|
|||
def test_serialize_doc_bin():
|
||||
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
|
||||
texts = ["Some text", "Lots of texts...", "..."]
|
||||
cats = {"A": 0.5}
|
||||
nlp = English()
|
||||
for doc in nlp.pipe(texts):
|
||||
doc.cats = cats
|
||||
doc_bin.add(doc)
|
||||
bytes_data = doc_bin.to_bytes()
|
||||
|
||||
# Deserialize later, e.g. in a new process
|
||||
nlp = spacy.blank("en")
|
||||
doc_bin = DocBin().from_bytes(bytes_data)
|
||||
list(doc_bin.get_docs(nlp.vocab))
|
||||
reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
|
||||
for i, doc in enumerate(reloaded_docs):
|
||||
assert doc.text == texts[i]
|
||||
assert doc.cats == cats
|
||||
|
|
|
@ -65,6 +65,20 @@ def test_language_evaluate(nlp):
|
|||
nlp.evaluate([text, gold])
|
||||
|
||||
|
||||
def test_evaluate_no_pipe(nlp):
|
||||
"""Test that docs are processed correctly within Language.pipe if the
|
||||
component doesn't expose a .pipe method."""
|
||||
|
||||
def pipe(doc):
|
||||
return doc
|
||||
|
||||
text = "hello world"
|
||||
annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
|
||||
nlp = Language(Vocab())
|
||||
nlp.add_pipe(pipe)
|
||||
nlp.evaluate([(text, annots)])
|
||||
|
||||
|
||||
def vector_modification_pipe(doc):
|
||||
doc.vector += 1
|
||||
return doc
|
||||
|
|
|
@ -12,8 +12,22 @@ from .util import get_doc
|
|||
test_las_apple = [
|
||||
[
|
||||
"Apple is looking at buying U.K. startup for $ 1 billion",
|
||||
{"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
|
||||
"deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']},
|
||||
{
|
||||
"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
|
||||
"deps": [
|
||||
"nsubj",
|
||||
"aux",
|
||||
"ROOT",
|
||||
"prep",
|
||||
"pcomp",
|
||||
"compound",
|
||||
"dobj",
|
||||
"prep",
|
||||
"quantmod",
|
||||
"compound",
|
||||
"pobj",
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
|
@ -59,7 +73,7 @@ def test_las_per_type(en_vocab):
|
|||
en_vocab,
|
||||
words=input_.split(" "),
|
||||
heads=([h - i for i, h in enumerate(annot["heads"])]),
|
||||
deps=annot["deps"]
|
||||
deps=annot["deps"],
|
||||
)
|
||||
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
|
||||
doc[0].dep_ = "compound"
|
||||
|
|
spacy/tests/tokenizer/test_explain.py (new file, 65 lines)
@@ -0,0 +1,65 @@
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.util import get_lang_class
|
||||
|
||||
# Only include languages with no external dependencies
|
||||
# "is" seems to confuse importlib, so we're also excluding it for now
|
||||
# excluded: ja, ru, th, uk, vi, zh, is
|
||||
LANGUAGES = [
|
||||
pytest.param("fr", marks=pytest.mark.slow()),
|
||||
pytest.param("af", marks=pytest.mark.slow()),
|
||||
pytest.param("ar", marks=pytest.mark.slow()),
|
||||
pytest.param("bg", marks=pytest.mark.slow()),
|
||||
"bn",
|
||||
pytest.param("ca", marks=pytest.mark.slow()),
|
||||
pytest.param("cs", marks=pytest.mark.slow()),
|
||||
pytest.param("da", marks=pytest.mark.slow()),
|
||||
pytest.param("de", marks=pytest.mark.slow()),
|
||||
"el",
|
||||
"en",
|
||||
pytest.param("es", marks=pytest.mark.slow()),
|
||||
pytest.param("et", marks=pytest.mark.slow()),
|
||||
pytest.param("fa", marks=pytest.mark.slow()),
|
||||
pytest.param("fi", marks=pytest.mark.slow()),
|
||||
"fr",
|
||||
pytest.param("ga", marks=pytest.mark.slow()),
|
||||
pytest.param("he", marks=pytest.mark.slow()),
|
||||
pytest.param("hi", marks=pytest.mark.slow()),
|
||||
pytest.param("hr", marks=pytest.mark.slow()),
|
||||
"hu",
|
||||
pytest.param("id", marks=pytest.mark.slow()),
|
||||
pytest.param("it", marks=pytest.mark.slow()),
|
||||
pytest.param("kn", marks=pytest.mark.slow()),
|
||||
pytest.param("lb", marks=pytest.mark.slow()),
|
||||
pytest.param("lt", marks=pytest.mark.slow()),
|
||||
pytest.param("lv", marks=pytest.mark.slow()),
|
||||
pytest.param("nb", marks=pytest.mark.slow()),
|
||||
pytest.param("nl", marks=pytest.mark.slow()),
|
||||
"pl",
|
||||
pytest.param("pt", marks=pytest.mark.slow()),
|
||||
pytest.param("ro", marks=pytest.mark.slow()),
|
||||
pytest.param("si", marks=pytest.mark.slow()),
|
||||
pytest.param("sk", marks=pytest.mark.slow()),
|
||||
pytest.param("sl", marks=pytest.mark.slow()),
|
||||
pytest.param("sq", marks=pytest.mark.slow()),
|
||||
pytest.param("sr", marks=pytest.mark.slow()),
|
||||
pytest.param("sv", marks=pytest.mark.slow()),
|
||||
pytest.param("ta", marks=pytest.mark.slow()),
|
||||
pytest.param("te", marks=pytest.mark.slow()),
|
||||
pytest.param("tl", marks=pytest.mark.slow()),
|
||||
pytest.param("tr", marks=pytest.mark.slow()),
|
||||
pytest.param("tt", marks=pytest.mark.slow()),
|
||||
pytest.param("ur", marks=pytest.mark.slow()),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lang", LANGUAGES)
|
||||
def test_tokenizer_explain(lang):
|
||||
tokenizer = get_lang_class(lang).Defaults.create_tokenizer()
|
||||
examples = pytest.importorskip("spacy.lang.{}.examples".format(lang))
|
||||
for sentence in examples.sentences:
|
||||
tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
|
||||
debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
|
||||
assert tokens == debug_tokens
|
|
@ -57,10 +57,8 @@ URLS_SHOULD_MATCH = [
|
|||
pytest.param(
|
||||
"chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
|
||||
),
|
||||
pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
|
||||
pytest.param(
|
||||
"http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
|
||||
),
|
||||
"http://foo.com/blah_blah_(wikipedia)",
|
||||
"http://foo.com/blah_blah_(wikipedia)_(again)",
|
||||
pytest.param("http://⌘.ws", marks=pytest.mark.xfail()),
|
||||
pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()),
|
||||
pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()),
|
||||
|
@ -107,8 +105,8 @@ URLS_SHOULD_NOT_MATCH = [
|
|||
"NASDAQ:GOOG",
|
||||
"http://-a.b.co",
|
||||
pytest.param("foo.com", marks=pytest.mark.xfail()),
|
||||
pytest.param("http://1.1.1.1.1", marks=pytest.mark.xfail()),
|
||||
pytest.param("http://www.foo.bar./", marks=pytest.mark.xfail()),
|
||||
"http://1.1.1.1.1",
|
||||
"http://www.foo.bar./",
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,8 @@ import re
|
|||
from .tokens.doc cimport Doc
|
||||
from .strings cimport hash_string
|
||||
from .compat import unescape_unicode
|
||||
from .attrs import intify_attrs
|
||||
from .symbols import ORTH
|
||||
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
from . import util
|
||||
|
@ -107,6 +109,18 @@ cdef class Tokenizer:
|
|||
if self._property_init_count <= self._property_init_max:
|
||||
self._property_init_count += 1
|
||||
|
||||
property rules:
|
||||
def __get__(self):
|
||||
return self._rules
|
||||
|
||||
def __set__(self, rules):
|
||||
self._rules = {}
|
||||
self._reset_cache([key for key in self._cache])
|
||||
self._reset_specials()
|
||||
self._cache = PreshMap()
|
||||
self._specials = PreshMap()
|
||||
self._load_special_tokenization(rules)
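For context, a short sketch of how the writable rules property above can be used to replace the special-case rules at runtime (assumes a blank English pipeline whose base exceptions include the ":)" emoticon, as exercised by the tokenizer test earlier in this diff):

# Sketch: reassigning tokenizer.rules resets the caches and reloads the
# special cases, so removed rules stop applying immediately.
import spacy

nlp = spacy.blank("en")
rules = dict(nlp.tokenizer.rules)   # copy the current special cases
rules.pop(":)", None)               # drop one special case, if present
nlp.tokenizer.rules = rules         # setter rebuilds _cache and _specials
print([t.text for t in nlp.tokenizer("Nice :)")])
# ':)' is now split by the suffix rules into ':' and ')'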
|
||||
|
||||
def __reduce__(self):
|
||||
args = (self.vocab,
|
||||
self._rules,
|
||||
|
@ -572,7 +586,7 @@ cdef class Tokenizer:
|
|||
attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
|
||||
orth = "".join([spec[ORTH] for spec in attrs])
|
||||
if chunk != orth:
|
||||
raise ValueError(Errors.E187.format(chunk=chunk, orth=orth, token_attrs=substrings))
|
||||
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
|
||||
|
||||
def add_special_case(self, unicode string, substrings):
|
||||
"""Add a special-case tokenization rule.
|
||||
|
@ -612,6 +626,73 @@ cdef class Tokenizer:
|
|||
self._flush_specials()
|
||||
self._load_special_cases(self._rules)
|
||||
|
||||
def explain(self, text):
|
||||
"""A debugging tokenizer that provides information about which
|
||||
tokenizer rule or pattern was matched for each token. The tokens
|
||||
produced are identical to `nlp.tokenizer()` except for whitespace
|
||||
tokens.
|
||||
|
||||
text (unicode): The string to tokenize.
|
||||
RETURNS (list): A list of (pattern_string, token_string) tuples
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#explain
|
||||
"""
|
||||
prefix_search = self.prefix_search
|
||||
suffix_search = self.suffix_search
|
||||
infix_finditer = self.infix_finditer
|
||||
token_match = self.token_match
|
||||
special_cases = {}
|
||||
for orth, special_tokens in self.rules.items():
|
||||
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
|
||||
tokens = []
|
||||
for substring in text.split():
|
||||
suffixes = []
|
||||
while substring:
|
||||
while prefix_search(substring) or suffix_search(substring):
|
||||
if substring in special_cases:
|
||||
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
||||
substring = ''
|
||||
break
|
||||
if prefix_search(substring):
|
||||
split = prefix_search(substring).end()
|
||||
# break if pattern matches the empty string
|
||||
if split == 0:
|
||||
break
|
||||
tokens.append(("PREFIX", substring[:split]))
|
||||
substring = substring[split:]
|
||||
if substring in special_cases:
|
||||
continue
|
||||
if suffix_search(substring):
|
||||
split = suffix_search(substring).start()
|
||||
# break if pattern matches the empty string
|
||||
if split == len(substring):
|
||||
break
|
||||
suffixes.append(("SUFFIX", substring[split:]))
|
||||
substring = substring[:split]
|
||||
if substring in special_cases:
|
||||
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
||||
substring = ''
|
||||
elif token_match(substring):
|
||||
tokens.append(("TOKEN_MATCH", substring))
|
||||
substring = ''
|
||||
elif list(infix_finditer(substring)):
|
||||
infixes = infix_finditer(substring)
|
||||
offset = 0
|
||||
for match in infixes:
|
||||
if substring[offset : match.start()]:
|
||||
tokens.append(("TOKEN", substring[offset : match.start()]))
|
||||
if substring[match.start() : match.end()]:
|
||||
tokens.append(("INFIX", substring[match.start() : match.end()]))
|
||||
offset = match.end()
|
||||
if substring[offset:]:
|
||||
tokens.append(("TOKEN", substring[offset:]))
|
||||
substring = ''
|
||||
elif substring:
|
||||
tokens.append(("TOKEN", substring))
|
||||
substring = ''
|
||||
tokens.extend(reversed(suffixes))
|
||||
return tokens
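For context, a small usage sketch of the explain() helper defined above (assumes a blank English pipeline; the exact pattern labels depend on the language's prefix, suffix and special-case rules):

# Sketch: explain() returns (pattern, substring) pairs showing which rule
# produced each token, mirroring what the tokenizer itself would do.
import spacy

nlp = spacy.blank("en")
for pattern, substring in nlp.tokenizer.explain("(Don't run!)"):
    print(pattern, repr(substring))
# roughly: PREFIX '(', SPECIAL-1 'Do', SPECIAL-2 "n't", TOKEN 'run',
#          SUFFIX '!', SUFFIX ')'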
|
||||
|
||||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
|
|
|
@ -329,7 +329,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
doc.c[i].head += offset
|
||||
# Double doc.c max_length if necessary (until big enough for all new tokens)
|
||||
while doc.length + nb_subtokens - 1 >= doc.max_length:
|
||||
doc._realloc(doc.length * 2)
|
||||
doc._realloc(doc.max_length * 2)
|
||||
# Move tokens after the split to create space for the new tokens
|
||||
doc.length = len(doc) + nb_subtokens -1
|
||||
to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
|
||||
|
|
|
@ -58,6 +58,7 @@ class DocBin(object):
|
|||
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
|
||||
self.tokens = []
|
||||
self.spaces = []
|
||||
self.cats = []
|
||||
self.user_data = []
|
||||
self.strings = set()
|
||||
self.store_user_data = store_user_data
|
||||
|
@ -82,6 +83,7 @@ class DocBin(object):
|
|||
spaces = spaces.reshape((spaces.shape[0], 1))
|
||||
self.spaces.append(numpy.asarray(spaces, dtype=bool))
|
||||
self.strings.update(w.text for w in doc)
|
||||
self.cats.append(doc.cats)
|
||||
if self.store_user_data:
|
||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||
|
||||
|
@ -102,6 +104,7 @@ class DocBin(object):
|
|||
words = [vocab.strings[orth] for orth in tokens[:, orth_col]]
|
||||
doc = Doc(vocab, words=words, spaces=spaces)
|
||||
doc = doc.from_array(self.attrs, tokens)
|
||||
doc.cats = self.cats[i]
|
||||
if self.store_user_data:
|
||||
user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
|
||||
doc.user_data.update(user_data)
|
||||
|
@ -121,6 +124,7 @@ class DocBin(object):
|
|||
self.tokens.extend(other.tokens)
|
||||
self.spaces.extend(other.spaces)
|
||||
self.strings.update(other.strings)
|
||||
self.cats.extend(other.cats)
|
||||
if self.store_user_data:
|
||||
self.user_data.extend(other.user_data)
|
||||
|
||||
|
@ -140,6 +144,7 @@ class DocBin(object):
|
|||
"spaces": numpy.vstack(self.spaces).tobytes("C"),
|
||||
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
|
||||
"strings": list(self.strings),
|
||||
"cats": self.cats,
|
||||
}
|
||||
if self.store_user_data:
|
||||
msg["user_data"] = self.user_data
|
||||
|
@ -164,6 +169,7 @@ class DocBin(object):
|
|||
flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
|
||||
self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
|
||||
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
|
||||
self.cats = msg["cats"]
|
||||
if self.store_user_data and "user_data" in msg:
|
||||
self.user_data = list(msg["user_data"])
|
||||
for tokens in self.tokens:
|
||||
|
|
|
@ -21,6 +21,9 @@ ctypedef fused LexemeOrToken:
|
|||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
|
||||
|
||||
|
||||
cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
|
||||
|
||||
|
||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
|
||||
|
||||
|
||||
|
|
|
@ -887,6 +887,7 @@ cdef class Doc:
|
|||
"array_body": lambda: self.to_array(array_head),
|
||||
"sentiment": lambda: self.sentiment,
|
||||
"tensor": lambda: self.tensor,
|
||||
"cats": lambda: self.cats,
|
||||
}
|
||||
for key in kwargs:
|
||||
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
|
||||
|
@ -916,6 +917,7 @@ cdef class Doc:
|
|||
"array_body": lambda b: None,
|
||||
"sentiment": lambda b: None,
|
||||
"tensor": lambda b: None,
|
||||
"cats": lambda b: None,
|
||||
"user_data_keys": lambda b: None,
|
||||
"user_data_values": lambda b: None,
|
||||
}
|
||||
|
@ -937,6 +939,8 @@ cdef class Doc:
|
|||
self.sentiment = msg["sentiment"]
|
||||
if "tensor" not in exclude and "tensor" in msg:
|
||||
self.tensor = msg["tensor"]
|
||||
if "cats" not in exclude and "cats" in msg:
|
||||
self.cats = msg["cats"]
|
||||
start = 0
|
||||
cdef const LexemeC* lex
|
||||
cdef unicode orth_
|
||||
|
@@ -1153,10 +1157,32 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         tokens[i].r_kids = 0
         tokens[i].l_edge = i
         tokens[i].r_edge = i
-    # Three times, for non-projectivity. See issue #3170. This isn't a very
-    # satisfying fix, but I think it's sufficient.
-    for loop_count in range(3):
+    cdef int loop_count = 0
+    cdef bint heads_within_sents = False
+    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
+    # handle non-projective dependency parses, stopping when all heads are
+    # within their respective sentence boundaries. We have documented cases
+    # that need at least 4 iterations, so this is to be on the safe side
+    # without risking getting stuck in an infinite loop if something is
+    # terribly malformed.
+    while not heads_within_sents:
+        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
+        if loop_count > 10:
+            user_warning(Warnings.W026)
+        loop_count += 1
+    # Set sentence starts
+    for i in range(length):
+        if tokens[i].head == 0 and tokens[i].dep != 0:
+            tokens[tokens[i].l_edge].sent_start = True
+
+
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
+    # May be called multiple times due to non-projectivity. See issues #3170
+    # and #4688.
     # Set left edges
+    cdef TokenC* head
+    cdef TokenC* child
+    cdef int i, j
     for i in range(length):
         child = &tokens[i]
         head = &tokens[i + child.head]
@@ -1176,10 +1202,22 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
                 head.r_edge = child.r_edge
             if child.l_edge < head.l_edge:
                 head.l_edge = child.l_edge
-    # Set sentence starts
+    # Get sentence start positions according to current state
+    sent_starts = set()
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
-            tokens[tokens[i].l_edge].sent_start = True
+            sent_starts.add(tokens[i].l_edge)
+    cdef int curr_sent_start = 0
+    cdef int curr_sent_end = 0
+    # Check whether any heads are not within the current sentence
+    for i in range(length):
+        if (i > 0 and i in sent_starts) or i == length - 1:
+            curr_sent_end = i
+            for j in range(curr_sent_start, curr_sent_end):
+                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
+                    return False
+            curr_sent_start = i
+    return True


 cdef int _get_tokens_lca(Token token_j, Token token_k):
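The fixed three passes are replaced by a loop that repeats until _set_lr_kids_and_edges reports that every token's head lies inside that token's sentence, warning via W026 once more than ten passes are needed. A rough pure-Python sketch of the convergence test only, using a plain list of relative head offsets instead of the TokenC structs; the function name and toy data are illustrative, not spaCy internals:

    def heads_within_sents(heads, sent_starts):
        """Return True if every token's head falls inside the token's own sentence.

        heads: relative head offsets, as in spaCy (0 means the token is a root).
        sent_starts: set of token indices that begin a sentence.
        """
        length = len(heads)
        curr_start = 0
        for i in range(length):
            if (i > 0 and i in sent_starts) or i == length - 1:
                curr_end = i
                for j in range(curr_start, curr_end):
                    # heads[j] + j is the absolute index of token j's head
                    if heads[j] + j < curr_start or heads[j] + j >= curr_end + 1:
                        return False
                curr_start = i
        return True

    # toy example: two three-token sentences, all heads local
    print(heads_within_sents([1, 0, -1, 1, 0, -1], {0, 3}))  # True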
@@ -584,6 +584,22 @@ cdef class Span:
         else:
             return self.doc[root]

+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
+        """Create a `Span` object from the slice `span.text[start : end]`.
+
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        start_idx += self.start_char
+        end_idx += self.start_char
+        return self.doc.char_span(start_idx, end_idx)
+
     @property
     def conjuncts(self):
         """Tokens that are conjoined to the span's root.
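The new Span.char_span mirrors Doc.char_span but takes character offsets relative to the span's own text, shifting them by span.start_char before delegating to the parent Doc. A quick sketch, using a blank English pipeline and arbitrary text:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("The quick brown fox jumps over the lazy dog")
    span = doc[1:5]                 # "quick brown fox jumps"
    # offsets are relative to the span's own text, not to the doc
    sub = span.char_span(0, 15)
    print(sub.text)                 # "quick brown fox"

Note that in the version shown here the label, kb_id and vector arguments are accepted but not yet forwarded to Doc.char_span.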
@@ -208,7 +208,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
             factory = factories.get(name, name)
             component = nlp.create_pipe(factory, config=config)
             nlp.add_pipe(component, name=name)
-    return nlp.from_disk(model_path)
+    return nlp.from_disk(model_path, exclude=disable)


 def load_model_from_init_py(init_file, **overrides):
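In util.load_model_from_path, the names in disable are now also passed to Language.from_disk as exclude, so the on-disk data for those components is skipped during deserialization rather than loaded and thrown away. A hedged usage sketch, assuming the en_core_web_sm package is installed:

    import spacy

    # the disabled components are excluded when the pipeline is read from disk
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    print(nlp.pipe_names)  # e.g. ['tagger'], depending on the model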
@@ -301,13 +301,13 @@ def get_component_name(component):
         return repr(component)


-def get_cuda_stream(require=False):
+def get_cuda_stream(require=False, non_blocking=True):
     if CudaStream is None:
         return None
     elif isinstance(Model.ops, NumpyOps):
         return None
     else:
-        return CudaStream()
+        return CudaStream(non_blocking=non_blocking)


 def get_async(stream, numpy_array):
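get_cuda_stream gains a non_blocking flag that is forwarded to thinc's CudaStream when a GPU stream is available; on CPU it still returns None. For illustration only, since the result depends on whether a GPU is configured:

    from spacy import util

    stream = util.get_cuda_stream(non_blocking=True)
    print(stream)  # None on CPU, a CudaStream wrapper when running on GPU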
@@ -265,16 +265,11 @@ cdef class Vectors:
             rows = [self.key2row.get(key, -1.) for key in keys]
             return xp.asarray(rows, dtype="i")
         else:
-            targets = set()
+            row2key = {row: key for key, row in self.key2row.items()}
             if row is not None:
-                targets.add(row)
+                return row2key[row]
             else:
-                targets.update(rows)
-            results = []
-            for key, row in self.key2row.items():
-                if row in targets:
-                    results.append(key)
-                    targets.remove(row)
+                results = [row2key[row] for row in rows]
             return xp.asarray(results, dtype="uint64")

     def add(self, key, *, vector=None, row=None):
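The rewritten else branch of Vectors.find builds one reverse row2key mapping instead of scanning key2row for every target row, and returns the keys aligned with the requested rows. Rough usage with toy random vectors; note that keys come back as uint64 hash values, not strings:

    import numpy
    from spacy.vectors import Vectors

    data = numpy.random.uniform(-1, 1, (3, 4)).astype("f")
    vectors = Vectors(data=data, keys=["cat", "dog", "tree"])

    row = vectors.find(key="dog")       # row index for a single key
    keys = vectors.find(rows=[0, 2])    # hashed keys for several rows, in row order
    print(row, keys)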
@@ -3,7 +3,6 @@
 from __future__ import unicode_literals
-from libc.string cimport memcpy

 import numpy
 import srsly
 from collections import OrderedDict
 from thinc.neural.util import get_array_module
@@ -361,7 +360,8 @@ cdef class Vocab:
             minn = len(word)
         if maxn is None:
             maxn = len(word)
-        vectors = numpy.zeros((self.vectors_length,), dtype="f")
+        xp = get_array_module(self.vectors.data)
+        vectors = xp.zeros((self.vectors_length,), dtype="f")
         # Fasttext's ngram computation taken from
         # https://github.com/facebookresearch/fastText
         ngrams_size = 0;
@@ -381,7 +381,7 @@ cdef class Vocab:
                     j = j + 1
                 if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))):
                     if self.strings[ngram] in self.vectors.key2row:
-                        vectors = numpy.add(self.vectors[self.strings[ngram]],vectors)
+                        vectors = xp.add(self.vectors[self.strings[ngram]], vectors)
                         ngrams_size += 1
                 n = n + 1
         if ngrams_size > 0:
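The surrounding Vocab.get_vector code approximates vectors for out-of-vocabulary words by summing the vectors of the word's character n-grams, fastText-style; the change above simply keeps that sum on the same array module (numpy or cupy) as the vector table. A rough pure-Python sketch of just the n-gram enumeration, simplified in that it ignores the n == 1 boundary exclusion visible in the diff:

    def char_ngrams(word, minn, maxn):
        """Enumerate character n-grams of lengths minn..maxn, fastText-style."""
        ngrams = []
        for i in range(len(word)):
            for n in range(minn, maxn + 1):
                if i + n <= len(word):
                    ngrams.append(word[i:i + n])
        return ngrams

    print(char_ngrams("spacy", 3, 4))  # ['spa', 'spac', 'pac', 'pacy', 'acy']

Each n-gram that has an entry in the vector table contributes its vector to the sum, and the if ngrams_size > 0: branch suggests the result is then scaled by the number of matched n-grams.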
Some files were not shown because too many files have changed in this diff.