Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-25 00:34:20 +03:00)

Commit 4aa1002546: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
.flake8 (12 changes)

@@ -1,4 +1,14 @@
 [flake8]
-ignore = E203, E266, E501, W503
+ignore = E203, E266, E501, E731, W503
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
+exclude =
+    .env,
+    .git,
+    __pycache__,
+    lemmatizer.py,
+    lookup.py,
+    _tokenizer_exceptions_list.py,
+    spacy/lang/fr/lemmatizer,
+    spacy/lang/nb/lemmatizer
+    spacy/__init__.py
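For context, E731, newly added to the ignore list, is flake8's "do not assign a lambda expression, use a def" check. A minimal sketch (not from the spaCy code base) of the pattern this setting permits:

```python
# Minimal illustration: flake8 reports E731 when a lambda is bound to a name;
# with E731 in the .flake8 ignore list above, this style passes the linter.
normalize = lambda text: text.strip().lower()  # would be flagged as E731


# The rewrite flake8 would otherwise ask for:
def normalize_def(text: str) -> str:
    return text.strip().lower()


print(normalize("  spaCy  "))      # -> "spacy"
print(normalize_def("  spaCy  "))  # -> "spacy"
```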
.github/ISSUE_TEMPLATE.md (2 changes)

@@ -1,7 +1,7 @@
 <!--- Please provide a summary in the title and describe your issue here.
 Is this a bug or feature request? If a bug, include all the steps that led to the issue.
 
-If you're looking for help with your code, consider posting a question on StackOverflow instead:
+If you're looking for help with your code, consider posting a question on Stack Overflow instead:
 http://stackoverflow.com/questions/tagged/spacy -->
 
 
.github/ISSUE_TEMPLATE/05_other.md (4 changes)

@@ -1,11 +1,11 @@
 ---
 name: "\U0001F4AC Anything else?"
 about: For general usage questions or help with your code, please consider
-  posting on StackOverflow instead.
+  posting on Stack Overflow instead.
 
 ---
 
-<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on StackOverflow instead: http://stackoverflow.com/questions/tagged/spacy -->
+<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on Stack Overflow instead: http://stackoverflow.com/questions/tagged/spacy -->
 
 ## Your Environment
 <!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
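The environment comment above points to `python -m spacy info --markdown`. As a small convenience sketch (not part of this commit), the same report can be captured from a script and printed for pasting into the issue:

```python
# Sketch: capture the Markdown environment report that the issue template
# asks for by shelling out to the spaCy CLI command quoted above.
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "-m", "spacy", "info", "--markdown"],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)  # paste this into the "Your Environment" section
```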
.github/contributors/ALSchwalm.md (new file, 106 lines)

# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made) will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * Each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Adam Schwalm          |
| Company name (if applicable)   | Star Lab              |
| Title or role (if applicable)  | Software Engineer     |
| Date                           | 2018-11-28            |
| GitHub username                | ALSchwalm             |
| Website (optional)             | https://alschwalm.com |
.github/contributors/BramVanroy.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Bram Vanroy           |
| Company name (if applicable)   |                       |
| Title or role (if applicable)  |                       |
| Date                           | October 19, 2018      |
| GitHub username                | BramVanroy            |
| Website (optional)             | https://bramvanroy.be |
.github/contributors/Cinnamy.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Marina Lysyuk         |
| Company name (if applicable)   |                       |
| Title or role (if applicable)  |                       |
| Date                           | 13.10.2018            |
| GitHub username                | Cinnamy               |
| Website (optional)             |                       |
.github/contributors/JKhakpour.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries (neither statement is marked in the file):

* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Ja'far Khakpour       |
| Company name (if applicable)   |                       |
| Title or role (if applicable)  |                       |
| Date                           | 2018-09-24            |
| GitHub username                | JKhakpour             |
| Website (optional)             |                       |
.github/contributors/aniruddha-adhikary.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Aniruddha Adhikary    |
| Company name (if applicable)   |                       |
| Title or role (if applicable)  |                       |
| Date                           | 2018-09-05            |
| GitHub username                | aniruddha-adhikary    |
| Website (optional)             | https://adhikary.net  |
.github/contributors/aongko.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries:

* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Andrew Ongko          |
| Company name (if applicable)   | Kurio                 |
| Title or role (if applicable)  | Senior Data Science   |
| Date                           | Sep 10, 2018          |
| GitHub username                | aongko                |
| Website (optional)             |                       |
.github/contributors/aryaprabhudesai.md (new file, 54 lines)

A plain-text copy of the same spaCy contributor agreement, submitted without the Markdown formatting used in the other contributor files. Contributor-specific entries:

[X] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Arya Prabhudesai      |
| Company name (if applicable)   | -                     |
| Title or role (if applicable)  | -                     |
| Date                           | 2018-08-17            |
| GitHub username                | aryaprabhudesai       |
| Website (optional)             | -                     |
.github/contributors/charlax.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries:

* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
|------------------------------- | --------------------- |
| Name                           | Charles-Axel Dein     |
| Company name (if applicable)   | Skrib                 |
| Title or role (if applicable)  | CEO                   |
| Date                           | 27/09/2018            |
| GitHub username                | charlax               |
| Website (optional)             | www.dein.fr           |
.github/contributors/cicorias.md (new file, 106 lines)

Body text: the standard spaCy contributor agreement, identical to `.github/contributors/ALSchwalm.md` above. Contributor-specific entries:

* [X] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                       |
|------------------------------- | --------------------------- |
| Name                           | Shawn Cicoria               |
| Company name (if applicable)   | Microsoft                   |
| Title or role (if applicable)  | Principal Software Engineer |
| Date                           | November 20, 2018           |
| GitHub username                | cicorias                    |
| Website (optional)             | www.cicoria.com             |
106
.github/contributors/darindf.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Darin DeForest |
| Company name (if applicable) | Ipro Tech |
| Title or role (if applicable) | Senior Software Engineer |
| Date | 2018-09-26 |
| GitHub username | darindf |
| Website (optional) | |

106
.github/contributors/filipecaixeta.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Filipe Caixeta |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09.12.2018 |
| GitHub username | filipecaixeta |
| Website (optional) | filipecaixeta.com.br |

106
.github/contributors/frascuchon.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Francisco Aranda |
| Company name (if applicable) | recognai |
| Title or role (if applicable) | |
| Date | |
| GitHub username | frascuchon |
| Website (optional) | https://recogn.ai |

106
.github/contributors/free-variation.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | John Stewart |
| Company name (if applicable) | Amplify |
| Title or role (if applicable) | SVP Research |
| Date | 14/09/2018 |
| GitHub username | free-variation |
| Website (optional) | |

106
.github/contributors/gavrieltal.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Gavriel Loria |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Nov 29, 2018 |
| GitHub username | gavrieltal |
| Website (optional) | |

106
.github/contributors/grivaz.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | C. Grivaz |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 08.22.2018 |
| GitHub username | grivaz |
| Website (optional) | |

106
.github/contributors/jacopofar.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [X] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Jacopo Farina |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2018-10-12 |
| GitHub username | jacopofar |
| Website (optional) | jacopofarina.eu |

106
.github/contributors/keshan.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Keshan Sodimana |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Sep 21, 2018 |
| GitHub username | keshan |
| Website (optional) | |

106
.github/contributors/mbkupfer.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Maxim Kupfer |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Sep 6, 2018 |
| GitHub username | mbkupfer |
| Website (optional) | |

106
.github/contributors/mikelibg.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | ------------------------ |
| Name | Michael Liberman |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2018-11-08 |
| GitHub username | mikelibg |
| Website (optional) | |

106
.github/contributors/mpuig.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Marc Puig |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2018-11-17 |
|
||||
| GitHub username | mpuig |
|
||||
| Website (optional) | |
|
106
.github/contributors/phojnacki.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | ------------------------------------- |
|
||||
| Name | Przemysław Hojnacki |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 12/09/2018 |
|
||||
| GitHub username | phojnacki |
|
||||
| Website (optional) | https://about.me/przemyslaw.hojnacki |
|
106
.github/contributors/pzelasko.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Piotr Żelasko |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 04-09-2018 |
|
||||
| GitHub username | pzelasko |
|
||||
| Website (optional) | |
|
106
.github/contributors/sainathadapa.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Sainath Adapa |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2018-09-06 |
|
||||
| GitHub username | sainathadapa |
|
||||
| Website (optional) | |
|
106
.github/contributors/tyburam.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Mateusz Tybura |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 08.09.2018 |
|
||||
| GitHub username | tyburam |
|
||||
| Website (optional) | |
|
102
CONTRIBUTING.md
|
@ -26,7 +26,7 @@ also check the [troubleshooting guide](https://spacy.io/usage/#troubleshooting)
|
|||
to see if your problem is already listed there.
|
||||
|
||||
If you're looking for help with your code, consider posting a question on
|
||||
[StackOverflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
|
||||
[Stack Overflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
|
||||
tag it `spacy` and `python`, more people will see it and hopefully be able to
|
||||
help. Please understand that we won't be able to provide individual support via
|
||||
email. We also believe that help is much more valuable if it's **shared publicly**,
|
||||
|
@ -186,13 +186,99 @@ sure your test passes and reference the issue in your commit message.
|
|||
## Code conventions
|
||||
|
||||
Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).
|
||||
Regular line length is **80 characters**, with some tolerance for lines up to
|
||||
90 characters if the alternative would be worse — for instance, if your list
|
||||
comprehension comes to 82 characters, it's better not to split it over two lines.
|
||||
You can also use a linter like [`flake8`](https://pypi.python.org/pypi/flake8)
|
||||
or [`frosted`](https://pypi.python.org/pypi/frosted) – just keep in mind that
|
||||
it won't work very well for `.pyx` files and will complain about Cython syntax
|
||||
like `<int*>` or `cimport`.
|
||||
As of `v2.1.0`, spaCy uses [`black`](https://github.com/ambv/black) for code
|
||||
formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
||||
Python modules. If you've built spaCy from source, you'll already have both
|
||||
tools installed.
|
||||
|
||||
**⚠️ Note that formatting and linting are currently only possible for Python
|
||||
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||
|
||||
### Code formatting
|
||||
|
||||
[`black`](https://github.com/ambv/black) is an opinionated Python code
|
||||
formatter, optimised to produce readable code and small diffs. You can run
|
||||
`black` from the command-line, or via your code editor. For example, if you're
|
||||
using [Visual Studio Code](https://code.visualstudio.com/), you can add the
|
||||
following to your `settings.json` to use `black` for formatting and auto-format
|
||||
your files on save:
|
||||
|
||||
```json
|
||||
{
|
||||
"python.formatting.provider": "black",
|
||||
"[python]": {
|
||||
"editor.formatOnSave": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
[See here](https://github.com/ambv/black#editor-integration) for the full
|
||||
list of available editor integrations.
|
||||
|
||||
#### Disabling formatting
|
||||
|
||||
There are a few cases where auto-formatting doesn't improve readability – for
|
||||
example, in some of the language data files like `tag_map.py`, or in
|
||||
the tests that construct `Doc` objects from lists of words and other labels.
|
||||
Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting
|
||||
for that particular code. Here's an example:
|
||||
|
||||
```python
|
||||
# fmt: off
|
||||
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
||||
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
|
||||
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
||||
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
||||
"poss", "nsubj", "ccomp", "punct"]
|
||||
# fmt: on
|
||||
```
|
||||
|
||||
### Code linting
|
||||
|
||||
[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code
|
||||
style. It scans one or more files and outputs errors and warnings. This feedback
|
||||
can help you stick to general standards and conventions, and can be very useful
|
||||
for spotting potential mistakes and inconsistencies in your code. The most
|
||||
important things to watch out for are syntax errors and undefined names, but you
|
||||
also want to keep an eye on unused declared variables or repeated
|
||||
(i.e. overwritten) dictionary keys. If your code was formatted with `black`
|
||||
(see above), you shouldn't see any formatting-related warnings.
|
||||
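To make this concrete, here is a small, purely hypothetical module (all names are made up for illustration) containing the kinds of problems `flake8` will report before you ever run the code:

```python
import os  # unused import – reported as F401


def count_tokens(doc):
    counts = {}
    for token in doc:
        counts[token.text] = counts.get(token.text, 0) + 1
    return counts


def report(doc):
    totals = count_tokens(doc)  # assigned but never used – also reported
    # "total" below is an undefined name (a typo for "totals"), which flake8
    # catches statically, long before the function is ever called
    return sorted(total.items())
```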
|
||||
The [`.flake8`](.flake8) config defines the configuration we use for this
|
||||
codebase. For example, we're not super strict about the line length, and we're
|
||||
excluding very large files like lemmatization and tokenizer exception tables.
|
||||
|
||||
Ideally, running the following command from within the repo directory should
|
||||
not return any errors or warnings:
|
||||
|
||||
```bash
|
||||
flake8 spacy
|
||||
```
|
||||
|
||||
#### Disabling linting
|
||||
|
||||
Sometimes, you explicitly want to write code that's not compatible with our
|
||||
rules. For example, a module's `__init__.py` might import a function so other
|
||||
modules can import it from there, but `flake8` will complain about an unused
|
||||
import. And although it's generally discouraged, there might be cases where it
|
||||
makes sense to use a bare `except`.
|
||||
|
||||
To ignore a given line, you can add a comment like `# noqa: F401`, specifying
|
||||
the code of the error or warning you want to ignore. It's also possible to
|
||||
ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. Here
|
||||
are some examples:
|
||||
|
||||
```python
|
||||
# The imported class isn't used in this file, but imported here, so it can be
|
||||
# imported *from* here by another module.
|
||||
from .submodule import SomeClass # noqa: F401
|
||||
|
||||
try:
|
||||
do_something()
|
||||
except: # noqa: E722
|
||||
# This bare except is justified, for some specific reason
|
||||
do_something_else()
|
||||
```
|
||||
|
||||
### Python conventions
|
||||
|
||||
|
|
|
@ -1,83 +0,0 @@
|
|||
# 👥 Contributors
|
||||
|
||||
This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work!
|
||||
|
||||
* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer)
|
||||
* Alexey Kim, [@yuukos](https://github.com/yuukos)
|
||||
* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman)
|
||||
* Ali Zarezade, [@azarezade](https://github.com/azarezade)
|
||||
* Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv)
|
||||
* Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th)
|
||||
* Aniruddha Adhikary, [@aniruddha-adhikary](https://github.com/aniruddha-adhikary)
|
||||
* Anto Binish Kaspar, [@binishkaspar](https://github.com/binishkaspar)
|
||||
* Avadh Patel, [@avadhpatel](https://github.com/avadhpatel)
|
||||
* Ben Eyal, [@beneyal](https://github.com/beneyal)
|
||||
* Bhargav Srinivasa, [@bhargavvader](https://github.com/bhargavvader)
|
||||
* Bruno P. Kinoshita, [@kinow](https://github.com/kinow)
|
||||
* Canbey Bilgili, [@cbilgili](https://github.com/cbilgili)
|
||||
* Chris DuBois, [@chrisdubois](https://github.com/chrisdubois)
|
||||
* Christoph Schwienheer, [@chssch](https://github.com/chssch)
|
||||
* Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
|
||||
* Daniel Rapp, [@rappdw](https://github.com/rappdw)
|
||||
* Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo)
|
||||
* Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
|
||||
* Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
|
||||
* Francisco Aranda, [@frascuchon](https://github.com/frascuchon)
|
||||
* Greg Baker, [@solresol](https://github.com/solresol)
|
||||
* Greg Dubbin, [@GregDubbin](https://github.com/GregDubbin)
|
||||
* Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard)
|
||||
* György Orosz, [@oroszgy](https://github.com/oroszgy)
|
||||
* Henning Peters, [@henningpeters](https://github.com/henningpeters)
|
||||
* Iddo Berger, [@iddoberger](https://github.com/iddoberger)
|
||||
* Ines Montani, [@ines](https://github.com/ines)
|
||||
* J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading)
|
||||
* Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan)
|
||||
* Jim Geovedi, [@geovedi](https://github.com/geovedi)
|
||||
* Jim Regan, [@jimregan](https://github.com/jimregan)
|
||||
* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG)
|
||||
* Jordan Suchow, [@suchow](https://github.com/suchow)
|
||||
* Josh Reeter, [@jreeter](https://github.com/jreeter)
|
||||
* Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks)
|
||||
* Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
|
||||
* Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
|
||||
* Leif Uwe Vogelsang, [@luvogels](https://github.com/luvogels)
|
||||
* Liling Tan, [@alvations](https://github.com/alvations)
|
||||
* Magnus Burton, [@magnusburton](https://github.com/magnusburton)
|
||||
* Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage)
|
||||
* Matthew Honnibal, [@honnibal](https://github.com/honnibal)
|
||||
* Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
|
||||
* Michael Wallin, [@wallinm1](https://github.com/wallinm1)
|
||||
* Miguel Almeida, [@mamoit](https://github.com/mamoit)
|
||||
* Motoki Wu, [@tokestermw](https://github.com/tokestermw)
|
||||
* Ole Henrik Skogstrøm, [@ohenrik](https://github.com/ohenrik)
|
||||
* Oleg Zd, [@olegzd](https://github.com/olegzd)
|
||||
* Orhan Bilgin, [@melanuria](https://github.com/melanuria)
|
||||
* Orion Montoya, [@mdcclv](https://github.com/mdcclv)
|
||||
* Paul O'Leary McCann, [@polm](https://github.com/polm)
|
||||
* Pokey Rule, [@pokey](https://github.com/pokey)
|
||||
* Ramanan Balakrishnan, [@ramananbalakrishnan](https://github.com/ramananbalakrishnan)
|
||||
* Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202)
|
||||
* Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort)
|
||||
* Roman Domrachev, [@ligser](https://github.com/ligser)
|
||||
* Roman Inflianskas, [@rominf](https://github.com/rominf)
|
||||
* Sam Bozek, [@sambozek](https://github.com/sambozek)
|
||||
* Sasho Savkov, [@savkov](https://github.com/savkov)
|
||||
* Shuvanon Razik, [@shuvanon](https://github.com/shuvanon)
|
||||
* Søren Lind Kristiansen, [@sorenlind](https://github.com/sorenlind)
|
||||
* Swier, [@swierh](https://github.com/swierh)
|
||||
* Thomas Tanon, [@Tpt](https://github.com/Tpt)
|
||||
* Thomas Opsomer, [@thomasopsomer](https://github.com/thomasopsomer)
|
||||
* Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues)
|
||||
* Vadim Mazaev, [@GreenRiverRUS](https://github.com/GreenRiverRUS)
|
||||
* Vimos Tan, [@Vimos](https://github.com/Vimos)
|
||||
* Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov)
|
||||
* Wah Loon Keng, [@kengz](https://github.com/kengz)
|
||||
* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom)
|
||||
* Willem van Hage, [@wrvhage](https://github.com/wrvhage)
|
||||
* Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
|
||||
* Yam, [@hscspring](https://github.com/hscspring)
|
||||
* Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
|
||||
* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
|
||||
* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh)
|
||||
* Yubing Dong, [@tomtung](https://github.com/tomtung)
|
||||
* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter)
|
|
@ -35,41 +35,49 @@ import subprocess
|
|||
import argparse
|
||||
|
||||
|
||||
HASH_FILE = 'cythonize.json'
|
||||
HASH_FILE = "cythonize.json"
|
||||
|
||||
|
||||
def process_pyx(fromfile, tofile, language_level='-2'):
|
||||
print('Processing %s' % fromfile)
|
||||
def process_pyx(fromfile, tofile, language_level="-2"):
|
||||
print("Processing %s" % fromfile)
|
||||
try:
|
||||
from Cython.Compiler.Version import version as cython_version
|
||||
from distutils.version import LooseVersion
|
||||
if LooseVersion(cython_version) < LooseVersion('0.19'):
|
||||
raise Exception('Require Cython >= 0.19')
|
||||
|
||||
if LooseVersion(cython_version) < LooseVersion("0.19"):
|
||||
raise Exception("Require Cython >= 0.19")
|
||||
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
flags = ['--fast-fail', language_level]
|
||||
if tofile.endswith('.cpp'):
|
||||
flags += ['--cplus']
|
||||
flags = ["--fast-fail", language_level]
|
||||
if tofile.endswith(".cpp"):
|
||||
flags += ["--cplus"]
|
||||
|
||||
try:
|
||||
try:
|
||||
r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile],
|
||||
env=os.environ) # See Issue #791
|
||||
r = subprocess.call(
|
||||
["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
|
||||
) # See Issue #791
|
||||
if r != 0:
|
||||
raise Exception('Cython failed')
|
||||
raise Exception("Cython failed")
|
||||
except OSError:
|
||||
# There are ways of installing Cython that don't result in a cython
|
||||
# executable on the path, see gh-2397.
|
||||
r = subprocess.call([sys.executable, '-c',
|
||||
'import sys; from Cython.Compiler.Main import '
|
||||
'setuptools_main as main; sys.exit(main())'] + flags +
|
||||
['-o', tofile, fromfile])
|
||||
r = subprocess.call(
|
||||
[
|
||||
sys.executable,
|
||||
"-c",
|
||||
"import sys; from Cython.Compiler.Main import "
|
||||
"setuptools_main as main; sys.exit(main())",
|
||||
]
|
||||
+ flags
|
||||
+ ["-o", tofile, fromfile]
|
||||
)
|
||||
if r != 0:
|
||||
raise Exception('Cython failed')
|
||||
raise Exception("Cython failed")
|
||||
except OSError:
|
||||
raise OSError('Cython needs to be installed')
|
||||
raise OSError("Cython needs to be installed")
|
||||
|
||||
|
||||
def preserve_cwd(path, func, *args):
|
||||
|
@ -89,12 +97,12 @@ def load_hashes(filename):
|
|||
|
||||
|
||||
def save_hashes(hash_db, filename):
|
||||
with open(filename, 'w') as f:
|
||||
with open(filename, "w") as f:
|
||||
f.write(json.dumps(hash_db))
|
||||
|
||||
|
||||
def get_hash(path):
|
||||
return hashlib.md5(open(path, 'rb').read()).hexdigest()
|
||||
return hashlib.md5(open(path, "rb").read()).hexdigest()
|
||||
|
||||
|
||||
def hash_changed(base, path, db):
|
||||
|
@ -109,25 +117,27 @@ def hash_add(base, path, db):
|
|||
|
||||
def process(base, filename, db):
|
||||
root, ext = os.path.splitext(filename)
|
||||
if ext in ['.pyx', '.cpp']:
|
||||
if hash_changed(base, filename, db) or not os.path.isfile(os.path.join(base, root + '.cpp')):
|
||||
preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
|
||||
hash_add(base, root + '.cpp', db)
|
||||
hash_add(base, root + '.pyx', db)
|
||||
if ext in [".pyx", ".cpp"]:
|
||||
if hash_changed(base, filename, db) or not os.path.isfile(
|
||||
os.path.join(base, root + ".cpp")
|
||||
):
|
||||
preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
|
||||
hash_add(base, root + ".cpp", db)
|
||||
hash_add(base, root + ".pyx", db)
|
||||
|
||||
|
||||
def check_changes(root, db):
|
||||
res = False
|
||||
new_db = {}
|
||||
|
||||
setup_filename = 'setup.py'
|
||||
hash_add('.', setup_filename, new_db)
|
||||
if hash_changed('.', setup_filename, db):
|
||||
setup_filename = "setup.py"
|
||||
hash_add(".", setup_filename, new_db)
|
||||
if hash_changed(".", setup_filename, db):
|
||||
res = True
|
||||
|
||||
for base, _, files in os.walk(root):
|
||||
for filename in files:
|
||||
if filename.endswith('.pxd'):
|
||||
if filename.endswith(".pxd"):
|
||||
hash_add(base, filename, new_db)
|
||||
if hash_changed(base, filename, db):
|
||||
res = True
|
||||
|
@ -150,8 +160,10 @@ def run(root):
|
|||
save_hashes(db, HASH_FILE)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed')
|
||||
parser.add_argument('root', help='root directory')
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Cythonize pyx files into C++ files as needed"
|
||||
)
|
||||
parser.add_argument("root", help="root directory")
|
||||
args = parser.parse_args()
|
||||
run(args.root)
|
||||
|
|
|
@ -15,12 +15,13 @@ _unset = object()
|
|||
|
||||
class Reddit(object):
|
||||
"""Stream cleaned comments from Reddit."""
|
||||
pre_format_re = re.compile(r'^[\`\*\~]')
|
||||
post_format_re = re.compile(r'[\`\*\~]$')
|
||||
url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
|
||||
link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
|
||||
|
||||
def __init__(self, file_path, meta_keys={'subreddit': 'section'}):
|
||||
pre_format_re = re.compile(r"^[\`\*\~]")
|
||||
post_format_re = re.compile(r"[\`\*\~]$")
|
||||
url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
|
||||
link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")
|
||||
|
||||
def __init__(self, file_path, meta_keys={"subreddit": "section"}):
|
||||
"""
|
||||
file_path (unicode / Path): Path to archive or directory of archives.
|
||||
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
||||
|
@ -45,28 +46,30 @@ class Reddit(object):
|
|||
continue
|
||||
comment = ujson.loads(line)
|
||||
if self.is_valid(comment):
|
||||
text = self.strip_tags(comment['body'])
|
||||
yield {'text': text}
|
||||
text = self.strip_tags(comment["body"])
|
||||
yield {"text": text}
|
||||
|
||||
def get_meta(self, item):
|
||||
return {name: item.get(key, 'n/a') for key, name in self.meta.items()}
|
||||
return {name: item.get(key, "n/a") for key, name in self.meta.items()}
|
||||
|
||||
def iter_files(self):
|
||||
for file_path in self.files:
|
||||
yield file_path
|
||||
|
||||
def strip_tags(self, text):
|
||||
text = self.link_re.sub(r'\1', text)
|
||||
text = text.replace('&gt;', '>').replace('&lt;', '<')
|
||||
text = self.pre_format_re.sub('', text)
|
||||
text = self.post_format_re.sub('', text)
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
text = self.link_re.sub(r"\1", text)
|
||||
">
text = text.replace("&gt;", ">").replace("&lt;", "<")
|
||||
text = self.pre_format_re.sub("", text)
|
||||
text = self.post_format_re.sub("", text)
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
return text.strip()
|
||||
|
||||
def is_valid(self, comment):
|
||||
return comment['body'] is not None \
|
||||
and comment['body'] != '[deleted]' \
|
||||
and comment['body'] != '[removed]'
|
||||
return (
|
||||
comment["body"] is not None
|
||||
and comment["body"] != "[deleted]"
|
||||
and comment["body"] != "[removed]"
|
||||
)
|
||||
|
||||
|
||||
def main(path):
|
||||
|
@ -75,16 +78,18 @@ def main(path):
|
|||
print(ujson.dumps(comment))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import socket
|
||||
|
||||
try:
|
||||
BrokenPipeError
|
||||
except NameError:
|
||||
BrokenPipeError = socket.error
|
||||
try:
|
||||
plac.call(main)
|
||||
except BrokenPipeError:
|
||||
except BrokenPipeError:
|
||||
import os, sys
|
||||
|
||||
# Python flushes standard streams on exit; redirect remaining output
|
||||
# to devnull to avoid another BrokenPipeError at shutdown
|
||||
devnull = os.open(os.devnull, os.O_WRONLY)
|
||||
|
|
|
@ -7,6 +7,7 @@ git diff-index --quiet HEAD
|
|||
|
||||
git checkout $1
|
||||
git pull origin $1
|
||||
|
||||
version=$(grep "__version__ = " spacy/about.py)
|
||||
version=${version/__version__ = }
|
||||
version=${version/\'/}
|
||||
|
|
|
@ -92,11 +92,13 @@ def get_features(docs, max_length):
|
|||
def train(train_texts, train_labels, dev_texts, dev_labels,
|
||||
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
|
||||
nb_epoch=5, by_sentence=True):
|
||||
|
||||
print("Loading spaCy")
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||
embeddings = get_embeddings(nlp.vocab)
|
||||
model = compile_lstm(embeddings, lstm_shape, lstm_settings)
|
||||
|
||||
print("Parsing texts...")
|
||||
train_docs = list(nlp.pipe(train_texts))
|
||||
dev_docs = list(nlp.pipe(dev_texts))
|
||||
|
@ -107,7 +109,7 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
|
|||
train_X = get_features(train_docs, lstm_shape['max_length'])
|
||||
dev_X = get_features(dev_docs, lstm_shape['max_length'])
|
||||
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
|
||||
nb_epoch=nb_epoch, batch_size=batch_size)
|
||||
epochs=nb_epoch, batch_size=batch_size)
|
||||
return model
|
||||
|
||||
|
||||
|
@ -138,15 +140,9 @@ def get_embeddings(vocab):
|
|||
|
||||
|
||||
def evaluate(model_dir, texts, labels, max_length=100):
|
||||
def create_pipeline(nlp):
|
||||
'''
|
||||
This could be a lambda, but named functions are easier to read in Python.
|
||||
'''
|
||||
return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
|
||||
max_length=max_length)]
|
||||
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline = create_pipeline(nlp)
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||
nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))
|
||||
|
||||
correct = 0
|
||||
i = 0
|
||||
|
@ -186,7 +182,7 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
|
|||
is_runtime=False,
|
||||
nr_hidden=64, max_length=100, # Shape
|
||||
dropout=0.5, learn_rate=0.001, # General NN config
|
||||
nb_epoch=5, batch_size=100, nr_examples=-1): # Training params
|
||||
nb_epoch=5, batch_size=256, nr_examples=-1): # Training params
|
||||
if model_dir is not None:
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if train_dir is None or dev_dir is None:
|
||||
|
@ -219,7 +215,7 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
|
|||
if model_dir is not None:
|
||||
with (model_dir / 'model').open('wb') as file_:
|
||||
pickle.dump(weights[1:], file_)
|
||||
with (model_dir / 'config.json').open('wb') as file_:
|
||||
with (model_dir / 'config.json').open('w') as file_:
|
||||
file_.write(lstm.to_json())
|
||||
|
||||
|
||||
|
|
|
@ -2,11 +2,7 @@
|
|||
|
||||
# A decomposable attention model for Natural Language Inference
|
||||
**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)**
|
||||
|
||||
> ⚠️ **IMPORTANT NOTE:** This example is currently only compatible with spaCy
|
||||
> v1.x. We're working on porting the example over to Keras v2.x and spaCy v2.x.
|
||||
> See [#1445](https://github.com/explosion/spaCy/issues/1445) for details –
|
||||
> contributions welcome!
|
||||
**Updated for spaCy 2.0+ and Keras 2.2.2+ by John Stewart, [@free-variation](https://github.com/free-variation)**
|
||||
|
||||
This directory contains an implementation of the entailment prediction model described
|
||||
by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable
|
||||
|
@ -21,19 +17,25 @@ hook is installed to customise the `.similarity()` method of spaCy's `Doc`
|
|||
and `Span` objects:
|
||||
|
||||
```python
|
||||
def demo(model_dir):
|
||||
nlp = spacy.load('en', path=model_dir,
|
||||
create_pipeline=create_similarity_pipeline)
|
||||
doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
|
||||
doc2 = nlp(u'The milkshakes are good. The fries are bad.')
|
||||
print(doc1.similarity(doc2))
|
||||
sent1a, sent1b = doc1.sents
|
||||
print(sent1a.similarity(sent1b))
|
||||
print(sent1a.similarity(doc2))
|
||||
print(sent1b.similarity(doc2))
|
||||
def demo(shape):
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||
|
||||
doc1 = nlp(u'The king of France is bald.')
|
||||
doc2 = nlp(u'France has no king.')
|
||||
|
||||
print("Sentence 1:", doc1)
|
||||
print("Sentence 2:", doc2)
|
||||
|
||||
entailment_type, confidence = doc1.similarity(doc2)
|
||||
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
||||
```
|
||||
|
||||
Which gives the output `Entailment type: contradiction (Confidence: 0.60604566)`, showing that
|
||||
the system has definite opinions about Bertrand Russell's [famous conundrum](https://users.drew.edu/jlenz/br-on-denoting.html)!
|
||||
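If you're curious how the shim plugs into spaCy, here's a minimal sketch of the underlying mechanism: a pipeline component can register a callable in `doc.user_hooks["similarity"]`, and spaCy then routes `doc.similarity()` calls through it. The component and the toy scoring function below are illustrative stand-ins, not the actual `KerasSimilarityShim`:

```python
import numpy
import spacy


class SimilarityHook(object):
    """Illustrative component: routes doc.similarity() through `model`."""

    def __init__(self, model):
        self.model = model  # any callable taking (doc1, doc2)

    def __call__(self, doc):
        # Register hooks so Doc.similarity and Span.similarity use our model
        doc.user_hooks["similarity"] = self.model
        doc.user_span_hooks["similarity"] = self.model
        return doc


def toy_model(doc1, doc2):
    # Stand-in for the trained Keras model
    return numpy.dot(doc1.vector, doc2.vector)


nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(SimilarityHook(toy_model))
doc1 = nlp(u"The king of France is bald.")
doc2 = nlp(u"France has no king.")
print(doc1.similarity(doc2))
```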
|
||||
I'm working on a blog post to explain Parikh et al.'s model in more detail.
|
||||
A [notebook](https://github.com/free-variation/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb) is available that briefly explains this implementation.
|
||||
I think it is a very interesting example of the attention mechanism, which
|
||||
I didn't understand very well before working through this paper. There are
|
||||
lots of ways to extend the model.
|
||||
|
@ -43,7 +45,7 @@ lots of ways to extend the model.
|
|||
| File | Description |
|
||||
| --- | --- |
|
||||
| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. |
|
||||
| `spacy_hook.py` | Provides a class `SimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. |
|
||||
| `spacy_hook.py` | Provides a class `KerasSimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. |
|
||||
| `keras_decomposable_attention.py` | Defines the neural network model. |
|
||||
|
||||
## Setting up
|
||||
|
@ -52,17 +54,13 @@ First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spa
|
|||
English models (about 1GB of data):
|
||||
|
||||
```bash
|
||||
pip install https://github.com/fchollet/keras/archive/1.2.2.zip
|
||||
pip install keras
|
||||
pip install spacy
|
||||
python -m spacy.en.download
|
||||
python -m spacy download en_vectors_web_lg
|
||||
```
|
||||
|
||||
⚠️ **Important:** In order for the example to run, you'll need to install Keras from
|
||||
the 1.2.2 release (and not via `pip install keras`). For more info on this, see
|
||||
[#727](https://github.com/explosion/spaCy/issues/727).
|
||||
|
||||
You'll also want to get Keras working on your GPU. This will depend on your
|
||||
set up, so you're mostly on your own for this step. If you're using AWS, try the
|
||||
You'll also want to get Keras working on your GPU, and you will need a backend, such as TensorFlow or Theano.
|
||||
This will depend on your setup, so you're mostly on your own for this step. If you're using AWS, try the
|
||||
[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy.
|
||||
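Before training, it can be worth a quick sanity check that Keras picked up the backend you expect and that a GPU is visible. This snippet assumes the TensorFlow backend and is not part of the example itself:

```python
from keras import backend as K
import tensorflow as tf

print("Keras backend:", K.backend())            # expect "tensorflow"
print("GPU available:", tf.test.is_gpu_available())
```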
|
||||
Once you've installed the dependencies, you can run a small preliminary test of
|
||||
|
@ -80,22 +78,35 @@ Finally, download the [Stanford Natural Language Inference corpus](http://nlp.st
|
|||
## Running the example
|
||||
|
||||
You can run the `keras_parikh_entailment/` directory as a script, which executes the file
|
||||
[`keras_parikh_entailment/__main__.py`](__main__.py). The first thing you'll want to do is train the model:
|
||||
[`keras_parikh_entailment/__main__.py`](__main__.py). If you run the script without arguments
|
||||
the usage is shown. Running it with `-h` explains the command line arguments.
|
||||
|
||||
The first thing you'll want to do is train the model:
|
||||
|
||||
```bash
|
||||
python keras_parikh_entailment/ train <train_directory> <dev_directory>
|
||||
python keras_parikh_entailment/ train -t <path to SNLI train JSON> -s <path to SNLI dev JSON>
|
||||
```
|
||||
|
||||
Training takes about 300 epochs for full accuracy, and I haven't rerun the full
|
||||
experiment since refactoring things to publish this example — please let me
|
||||
know if I've broken something. You should get to at least 85% on the development data.
|
||||
know if I've broken something. You should get to at least 85% on the development data even after 10-15 epochs.
|
||||
|
||||
The other two modes demonstrate run-time usage. I never like relying on the accuracy printed
|
||||
by `.fit()` methods. I never really feel confident until I've run a new process that loads
|
||||
the model and starts making predictions, without access to the gold labels. I've therefore
|
||||
included an `evaluate` mode. Finally, there's also a little demo, which mostly exists to show
|
||||
included an `evaluate` mode.
|
||||
|
||||
```bash
|
||||
python keras_parikh_entailment/ evaluate -s <path to SNLI train JSON>
|
||||
```
|
||||
|
||||
Finally, there's also a little demo, which mostly exists to show
|
||||
you how run-time usage will eventually look.
|
||||
|
||||
```bash
|
||||
python keras_parikh_entailment/ demo
|
||||
```
|
||||
|
||||
## Getting updates
|
||||
|
||||
We should have the blog post explaining the model ready before the end of the week. To get
|
||||
|
|
|
@ -1,82 +1,104 @@
|
|||
from __future__ import division, unicode_literals, print_function
|
||||
import spacy
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import ujson as json
|
||||
import numpy
|
||||
from keras.utils.np_utils import to_categorical
|
||||
|
||||
from spacy_hook import get_embeddings, get_word_ids
|
||||
from spacy_hook import create_similarity_pipeline
|
||||
from keras.utils import to_categorical
|
||||
import plac
|
||||
import sys
|
||||
|
||||
from keras_decomposable_attention import build_model
|
||||
from spacy_hook import get_embeddings, KerasSimilarityShim
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
except ImportError:
|
||||
import pickle
|
||||
|
||||
import spacy
|
||||
|
||||
# workaround for keras/tensorflow bug
|
||||
# see https://github.com/tensorflow/tensorflow/issues/3388
|
||||
import os
|
||||
import importlib
|
||||
from keras import backend as K
|
||||
|
||||
def set_keras_backend(backend):
|
||||
if K.backend() != backend:
|
||||
os.environ['KERAS_BACKEND'] = backend
|
||||
importlib.reload(K)
|
||||
assert K.backend() == backend
|
||||
if backend == "tensorflow":
|
||||
K.get_session().close()
|
||||
cfg = K.tf.ConfigProto()
|
||||
cfg.gpu_options.allow_growth = True
|
||||
K.set_session(K.tf.Session(config=cfg))
|
||||
K.clear_session()
|
||||
|
||||
set_keras_backend("tensorflow")
|
||||
|
||||
|
||||
def train(train_loc, dev_loc, shape, settings):
|
||||
train_texts1, train_texts2, train_labels = read_snli(train_loc)
|
||||
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
||||
|
||||
print("Loading spaCy")
|
||||
nlp = spacy.load('en')
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
assert nlp.path is not None
|
||||
|
||||
print("Processing texts...")
|
||||
train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
|
||||
dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])
|
||||
|
||||
print("Compiling network")
|
||||
model = build_model(get_embeddings(nlp.vocab), shape, settings)
|
||||
print("Processing texts...")
|
||||
Xs = []
|
||||
for texts in (train_texts1, train_texts2, dev_texts1, dev_texts2):
|
||||
Xs.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
|
||||
max_length=shape[0],
|
||||
rnn_encode=settings['gru_encode'],
|
||||
tree_truncate=settings['tree_truncate']))
|
||||
train_X1, train_X2, dev_X1, dev_X2 = Xs
|
||||
|
||||
print(settings)
|
||||
model.fit(
|
||||
[train_X1, train_X2],
|
||||
train_X,
|
||||
train_labels,
|
||||
validation_data=([dev_X1, dev_X2], dev_labels),
|
||||
nb_epoch=settings['nr_epoch'],
|
||||
batch_size=settings['batch_size'])
|
||||
validation_data = (dev_X, dev_labels),
|
||||
epochs = settings['nr_epoch'],
|
||||
batch_size = settings['batch_size'])
|
||||
|
||||
if not (nlp.path / 'similarity').exists():
|
||||
(nlp.path / 'similarity').mkdir()
|
||||
print("Saving to", nlp.path / 'similarity')
|
||||
weights = model.get_weights()
|
||||
# remove the embedding matrix. We can reconstruct it.
|
||||
del weights[1]
|
||||
with (nlp.path / 'similarity' / 'model').open('wb') as file_:
|
||||
pickle.dump(weights[1:], file_)
|
||||
with (nlp.path / 'similarity' / 'config.json').open('wb') as file_:
|
||||
pickle.dump(weights, file_)
|
||||
with (nlp.path / 'similarity' / 'config.json').open('w') as file_:
|
||||
file_.write(model.to_json())
|
||||
|
||||
|
||||
def evaluate(dev_loc):
|
||||
def evaluate(dev_loc, shape):
|
||||
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
||||
nlp = spacy.load('en',
|
||||
create_pipeline=create_similarity_pipeline)
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||
|
||||
total = 0.
|
||||
correct = 0.
|
||||
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
|
||||
doc1 = nlp(text1)
|
||||
doc2 = nlp(text2)
|
||||
sim = doc1.similarity(doc2)
|
||||
if sim.argmax() == label.argmax():
|
||||
sim, _ = doc1.similarity(doc2)
|
||||
if sim == KerasSimilarityShim.entailment_types[label.argmax()]:
|
||||
correct += 1
|
||||
total += 1
|
||||
return correct, total
|
||||
|
||||
|
||||
def demo():
|
||||
nlp = spacy.load('en',
|
||||
create_pipeline=create_similarity_pipeline)
|
||||
doc1 = nlp(u'What were the best crime fiction books in 2016?')
|
||||
doc2 = nlp(
|
||||
u'What should I read that was published last year? I like crime stories.')
|
||||
print(doc1)
|
||||
print(doc2)
|
||||
print("Similarity", doc1.similarity(doc2))
|
||||
def demo(shape):
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||
|
||||
doc1 = nlp(u'The king of France is bald.')
|
||||
doc2 = nlp(u'France has no king.')
|
||||
|
||||
print("Sentence 1:", doc1)
|
||||
print("Sentence 2:", doc2)
|
||||
|
||||
entailment_type, confidence = doc1.similarity(doc2)
|
||||
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
||||
|
||||
|
||||
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
|
||||
|
@ -84,56 +106,92 @@ def read_snli(path):
|
|||
texts1 = []
|
||||
texts2 = []
|
||||
labels = []
|
||||
with path.open() as file_:
|
||||
with open(path, 'r') as file_:
|
||||
for line in file_:
|
||||
eg = json.loads(line)
|
||||
label = eg['gold_label']
|
||||
if label == '-':
|
||||
if label == '-': # per Parikh, ignore - SNLI entries
|
||||
continue
|
||||
texts1.append(eg['sentence1'])
|
||||
texts2.append(eg['sentence2'])
|
||||
labels.append(LABELS[label])
|
||||
return texts1, texts2, to_categorical(numpy.asarray(labels, dtype='int32'))
|
||||
return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))
|
||||
|
||||
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
|
||||
sents = texts + hypotheses
|
||||
|
||||
sents_as_ids = []
|
||||
for sent in sents:
|
||||
doc = nlp(sent)
|
||||
word_ids = []
|
||||
|
||||
for i, token in enumerate(doc):
|
||||
# skip odd spaces from tokenizer
|
||||
if token.has_vector and token.vector_norm == 0:
|
||||
continue
|
||||
|
||||
if i > max_length:
|
||||
break
|
||||
|
||||
if token.has_vector:
|
||||
word_ids.append(token.rank + num_unk + 1)
|
||||
else:
|
||||
# if we don't have a vector, pick an OOV entry
|
||||
word_ids.append(token.rank % num_unk + 1)
|
||||
|
||||
# there must be a simpler way of generating padded arrays from lists...
|
||||
word_id_vec = np.zeros((max_length), dtype='int')
|
||||
clipped_len = min(max_length, len(word_ids))
|
||||
word_id_vec[:clipped_len] = word_ids[:clipped_len]
|
||||
sents_as_ids.append(word_id_vec)
|
||||
|
||||
|
||||
return [np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])]
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
|
||||
train_loc=("Path to training data", "positional", None, Path),
|
||||
dev_loc=("Path to development data", "positional", None, Path),
|
||||
train_loc=("Path to training data", "option", "t", str),
|
||||
dev_loc=("Path to development or test data", "option", "s", str),
|
||||
max_length=("Length to truncate sentences", "option", "L", int),
|
||||
nr_hidden=("Number of hidden units", "option", "H", int),
|
||||
dropout=("Dropout level", "option", "d", float),
|
||||
learn_rate=("Learning rate", "option", "e", float),
|
||||
learn_rate=("Learning rate", "option", "r", float),
|
||||
batch_size=("Batch size for neural network training", "option", "b", int),
|
||||
nr_epoch=("Number of training epochs", "option", "i", int),
|
||||
tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
|
||||
gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
|
||||
nr_epoch=("Number of training epochs", "option", "e", int),
|
||||
entail_dir=("Direction of entailment", "option", "D", str, ["both", "left", "right"])
|
||||
)
|
||||
def main(mode, train_loc, dev_loc,
|
||||
tree_truncate=False,
|
||||
gru_encode=False,
|
||||
max_length=100,
|
||||
nr_hidden=100,
|
||||
dropout=0.2,
|
||||
learn_rate=0.001,
|
||||
batch_size=100,
|
||||
nr_epoch=5):
|
||||
max_length = 50,
|
||||
nr_hidden = 200,
|
||||
dropout = 0.2,
|
||||
learn_rate = 0.001,
|
||||
batch_size = 1024,
|
||||
nr_epoch = 10,
|
||||
entail_dir="both"):
|
||||
|
||||
shape = (max_length, nr_hidden, 3)
|
||||
settings = {
|
||||
'lr': learn_rate,
|
||||
'dropout': dropout,
|
||||
'batch_size': batch_size,
|
||||
'nr_epoch': nr_epoch,
|
||||
'tree_truncate': tree_truncate,
|
||||
'gru_encode': gru_encode
|
||||
'entail_dir': entail_dir
|
||||
}
|
||||
|
||||
if mode == 'train':
|
||||
if train_loc == None or dev_loc == None:
|
||||
print("Train mode requires paths to training and development data sets.")
|
||||
sys.exit(1)
|
||||
train(train_loc, dev_loc, shape, settings)
|
||||
elif mode == 'evaluate':
|
||||
correct, total = evaluate(dev_loc)
|
||||
if dev_loc == None:
|
||||
print("Evaluate mode requires paths to test data set.")
|
||||
sys.exit(1)
|
||||
correct, total = evaluate(dev_loc, shape)
|
||||
print(correct, '/', total, correct / total)
|
||||
else:
|
||||
demo()
|
||||
demo(shape)
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
|
|
@ -1,259 +1,137 @@
|
|||
# Semantic similarity with decomposable attention (using spaCy and Keras)
|
||||
# Practical state-of-the-art text similarity with spaCy and Keras
|
||||
import numpy
|
||||
|
||||
from keras.layers import InputSpec, Layer, Input, Dense, merge
|
||||
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
|
||||
from keras.layers import Bidirectional, GRU, LSTM
|
||||
from keras.layers.noise import GaussianNoise
|
||||
from keras.layers.advanced_activations import ELU
|
||||
import keras.backend as K
|
||||
from keras.models import Sequential, Model, model_from_json
|
||||
from keras.regularizers import l2
|
||||
from keras.optimizers import Adam
|
||||
from keras.layers.normalization import BatchNormalization
|
||||
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
|
||||
from keras.layers import Merge
|
||||
# Semantic entailment/similarity with decomposable attention (using spaCy and Keras)
|
||||
# Practical state-of-the-art textual entailment with spaCy and Keras
|
||||
|
||||
import numpy as np
|
||||
from keras import layers, Model, models, optimizers
|
||||
from keras import backend as K
|
||||
|
||||
def build_model(vectors, shape, settings):
|
||||
'''Compile the model.'''
|
||||
max_length, nr_hidden, nr_class = shape
|
||||
# Declare inputs.
|
||||
ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
|
||||
ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
|
||||
|
||||
# Construct operations, which we'll chain together.
|
||||
embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
|
||||
if settings['gru_encode']:
|
||||
encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
|
||||
attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
|
||||
align = _SoftAlignment(max_length, nr_hidden)
|
||||
compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
|
||||
entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])
|
||||
input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
|
||||
input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')
|
||||
|
||||
# embeddings (projected)
|
||||
embed = create_embedding(vectors, max_length, nr_hidden)
|
||||
|
||||
a = embed(input1)
|
||||
b = embed(input2)
|
||||
|
||||
# step 1: attend
|
||||
F = create_feedforward(nr_hidden)
|
||||
att_weights = layers.dot([F(a), F(b)], axes=-1)
|
||||
|
||||
G = create_feedforward(nr_hidden)
|
||||
|
||||
if settings['entail_dir'] == 'both':
|
||||
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
|
||||
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
|
||||
alpha = layers.dot([norm_weights_a, a], axes=1)
|
||||
beta = layers.dot([norm_weights_b, b], axes=1)
|
||||
|
||||
# Declare the model as a computational graph.
|
||||
sent1 = embed(ids1) # Shape: (i, n)
|
||||
sent2 = embed(ids2) # Shape: (j, n)
|
||||
# step 2: compare
|
||||
comp1 = layers.concatenate([a, beta])
|
||||
comp2 = layers.concatenate([b, alpha])
|
||||
v1 = layers.TimeDistributed(G)(comp1)
|
||||
v2 = layers.TimeDistributed(G)(comp2)
|
||||
|
||||
if settings['gru_encode']:
|
||||
sent1 = encode(sent1)
|
||||
sent2 = encode(sent2)
|
||||
# step 3: aggregate
|
||||
v1_sum = layers.Lambda(sum_word)(v1)
|
||||
v2_sum = layers.Lambda(sum_word)(v2)
|
||||
concat = layers.concatenate([v1_sum, v2_sum])
|
||||
|
||||
attention = attend(sent1, sent2) # Shape: (i, j)
|
||||
elif settings['entail_dir'] == 'left':
|
||||
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
|
||||
alpha = layers.dot([norm_weights_a, a], axes=1)
|
||||
comp2 = layers.concatenate([b, alpha])
|
||||
v2 = layers.TimeDistributed(G)(comp2)
|
||||
v2_sum = layers.Lambda(sum_word)(v2)
|
||||
concat = v2_sum
|
||||
|
||||
align1 = align(sent2, attention)
|
||||
align2 = align(sent1, attention, transpose=True)
|
||||
|
||||
feats1 = compare(sent1, align1)
|
||||
feats2 = compare(sent2, align2)
|
||||
|
||||
scores = entail(feats1, feats2)
|
||||
|
||||
# Now that we have the input/output, we can construct the Model object...
|
||||
model = Model(input=[ids1, ids2], output=[scores])
|
||||
|
||||
# ...Compile it...
|
||||
else:
|
||||
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
|
||||
beta = layers.dot([norm_weights_b, b], axes=1)
|
||||
comp1 = layers.concatenate([a, beta])
|
||||
v1 = layers.TimeDistributed(G)(comp1)
|
||||
v1_sum = layers.Lambda(sum_word)(v1)
|
||||
concat = v1_sum
|
||||
|
||||
H = create_feedforward(nr_hidden)
|
||||
out = H(concat)
|
||||
out = layers.Dense(nr_class, activation='softmax')(out)
|
||||
|
||||
model = Model([input1, input2], out)
|
||||
|
||||
model.compile(
|
||||
optimizer=Adam(lr=settings['lr']),
|
||||
optimizer=optimizers.Adam(lr=settings['lr']),
|
||||
loss='categorical_crossentropy',
|
||||
metrics=['accuracy'])
|
||||
# ...And return it for training.
|
||||
|
||||
return model
|
||||
|
||||
|
||||
class _StaticEmbedding(object):
|
||||
def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
|
||||
self.nr_out = nr_out
|
||||
self.max_length = max_length
|
||||
self.embed = Embedding(
|
||||
vectors.shape[0],
|
||||
vectors.shape[1],
|
||||
input_length=max_length,
|
||||
weights=[vectors],
|
||||
name='embed',
|
||||
trainable=False)
|
||||
self.tune = Embedding(
|
||||
nr_tune,
|
||||
nr_out,
|
||||
input_length=max_length,
|
||||
weights=None,
|
||||
name='tune',
|
||||
trainable=True,
|
||||
dropout=dropout)
|
||||
self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
|
||||
output_shape=(self.max_length,))
|
||||
def create_embedding(vectors, max_length, projected_dim):
|
||||
return models.Sequential([
|
||||
layers.Embedding(
|
||||
vectors.shape[0],
|
||||
vectors.shape[1],
|
||||
input_length=max_length,
|
||||
weights=[vectors],
|
||||
trainable=False),
|
||||
|
||||
layers.TimeDistributed(
|
||||
layers.Dense(projected_dim,
|
||||
activation=None,
|
||||
use_bias=False))
|
||||
])
|
||||
|
||||
self.project = TimeDistributed(
|
||||
Dense(
|
||||
nr_out,
|
||||
activation=None,
|
||||
bias=False,
|
||||
name='project'))
|
||||
|
||||
def __call__(self, sentence):
|
||||
def get_output_shape(shapes):
|
||||
print(shapes)
|
||||
return shapes[0]
|
||||
mod_sent = self.mod_ids(sentence)
|
||||
tuning = self.tune(mod_sent)
|
||||
#tuning = merge([tuning, mod_sent],
|
||||
# mode=lambda AB: AB[0] * (K.clip(K.cast(AB[1], 'float32'), 0, 1)),
|
||||
# output_shape=(self.max_length, self.nr_out))
|
||||
pretrained = self.project(self.embed(sentence))
|
||||
vectors = merge([pretrained, tuning], mode='sum')
|
||||
return vectors
|
||||
def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):
|
||||
return models.Sequential([
|
||||
layers.Dense(num_units, activation=activation),
|
||||
layers.Dropout(dropout_rate),
|
||||
layers.Dense(num_units, activation=activation),
|
||||
layers.Dropout(dropout_rate)
|
||||
])
|
||||
|
||||
|
||||
class _BiRNNEncoding(object):
|
||||
def __init__(self, max_length, nr_out, dropout=0.0):
|
||||
self.model = Sequential()
|
||||
self.model.add(Bidirectional(LSTM(nr_out, return_sequences=True,
|
||||
dropout_W=dropout, dropout_U=dropout),
|
||||
input_shape=(max_length, nr_out)))
|
||||
self.model.add(TimeDistributed(Dense(nr_out, activation='relu', init='he_normal')))
|
||||
self.model.add(TimeDistributed(Dropout(0.2)))
|
||||
def normalizer(axis):
|
||||
def _normalize(att_weights):
|
||||
exp_weights = K.exp(att_weights)
|
||||
sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)
|
||||
return exp_weights/sum_weights
|
||||
return _normalize
|
||||
|
||||
def __call__(self, sentence):
|
||||
return self.model(sentence)
|
||||
|
||||
|
||||
class _Attention(object):
|
||||
def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
|
||||
self.max_length = max_length
|
||||
self.model = Sequential()
|
||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
|
||||
self.model.add(
|
||||
Dense(nr_hidden, name='attend1',
|
||||
init='he_normal', W_regularizer=l2(L2),
|
||||
input_shape=(nr_hidden,), activation='relu'))
|
||||
self.model.add(Dropout(dropout))
|
||||
self.model.add(Dense(nr_hidden, name='attend2',
|
||||
init='he_normal', W_regularizer=l2(L2), activation='relu'))
|
||||
self.model = TimeDistributed(self.model)
|
||||
|
||||
def __call__(self, sent1, sent2):
|
||||
def _outer(AB):
|
||||
att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
|
||||
return K.permute_dimensions(att_ji,(0, 2, 1))
|
||||
return merge(
|
||||
[self.model(sent1), self.model(sent2)],
|
||||
mode=_outer,
|
||||
output_shape=(self.max_length, self.max_length))
|
||||
|
||||
|
||||
class _SoftAlignment(object):
|
||||
def __init__(self, max_length, nr_hidden):
|
||||
self.max_length = max_length
|
||||
self.nr_hidden = nr_hidden
|
||||
|
||||
def __call__(self, sentence, attention, transpose=False):
|
||||
def _normalize_attention(attmat):
|
||||
att = attmat[0]
|
||||
mat = attmat[1]
|
||||
if transpose:
|
||||
att = K.permute_dimensions(att,(0, 2, 1))
|
||||
# 3d softmax
|
||||
e = K.exp(att - K.max(att, axis=-1, keepdims=True))
|
||||
s = K.sum(e, axis=-1, keepdims=True)
|
||||
sm_att = e / s
|
||||
return K.batch_dot(sm_att, mat)
|
||||
return merge([attention, sentence], mode=_normalize_attention,
|
||||
output_shape=(self.max_length, self.nr_hidden)) # Shape: (i, n)
|
||||
|
||||
|
||||
class _Comparison(object):
|
||||
def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
|
||||
self.words = words
|
||||
self.model = Sequential()
|
||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
|
||||
self.model.add(Dense(nr_hidden, name='compare1',
|
||||
init='he_normal', W_regularizer=l2(L2)))
|
||||
self.model.add(Activation('relu'))
|
||||
self.model.add(Dropout(dropout))
|
||||
self.model.add(Dense(nr_hidden, name='compare2',
|
||||
W_regularizer=l2(L2), init='he_normal'))
|
||||
self.model.add(Activation('relu'))
|
||||
self.model = TimeDistributed(self.model)
|
||||
|
||||
def __call__(self, sent, align, **kwargs):
|
||||
result = self.model(merge([sent, align], mode='concat')) # Shape: (i, n)
|
||||
avged = GlobalAveragePooling1D()(result, mask=self.words)
|
||||
maxed = GlobalMaxPooling1D()(result, mask=self.words)
|
||||
merged = merge([avged, maxed])
|
||||
result = BatchNormalization()(merged)
|
||||
return result
|
||||
|
||||
|
||||
class _Entailment(object):
|
||||
def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
|
||||
self.model = Sequential()
|
||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
|
||||
self.model.add(Dense(nr_hidden, name='entail1',
|
||||
init='he_normal', W_regularizer=l2(L2)))
|
||||
self.model.add(Activation('relu'))
|
||||
self.model.add(Dropout(dropout))
|
||||
self.model.add(Dense(nr_hidden, name='entail2',
|
||||
init='he_normal', W_regularizer=l2(L2)))
|
||||
self.model.add(Activation('relu'))
|
||||
self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
|
||||
W_regularizer=l2(L2), init='zero'))
|
||||
|
||||
def __call__(self, feats1, feats2):
|
||||
features = merge([feats1, feats2], mode='concat')
|
||||
return self.model(features)
|
||||
|
||||
|
||||
class _GlobalSumPooling1D(Layer):
|
||||
'''Global sum pooling operation for temporal data.
|
||||
|
||||
# Input shape
|
||||
3D tensor with shape: `(samples, steps, features)`.
|
||||
|
||||
# Output shape
|
||||
2D tensor with shape: `(samples, features)`.
|
||||
'''
|
||||
def __init__(self, **kwargs):
|
||||
super(_GlobalSumPooling1D, self).__init__(**kwargs)
|
||||
self.input_spec = [InputSpec(ndim=3)]
|
||||
|
||||
def get_output_shape_for(self, input_shape):
|
||||
return (input_shape[0], input_shape[2])
|
||||
|
||||
def call(self, x, mask=None):
|
||||
if mask is not None:
|
||||
return K.sum(x * K.clip(mask, 0, 1), axis=1)
|
||||
else:
|
||||
return K.sum(x, axis=1)
|
||||
def sum_word(x):
|
||||
return K.sum(x, axis=1)
|
||||
|
||||
|
||||
def test_build_model():
|
||||
vectors = numpy.ndarray((100, 8), dtype='float32')
|
||||
vectors = np.ndarray((100, 8), dtype='float32')
|
||||
shape = (10, 16, 3)
|
||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
|
||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True, 'entail_dir':'both'}
|
||||
model = build_model(vectors, shape, settings)
|
||||
|
||||
|
||||
def test_fit_model():
|
||||
|
||||
def _generate_X(nr_example, length, nr_vector):
|
||||
X1 = numpy.ndarray((nr_example, length), dtype='int32')
|
||||
X1 = np.ndarray((nr_example, length), dtype='int32')
|
||||
X1 *= X1 < nr_vector
|
||||
X1 *= 0 <= X1
|
||||
X2 = numpy.ndarray((nr_example, length), dtype='int32')
|
||||
X2 = np.ndarray((nr_example, length), dtype='int32')
|
||||
X2 *= X2 < nr_vector
|
||||
X2 *= 0 <= X2
|
||||
return [X1, X2]
|
||||
|
||||
def _generate_Y(nr_example, nr_class):
|
||||
ys = numpy.zeros((nr_example, nr_class), dtype='int32')
|
||||
ys = np.zeros((nr_example, nr_class), dtype='int32')
|
||||
for i in range(nr_example):
|
||||
ys[i, i % nr_class] = 1
|
||||
return ys
|
||||
|
||||
vectors = numpy.ndarray((100, 8), dtype='float32')
|
||||
vectors = np.ndarray((100, 8), dtype='float32')
|
||||
shape = (10, 16, 3)
|
||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
|
||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True, 'entail_dir':'both'}
|
||||
model = build_model(vectors, shape, settings)
|
||||
|
||||
train_X = _generate_X(20, shape[0], vectors.shape[0])
|
||||
|
@ -261,8 +139,7 @@ def test_fit_model():
|
|||
dev_X = _generate_X(15, shape[0], vectors.shape[0])
|
||||
dev_Y = _generate_Y(15, shape[2])
|
||||
|
||||
model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
|
||||
batch_size=4)
|
||||
model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4)
|
||||
|
||||
|
||||
__all__ = [build_model]
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
import numpy as np
|
||||
from keras.models import model_from_json
|
||||
import numpy
|
||||
import numpy.random
|
||||
import json
|
||||
from spacy.tokens.span import Span
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
|
@ -11,16 +8,23 @@ except ImportError:
|
|||
|
||||
|
||||
class KerasSimilarityShim(object):
|
||||
entailment_types = ["entailment", "contradiction", "neutral"]
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, nlp, get_features=None, max_length=100):
|
||||
def load(cls, path, nlp, max_length=100, get_features=None):
|
||||
|
||||
if get_features is None:
|
||||
get_features = get_word_ids
|
||||
|
||||
with (path / 'config.json').open() as file_:
|
||||
model = model_from_json(file_.read())
|
||||
with (path / 'model').open('rb') as file_:
|
||||
weights = pickle.load(file_)
|
||||
|
||||
embeddings = get_embeddings(nlp.vocab)
|
||||
model.set_weights([embeddings] + weights)
|
||||
weights.insert(1, embeddings)
|
||||
model.set_weights(weights)
|
||||
|
||||
return cls(model, get_features=get_features, max_length=max_length)
|
||||
|
||||
def __init__(self, model, get_features=None, max_length=100):
|
||||
|
@ -32,58 +36,42 @@ class KerasSimilarityShim(object):
|
|||
doc.user_hooks['similarity'] = self.predict
|
||||
doc.user_span_hooks['similarity'] = self.predict
|
||||
|
||||
return doc
|
||||
|
||||
def predict(self, doc1, doc2):
|
||||
x1 = self.get_features([doc1], max_length=self.max_length, tree_truncate=True)
|
||||
x2 = self.get_features([doc2], max_length=self.max_length, tree_truncate=True)
|
||||
x1 = self.get_features([doc1], max_length=self.max_length)
|
||||
x2 = self.get_features([doc2], max_length=self.max_length)
|
||||
scores = self.model.predict([x1, x2])
|
||||
return scores[0]
|
||||
|
||||
return self.entailment_types[scores.argmax()], scores.max()
|
||||
|
||||
|
||||
def get_embeddings(vocab, nr_unk=100):
|
||||
nr_vector = max(lex.rank for lex in vocab) + 1
|
||||
vectors = numpy.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')
|
||||
# the extra +1 is for a zero vector representing sentence-final padding
|
||||
num_vectors = max(lex.rank for lex in vocab) + 2
|
||||
|
||||
# create random vectors for OOV tokens
|
||||
oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
|
||||
oov = oov / oov.sum(axis=1, keepdims=True)
|
||||
|
||||
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32')
|
||||
vectors[1:(nr_unk + 1), ] = oov
|
||||
for lex in vocab:
|
||||
if lex.has_vector:
|
||||
vectors[lex.rank+1] = lex.vector / lex.vector_norm
|
||||
if lex.has_vector and lex.vector_norm > 0:
|
||||
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
|
||||
|
||||
return vectors
|
||||
|
||||
|
||||
def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
|
||||
Xs = numpy.zeros((len(docs), max_length), dtype='int32')
|
||||
def get_word_ids(docs, max_length=100, nr_unk=100):
|
||||
Xs = np.zeros((len(docs), max_length), dtype='int32')
|
||||
|
||||
for i, doc in enumerate(docs):
|
||||
if tree_truncate:
|
||||
if isinstance(doc, Span):
|
||||
queue = [doc.root]
|
||||
else:
|
||||
queue = [sent.root for sent in doc.sents]
|
||||
else:
|
||||
queue = list(doc)
|
||||
words = []
|
||||
while len(words) <= max_length and queue:
|
||||
word = queue.pop(0)
|
||||
if rnn_encode or (not word.is_punct and not word.is_space):
|
||||
words.append(word)
|
||||
if tree_truncate:
|
||||
queue.extend(list(word.lefts))
|
||||
queue.extend(list(word.rights))
|
||||
words.sort()
|
||||
for j, token in enumerate(words):
|
||||
if token.has_vector:
|
||||
Xs[i, j] = token.rank+1
|
||||
else:
|
||||
Xs[i, j] = (token.shape % (nr_unk-1))+2
|
||||
j += 1
|
||||
if j >= max_length:
|
||||
for j, token in enumerate(doc):
|
||||
if j == max_length:
|
||||
break
|
||||
else:
|
||||
Xs[i, len(words)] = 1
|
||||
if token.has_vector:
|
||||
Xs[i, j] = token.rank + nr_unk + 1
|
||||
else:
|
||||
Xs[i, j] = token.rank % nr_unk + 1
|
||||
return Xs
|
||||
|
||||
|
||||
def create_similarity_pipeline(nlp, max_length=100):
|
||||
return [
|
||||
nlp.tagger,
|
||||
nlp.entity,
|
||||
nlp.parser,
|
||||
KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length)
|
||||
]
|
||||
|
|
955
examples/notebooks/Decompositional Attention.ipynb
Normal file
955
examples/notebooks/Decompositional Attention.ipynb
Normal file
|
@ -0,0 +1,955 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Natural language inference using spaCy and Keras"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook details an implementation of the natural language inference model presented in [(Parikh et al, 2016)](https://arxiv.org/abs/1606.01933). The model is notable for the small number of paramaters *and hyperparameters* it specifices, while still yielding good performance."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Constructing the dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import spacy\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We only need the GloVe vectors from spaCy, not a full NLP pipeline."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nlp = spacy.load('en_vectors_web_lg')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Function to load the SNLI dataset. The categories are converted to one-shot representation. The function comes from an example in spaCy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/jds/tensorflow-gpu/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
|
||||
" from ._conv import register_converters as _register_converters\n",
|
||||
"Using TensorFlow backend.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import ujson as json\n",
|
||||
"from keras.utils import to_categorical\n",
|
||||
"\n",
|
||||
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
|
||||
"def read_snli(path):\n",
|
||||
" texts1 = []\n",
|
||||
" texts2 = []\n",
|
||||
" labels = []\n",
|
||||
" with open(path, 'r') as file_:\n",
|
||||
" for line in file_:\n",
|
||||
" eg = json.loads(line)\n",
|
||||
" label = eg['gold_label']\n",
|
||||
" if label == '-': # per Parikh, ignore - SNLI entries\n",
|
||||
" continue\n",
|
||||
" texts1.append(eg['sentence1'])\n",
|
||||
" texts2.append(eg['sentence2'])\n",
|
||||
" labels.append(LABELS[label])\n",
|
||||
" return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Because Keras can do the train/test split for us, we'll load *all* SNLI triples from one file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"texts,hypotheses,labels = read_snli('snli/snli_1.0_train.jsonl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):\n",
|
||||
" sents = texts + hypotheses\n",
|
||||
" \n",
|
||||
" # the extra +1 is for a zero vector represting NULL for padding\n",
|
||||
" num_vectors = max(lex.rank for lex in nlp.vocab) + 2 \n",
|
||||
" \n",
|
||||
" # create random vectors for OOV tokens\n",
|
||||
" oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))\n",
|
||||
" oov = oov / oov.sum(axis=1, keepdims=True)\n",
|
||||
" \n",
|
||||
" vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')\n",
|
||||
" vectors[num_vectors:, ] = oov\n",
|
||||
" for lex in nlp.vocab:\n",
|
||||
" if lex.has_vector and lex.vector_norm > 0:\n",
|
||||
" vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector\n",
|
||||
" \n",
|
||||
" sents_as_ids = []\n",
|
||||
" for sent in sents:\n",
|
||||
" doc = nlp(sent)\n",
|
||||
" word_ids = []\n",
|
||||
" \n",
|
||||
" for i, token in enumerate(doc):\n",
|
||||
" # skip odd spaces from tokenizer\n",
|
||||
" if token.has_vector and token.vector_norm == 0:\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" if i > max_length:\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" if token.has_vector:\n",
|
||||
" word_ids.append(token.rank + 1)\n",
|
||||
" else:\n",
|
||||
" # if we don't have a vector, pick an OOV entry\n",
|
||||
" word_ids.append(token.rank % num_oov + num_vectors) \n",
|
||||
" \n",
|
||||
" # there must be a simpler way of generating padded arrays from lists...\n",
|
||||
" word_id_vec = np.zeros((max_length), dtype='int')\n",
|
||||
" clipped_len = min(max_length, len(word_ids))\n",
|
||||
" word_id_vec[:clipped_len] = word_ids[:clipped_len]\n",
|
||||
" sents_as_ids.append(word_id_vec)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" return vectors, np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sem_vectors, text_vectors, hypothesis_vectors = create_dataset(nlp, texts, hypotheses, 100, 50, True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"texts_test,hypotheses_test,labels_test = read_snli('snli/snli_1.0_test.jsonl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"_, text_vectors_test, hypothesis_vectors_test = create_dataset(nlp, texts_test, hypotheses_test, 100, 50, True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We use spaCy to tokenize the sentences and return, when available, a semantic vector for each token. \n",
|
||||
"\n",
|
||||
"OOV terms (tokens for which no semantic vector is available) are assigned to one of a set of randomly-generated OOV vectors, per (Parikh et al, 2016).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that we will clip sentences to 50 words maximum."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from keras import layers, Model, models\n",
|
||||
"from keras import backend as K"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Building the model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The embedding layer copies the 300-dimensional GloVe vectors into GPU memory. Per (Parikh et al, 2016), the vectors, which are not adapted during training, are projected down to lower-dimensional vectors using a trained projection matrix."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_embedding(vectors, max_length, projected_dim):\n",
|
||||
" return models.Sequential([\n",
|
||||
" layers.Embedding(\n",
|
||||
" vectors.shape[0],\n",
|
||||
" vectors.shape[1],\n",
|
||||
" input_length=max_length,\n",
|
||||
" weights=[vectors],\n",
|
||||
" trainable=False),\n",
|
||||
" \n",
|
||||
" layers.TimeDistributed(\n",
|
||||
" layers.Dense(projected_dim,\n",
|
||||
" activation=None,\n",
|
||||
" use_bias=False))\n",
|
||||
" ])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The Parikh model makes use of three feedforward blocks that construct nonlinear combinations of their input. Each block contains two ReLU layers and two dropout layers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):\n",
|
||||
" return models.Sequential([\n",
|
||||
" layers.Dense(num_units, activation=activation),\n",
|
||||
" layers.Dropout(dropout_rate),\n",
|
||||
" layers.Dense(num_units, activation=activation),\n",
|
||||
" layers.Dropout(dropout_rate)\n",
|
||||
" ])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The basic idea of the (Parikh et al, 2016) model is to:\n",
|
||||
"\n",
|
||||
"1. *Align*: Construct an alignment of subphrases in the text and hypothesis using an attention-like mechanism, called \"decompositional\" because the layer is applied to each of the two sentences individually rather than to their product. The dot product of the nonlinear transformations of the inputs is then normalized vertically and horizontally to yield a pair of \"soft\" alignment structures, from text->hypothesis and hypothesis->text. Concretely, for each word in one sentence, a multinomial distribution is computed over the words of the other sentence, by learning a multinomial logistic with softmax target.\n",
|
||||
"2. *Compare*: Each word is now compared to its aligned phrase using a function modeled as a two-layer feedforward ReLU network. The output is a high-dimensional representation of the strength of association between word and aligned phrase.\n",
|
||||
"3. *Aggregate*: The comparison vectors are summed, separately, for the text and the hypothesis. The result is two vectors: one that describes the degree of association of the text to the hypothesis, and the second, of the hypothesis to the text.\n",
|
||||
"4. Finally, these two vectors are processed by a dense layer followed by a softmax classifier, as usual.\n",
|
||||
"\n",
|
||||
"Note that because in entailment the truth conditions of the consequent must be a subset of those of the antecedent, it is not obvious that we need both vectors in step (3). Entailment is not symmetric. It may be enough to just use the hypothesis->text vector. We will explore this possibility later."
|
||||
]
|
||||
},
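As a compact summary of steps 1–3 in the paper's notation, where $\bar{a}_i$ and $\bar{b}_j$ are the projected word vectors of the text and hypothesis, $F$, $G$ and $H$ are the feedforward blocks described above, and $[\cdot;\cdot]$ denotes concatenation:

$$
e_{ij} = F(\bar{a}_i)^\top F(\bar{b}_j), \qquad
\beta_i = \sum_j \frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}\,\bar{b}_j, \qquad
\alpha_j = \sum_i \frac{\exp(e_{ij})}{\sum_k \exp(e_{kj})}\,\bar{a}_i
$$

$$
\mathbf{v}_1 = \sum_i G([\bar{a}_i;\,\beta_i]), \qquad
\mathbf{v}_2 = \sum_j G([\bar{b}_j;\,\alpha_j]), \qquad
\hat{y} = \operatorname{softmax}\big(W\,H([\mathbf{v}_1;\,\mathbf{v}_2])\big)
$$

Here $W$ is the final dense layer producing the three class scores.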
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We need a couple of little functions for Lambda layers to normalize and aggregate weights:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def normalizer(axis):\n",
|
||||
" def _normalize(att_weights):\n",
|
||||
" exp_weights = K.exp(att_weights)\n",
|
||||
" sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)\n",
|
||||
" return exp_weights/sum_weights\n",
|
||||
" return _normalize\n",
|
||||
"\n",
|
||||
"def sum_word(x):\n",
|
||||
" return K.sum(x, axis=1)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_model(vectors, max_length, num_hidden, num_classes, projected_dim, entail_dir='both'):\n",
|
||||
" input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')\n",
|
||||
" input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')\n",
|
||||
" \n",
|
||||
" # embeddings (projected)\n",
|
||||
" embed = create_embedding(vectors, max_length, projected_dim)\n",
|
||||
" \n",
|
||||
" a = embed(input1)\n",
|
||||
" b = embed(input2)\n",
|
||||
" \n",
|
||||
" # step 1: attend\n",
|
||||
" F = create_feedforward(num_hidden)\n",
|
||||
" att_weights = layers.dot([F(a), F(b)], axes=-1)\n",
|
||||
" \n",
|
||||
" G = create_feedforward(num_hidden)\n",
|
||||
" \n",
|
||||
" if entail_dir == 'both':\n",
|
||||
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
||||
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
||||
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
||||
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
||||
"\n",
|
||||
" # step 2: compare\n",
|
||||
" comp1 = layers.concatenate([a, beta])\n",
|
||||
" comp2 = layers.concatenate([b, alpha])\n",
|
||||
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
||||
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
||||
"\n",
|
||||
" # step 3: aggregate\n",
|
||||
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
||||
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
||||
" concat = layers.concatenate([v1_sum, v2_sum])\n",
|
||||
" elif entail_dir == 'left':\n",
|
||||
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
||||
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
||||
" comp2 = layers.concatenate([b, alpha])\n",
|
||||
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
||||
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
||||
" concat = v2_sum\n",
|
||||
" else:\n",
|
||||
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
||||
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
||||
" comp1 = layers.concatenate([a, beta])\n",
|
||||
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
||||
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
||||
" concat = v1_sum\n",
|
||||
" \n",
|
||||
" H = create_feedforward(num_hidden)\n",
|
||||
" out = H(concat)\n",
|
||||
" out = layers.Dense(num_classes, activation='softmax')(out)\n",
|
||||
" \n",
|
||||
" model = Model([input1, input2], out)\n",
|
||||
" \n",
|
||||
" model.compile(optimizer='adam',\n",
|
||||
" loss='categorical_crossentropy',\n",
|
||||
" metrics=['accuracy'])\n",
|
||||
" return model\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"Layer (type) Output Shape Param # Connected to \n",
|
||||
"==================================================================================================\n",
|
||||
"words1 (InputLayer) (None, 50) 0 \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"words2 (InputLayer) (None, 50) 0 \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_1 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||
" words2[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_2 (Sequential) (None, 50, 200) 80400 sequential_1[1][0] \n",
|
||||
" sequential_1[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_1 (Dot) (None, 50, 50) 0 sequential_2[1][0] \n",
|
||||
" sequential_2[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_2 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_1 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_3 (Dot) (None, 50, 200) 0 lambda_2[0][0] \n",
|
||||
" sequential_1[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_2 (Dot) (None, 50, 200) 0 lambda_1[0][0] \n",
|
||||
" sequential_1[1][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"concatenate_1 (Concatenate) (None, 50, 400) 0 sequential_1[1][0] \n",
|
||||
" dot_3[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"concatenate_2 (Concatenate) (None, 50, 400) 0 sequential_1[2][0] \n",
|
||||
" dot_2[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"time_distributed_2 (TimeDistrib (None, 50, 200) 120400 concatenate_1[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"time_distributed_3 (TimeDistrib (None, 50, 200) 120400 concatenate_2[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_3 (Lambda) (None, 200) 0 time_distributed_2[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_4 (Lambda) (None, 200) 0 time_distributed_3[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"concatenate_3 (Concatenate) (None, 400) 0 lambda_3[0][0] \n",
|
||||
" lambda_4[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_4 (Sequential) (None, 200) 120400 concatenate_3[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dense_8 (Dense) (None, 3) 603 sequential_4[1][0] \n",
|
||||
"==================================================================================================\n",
|
||||
"Total params: 321,703,403\n",
|
||||
"Trainable params: 381,803\n",
|
||||
"Non-trainable params: 321,321,600\n",
|
||||
"__________________________________________________________________________________________________\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"K.clear_session()\n",
|
||||
"m = build_model(sem_vectors, 50, 200, 3, 200)\n",
|
||||
"m.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The number of trainable parameters, ~381k, is the number given by Parikh et al, so we're on the right track."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training the model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Parikh et al use tiny batches of 4, training for 50MM batches, which amounts to around 500 epochs. Here we'll use large batches to better use the GPU, and train for fewer epochs -- for purposes of this experiment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train on 549367 samples, validate on 9824 samples\n",
|
||||
"Epoch 1/50\n",
|
||||
"549367/549367 [==============================] - 34s 62us/step - loss: 0.7599 - acc: 0.6617 - val_loss: 0.5396 - val_acc: 0.7861\n",
|
||||
"Epoch 2/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5611 - acc: 0.7763 - val_loss: 0.4892 - val_acc: 0.8085\n",
|
||||
"Epoch 3/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5212 - acc: 0.7948 - val_loss: 0.4574 - val_acc: 0.8261\n",
|
||||
"Epoch 4/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4986 - acc: 0.8045 - val_loss: 0.4410 - val_acc: 0.8274\n",
|
||||
"Epoch 5/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4819 - acc: 0.8114 - val_loss: 0.4224 - val_acc: 0.8383\n",
|
||||
"Epoch 6/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4714 - acc: 0.8166 - val_loss: 0.4200 - val_acc: 0.8379\n",
|
||||
"Epoch 7/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4633 - acc: 0.8203 - val_loss: 0.4098 - val_acc: 0.8457\n",
|
||||
"Epoch 8/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4558 - acc: 0.8232 - val_loss: 0.4114 - val_acc: 0.8415\n",
|
||||
"Epoch 9/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4508 - acc: 0.8250 - val_loss: 0.4062 - val_acc: 0.8477\n",
|
||||
"Epoch 10/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4433 - acc: 0.8286 - val_loss: 0.3982 - val_acc: 0.8486\n",
|
||||
"Epoch 11/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4388 - acc: 0.8307 - val_loss: 0.3953 - val_acc: 0.8497\n",
|
||||
"Epoch 12/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4351 - acc: 0.8321 - val_loss: 0.3973 - val_acc: 0.8522\n",
|
||||
"Epoch 13/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4309 - acc: 0.8342 - val_loss: 0.3939 - val_acc: 0.8539\n",
|
||||
"Epoch 14/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4269 - acc: 0.8355 - val_loss: 0.3932 - val_acc: 0.8517\n",
|
||||
"Epoch 15/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4247 - acc: 0.8369 - val_loss: 0.3938 - val_acc: 0.8515\n",
|
||||
"Epoch 16/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4208 - acc: 0.8379 - val_loss: 0.3936 - val_acc: 0.8504\n",
|
||||
"Epoch 17/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4194 - acc: 0.8390 - val_loss: 0.3885 - val_acc: 0.8560\n",
|
||||
"Epoch 18/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4162 - acc: 0.8402 - val_loss: 0.3874 - val_acc: 0.8561\n",
|
||||
"Epoch 19/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4140 - acc: 0.8409 - val_loss: 0.3889 - val_acc: 0.8545\n",
|
||||
"Epoch 20/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4114 - acc: 0.8426 - val_loss: 0.3864 - val_acc: 0.8583\n",
|
||||
"Epoch 21/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4092 - acc: 0.8430 - val_loss: 0.3870 - val_acc: 0.8561\n",
|
||||
"Epoch 22/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4062 - acc: 0.8442 - val_loss: 0.3852 - val_acc: 0.8577\n",
|
||||
"Epoch 23/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4050 - acc: 0.8450 - val_loss: 0.3850 - val_acc: 0.8578\n",
|
||||
"Epoch 24/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4035 - acc: 0.8455 - val_loss: 0.3825 - val_acc: 0.8555\n",
|
||||
"Epoch 25/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4018 - acc: 0.8460 - val_loss: 0.3837 - val_acc: 0.8573\n",
|
||||
"Epoch 26/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3989 - acc: 0.8476 - val_loss: 0.3843 - val_acc: 0.8599\n",
|
||||
"Epoch 27/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3979 - acc: 0.8481 - val_loss: 0.3841 - val_acc: 0.8589\n",
|
||||
"Epoch 28/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3967 - acc: 0.8484 - val_loss: 0.3811 - val_acc: 0.8575\n",
|
||||
"Epoch 29/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3956 - acc: 0.8492 - val_loss: 0.3829 - val_acc: 0.8589\n",
|
||||
"Epoch 30/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3938 - acc: 0.8499 - val_loss: 0.3859 - val_acc: 0.8562\n",
|
||||
"Epoch 31/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3925 - acc: 0.8500 - val_loss: 0.3798 - val_acc: 0.8587\n",
|
||||
"Epoch 32/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3906 - acc: 0.8509 - val_loss: 0.3834 - val_acc: 0.8569\n",
|
||||
"Epoch 33/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3893 - acc: 0.8511 - val_loss: 0.3806 - val_acc: 0.8588\n",
|
||||
"Epoch 34/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3885 - acc: 0.8515 - val_loss: 0.3828 - val_acc: 0.8603\n",
|
||||
"Epoch 35/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3879 - acc: 0.8520 - val_loss: 0.3800 - val_acc: 0.8594\n",
|
||||
"Epoch 36/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3860 - acc: 0.8530 - val_loss: 0.3796 - val_acc: 0.8577\n",
|
||||
"Epoch 37/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3856 - acc: 0.8532 - val_loss: 0.3857 - val_acc: 0.8591\n",
|
||||
"Epoch 38/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3838 - acc: 0.8535 - val_loss: 0.3835 - val_acc: 0.8603\n",
|
||||
"Epoch 39/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3830 - acc: 0.8543 - val_loss: 0.3830 - val_acc: 0.8599\n",
|
||||
"Epoch 40/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3818 - acc: 0.8548 - val_loss: 0.3832 - val_acc: 0.8559\n",
|
||||
"Epoch 41/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3806 - acc: 0.8551 - val_loss: 0.3845 - val_acc: 0.8553\n",
|
||||
"Epoch 42/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3803 - acc: 0.8550 - val_loss: 0.3789 - val_acc: 0.8617\n",
|
||||
"Epoch 43/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3791 - acc: 0.8556 - val_loss: 0.3835 - val_acc: 0.8580\n",
|
||||
"Epoch 44/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3778 - acc: 0.8565 - val_loss: 0.3799 - val_acc: 0.8580\n",
|
||||
"Epoch 45/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3766 - acc: 0.8571 - val_loss: 0.3790 - val_acc: 0.8625\n",
|
||||
"Epoch 46/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3770 - acc: 0.8569 - val_loss: 0.3820 - val_acc: 0.8590\n",
|
||||
"Epoch 47/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3761 - acc: 0.8573 - val_loss: 0.3831 - val_acc: 0.8581\n",
|
||||
"Epoch 48/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3739 - acc: 0.8579 - val_loss: 0.3828 - val_acc: 0.8599\n",
|
||||
"Epoch 49/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3738 - acc: 0.8577 - val_loss: 0.3785 - val_acc: 0.8590\n",
|
||||
"Epoch 50/50\n",
|
||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3726 - acc: 0.8580 - val_loss: 0.3820 - val_acc: 0.8585\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<keras.callbacks.History at 0x7f5c9f49c438>"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"m.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The result is broadly in the region reported by Parikh et al: ~86 vs 86.3%. The small difference might be accounted by differences in `max_length` (here set at 50), in the training regime, and that here we use Keras' built-in validation splitting rather than the SNLI test set."
|
||||
]
|
||||
},
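For a more direct comparison with the published 86.3%, the held-out arrays can also be evaluated explicitly. This is only a sketch, not part of the committed notebook; it assumes `m` was compiled with an accuracy metric (as the `acc` columns in the logs suggest) and reuses the `*_test` arrays passed to `validation_data` above:

    # Hypothetical check, reusing the arrays from the fit call above.
    loss, acc = m.evaluate([text_vectors_test, hypothesis_vectors_test], labels_test,
                           batch_size=1024, verbose=0)
    print('held-out accuracy: %.4f' % acc)   # expected to land around 0.86, per the logs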
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Experiment: the asymmetric model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"It was suggested earlier that, based on the semantics of entailment, the vector representing the strength of association between the hypothesis to the text is all that is needed for classifying the entailment.\n",
|
||||
"\n",
|
||||
"The following model removes consideration of the complementary vector (text to hypothesis) from the computation. This will decrease the paramater count slightly, because the final dense layers will be smaller, and speed up the forward pass when predicting, because fewer calculations will be needed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"Layer (type) Output Shape Param # Connected to \n",
|
||||
"==================================================================================================\n",
|
||||
"words2 (InputLayer) (None, 50) 0 \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"words1 (InputLayer) (None, 50) 0 \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_5 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||
" words2[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_6 (Sequential) (None, 50, 200) 80400 sequential_5[1][0] \n",
|
||||
" sequential_5[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_4 (Dot) (None, 50, 50) 0 sequential_6[1][0] \n",
|
||||
" sequential_6[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_5 (Lambda) (None, 50, 50) 0 dot_4[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_5 (Dot) (None, 50, 200) 0 lambda_5[0][0] \n",
|
||||
" sequential_5[1][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"concatenate_4 (Concatenate) (None, 50, 400) 0 sequential_5[2][0] \n",
|
||||
" dot_5[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"time_distributed_5 (TimeDistrib (None, 50, 200) 120400 concatenate_4[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_6 (Lambda) (None, 200) 0 time_distributed_5[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_8 (Sequential) (None, 200) 80400 lambda_6[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dense_16 (Dense) (None, 3) 603 sequential_8[1][0] \n",
|
||||
"==================================================================================================\n",
|
||||
"Total params: 321,663,403\n",
|
||||
"Trainable params: 341,803\n",
|
||||
"Non-trainable params: 321,321,600\n",
|
||||
"__________________________________________________________________________________________________\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"m1 = build_model(sem_vectors, 50, 200, 3, 200, 'left')\n",
|
||||
"m1.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The parameter count has indeed decreased by 40,000, corresponding to the 200x200 smaller H function."
|
||||
]
|
||||
},
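As a sanity check on that figure, here is a hypothetical back-of-the-envelope calculation (not part of the notebook). It assumes Dense layers with bias terms, a hidden width of 200, and that the symmetric model's H function sees the 400-dimensional concatenation of both comparison vectors while the asymmetric one sees only 200 dimensions:

    sym_first_layer = 400 * 200 + 200    # 80,200 parameters in the symmetric model's first H layer
    asym_first_layer = 200 * 200 + 200   # 40,200 parameters when only one direction is kept
    print(sym_first_layer - asym_first_layer)   # 40,000, i.e. the 200 x 200 block of weights saved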
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train on 549367 samples, validate on 9824 samples\n",
|
||||
"Epoch 1/50\n",
|
||||
"549367/549367 [==============================] - 25s 46us/step - loss: 0.7331 - acc: 0.6770 - val_loss: 0.5257 - val_acc: 0.7936\n",
|
||||
"Epoch 2/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5518 - acc: 0.7799 - val_loss: 0.4717 - val_acc: 0.8159\n",
|
||||
"Epoch 3/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5147 - acc: 0.7967 - val_loss: 0.4449 - val_acc: 0.8278\n",
|
||||
"Epoch 4/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4948 - acc: 0.8060 - val_loss: 0.4326 - val_acc: 0.8344\n",
|
||||
"Epoch 5/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4814 - acc: 0.8122 - val_loss: 0.4247 - val_acc: 0.8359\n",
|
||||
"Epoch 6/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4712 - acc: 0.8162 - val_loss: 0.4143 - val_acc: 0.8430\n",
|
||||
"Epoch 7/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4635 - acc: 0.8205 - val_loss: 0.4172 - val_acc: 0.8401\n",
|
||||
"Epoch 8/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4570 - acc: 0.8223 - val_loss: 0.4106 - val_acc: 0.8422\n",
|
||||
"Epoch 9/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4505 - acc: 0.8259 - val_loss: 0.4043 - val_acc: 0.8451\n",
|
||||
"Epoch 10/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4459 - acc: 0.8280 - val_loss: 0.4050 - val_acc: 0.8467\n",
|
||||
"Epoch 11/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4405 - acc: 0.8300 - val_loss: 0.3975 - val_acc: 0.8481\n",
|
||||
"Epoch 12/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4360 - acc: 0.8324 - val_loss: 0.4026 - val_acc: 0.8496\n",
|
||||
"Epoch 13/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4327 - acc: 0.8334 - val_loss: 0.4024 - val_acc: 0.8471\n",
|
||||
"Epoch 14/50\n",
|
||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4293 - acc: 0.8350 - val_loss: 0.3955 - val_acc: 0.8496\n",
|
||||
"Epoch 15/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4263 - acc: 0.8369 - val_loss: 0.3980 - val_acc: 0.8490\n",
|
||||
"Epoch 16/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4236 - acc: 0.8377 - val_loss: 0.3958 - val_acc: 0.8496\n",
|
||||
"Epoch 17/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4213 - acc: 0.8384 - val_loss: 0.3954 - val_acc: 0.8496\n",
|
||||
"Epoch 18/50\n",
|
||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4187 - acc: 0.8394 - val_loss: 0.3929 - val_acc: 0.8514\n",
|
||||
"Epoch 19/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4157 - acc: 0.8409 - val_loss: 0.3939 - val_acc: 0.8507\n",
|
||||
"Epoch 20/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4135 - acc: 0.8417 - val_loss: 0.3953 - val_acc: 0.8522\n",
|
||||
"Epoch 21/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4122 - acc: 0.8424 - val_loss: 0.3974 - val_acc: 0.8506\n",
|
||||
"Epoch 22/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4099 - acc: 0.8435 - val_loss: 0.3918 - val_acc: 0.8522\n",
|
||||
"Epoch 23/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4075 - acc: 0.8443 - val_loss: 0.3901 - val_acc: 0.8513\n",
|
||||
"Epoch 24/50\n",
|
||||
"549367/549367 [==============================] - 24s 44us/step - loss: 0.4067 - acc: 0.8447 - val_loss: 0.3885 - val_acc: 0.8543\n",
|
||||
"Epoch 25/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4047 - acc: 0.8454 - val_loss: 0.3846 - val_acc: 0.8531\n",
|
||||
"Epoch 26/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4031 - acc: 0.8461 - val_loss: 0.3864 - val_acc: 0.8562\n",
|
||||
"Epoch 27/50\n",
|
||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4020 - acc: 0.8467 - val_loss: 0.3874 - val_acc: 0.8546\n",
|
||||
"Epoch 28/50\n",
|
||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4001 - acc: 0.8473 - val_loss: 0.3848 - val_acc: 0.8534\n",
|
||||
"Epoch 29/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3991 - acc: 0.8479 - val_loss: 0.3865 - val_acc: 0.8562\n",
|
||||
"Epoch 30/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3976 - acc: 0.8484 - val_loss: 0.3833 - val_acc: 0.8574\n",
|
||||
"Epoch 31/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3961 - acc: 0.8487 - val_loss: 0.3846 - val_acc: 0.8585\n",
|
||||
"Epoch 32/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3942 - acc: 0.8498 - val_loss: 0.3805 - val_acc: 0.8573\n",
|
||||
"Epoch 33/50\n",
|
||||
"549367/549367 [==============================] - 24s 44us/step - loss: 0.3935 - acc: 0.8503 - val_loss: 0.3856 - val_acc: 0.8579\n",
|
||||
"Epoch 34/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3923 - acc: 0.8507 - val_loss: 0.3829 - val_acc: 0.8560\n",
|
||||
"Epoch 35/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3920 - acc: 0.8508 - val_loss: 0.3864 - val_acc: 0.8575\n",
|
||||
"Epoch 36/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3907 - acc: 0.8516 - val_loss: 0.3873 - val_acc: 0.8563\n",
|
||||
"Epoch 37/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3891 - acc: 0.8519 - val_loss: 0.3850 - val_acc: 0.8570\n",
|
||||
"Epoch 38/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3872 - acc: 0.8522 - val_loss: 0.3815 - val_acc: 0.8591\n",
|
||||
"Epoch 39/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3887 - acc: 0.8520 - val_loss: 0.3829 - val_acc: 0.8590\n",
|
||||
"Epoch 40/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3868 - acc: 0.8531 - val_loss: 0.3807 - val_acc: 0.8600\n",
|
||||
"Epoch 41/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3859 - acc: 0.8537 - val_loss: 0.3832 - val_acc: 0.8574\n",
|
||||
"Epoch 42/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3849 - acc: 0.8537 - val_loss: 0.3850 - val_acc: 0.8576\n",
|
||||
"Epoch 43/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3834 - acc: 0.8541 - val_loss: 0.3825 - val_acc: 0.8563\n",
|
||||
"Epoch 44/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3829 - acc: 0.8548 - val_loss: 0.3844 - val_acc: 0.8540\n",
|
||||
"Epoch 45/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8552 - val_loss: 0.3841 - val_acc: 0.8559\n",
|
||||
"Epoch 46/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8549 - val_loss: 0.3880 - val_acc: 0.8567\n",
|
||||
"Epoch 47/50\n",
|
||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.3799 - acc: 0.8559 - val_loss: 0.3767 - val_acc: 0.8635\n",
|
||||
"Epoch 48/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3800 - acc: 0.8560 - val_loss: 0.3786 - val_acc: 0.8563\n",
|
||||
"Epoch 49/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3781 - acc: 0.8563 - val_loss: 0.3812 - val_acc: 0.8596\n",
|
||||
"Epoch 50/50\n",
|
||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3788 - acc: 0.8560 - val_loss: 0.3782 - val_acc: 0.8601\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<keras.callbacks.History at 0x7f5ca1bf3e48>"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"m1.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This model performs the same as the slightly more complex model that evaluates alignments in both directions. Note also that processing time is improved, from 64 down to 48 microseconds per step. \n",
|
||||
"\n",
|
||||
"Let's now look at an asymmetric model that evaluates text to hypothesis comparisons. The prediction is that such a model will correctly classify a decent proportion of the exemplars, but not as accurately as the previous two.\n",
|
||||
"\n",
|
||||
"We'll just use 10 epochs for expediency."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 96,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"Layer (type) Output Shape Param # Connected to \n",
|
||||
"==================================================================================================\n",
|
||||
"words1 (InputLayer) (None, 50) 0 \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"words2 (InputLayer) (None, 50) 0 \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_13 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||
" words2[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_14 (Sequential) (None, 50, 200) 80400 sequential_13[1][0] \n",
|
||||
" sequential_13[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_8 (Dot) (None, 50, 50) 0 sequential_14[1][0] \n",
|
||||
" sequential_14[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_9 (Lambda) (None, 50, 50) 0 dot_8[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dot_9 (Dot) (None, 50, 200) 0 lambda_9[0][0] \n",
|
||||
" sequential_13[2][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"concatenate_6 (Concatenate) (None, 50, 400) 0 sequential_13[1][0] \n",
|
||||
" dot_9[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"time_distributed_9 (TimeDistrib (None, 50, 200) 120400 concatenate_6[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"lambda_10 (Lambda) (None, 200) 0 time_distributed_9[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"sequential_16 (Sequential) (None, 200) 80400 lambda_10[0][0] \n",
|
||||
"__________________________________________________________________________________________________\n",
|
||||
"dense_32 (Dense) (None, 3) 603 sequential_16[1][0] \n",
|
||||
"==================================================================================================\n",
|
||||
"Total params: 321,663,403\n",
|
||||
"Trainable params: 341,803\n",
|
||||
"Non-trainable params: 321,321,600\n",
|
||||
"__________________________________________________________________________________________________\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"m2 = build_model(sem_vectors, 50, 200, 3, 200, 'right')\n",
|
||||
"m2.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 97,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train on 455226 samples, validate on 113807 samples\n",
|
||||
"Epoch 1/10\n",
|
||||
"455226/455226 [==============================] - 22s 49us/step - loss: 0.8920 - acc: 0.5771 - val_loss: 0.8001 - val_acc: 0.6435\n",
|
||||
"Epoch 2/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7808 - acc: 0.6553 - val_loss: 0.7267 - val_acc: 0.6855\n",
|
||||
"Epoch 3/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7329 - acc: 0.6825 - val_loss: 0.6966 - val_acc: 0.7006\n",
|
||||
"Epoch 4/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7055 - acc: 0.6978 - val_loss: 0.6713 - val_acc: 0.7150\n",
|
||||
"Epoch 5/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6862 - acc: 0.7081 - val_loss: 0.6533 - val_acc: 0.7253\n",
|
||||
"Epoch 6/10\n",
|
||||
"455226/455226 [==============================] - 21s 47us/step - loss: 0.6694 - acc: 0.7179 - val_loss: 0.6472 - val_acc: 0.7277\n",
|
||||
"Epoch 7/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6555 - acc: 0.7252 - val_loss: 0.6338 - val_acc: 0.7347\n",
|
||||
"Epoch 8/10\n",
|
||||
"455226/455226 [==============================] - 22s 48us/step - loss: 0.6434 - acc: 0.7310 - val_loss: 0.6246 - val_acc: 0.7385\n",
|
||||
"Epoch 9/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6325 - acc: 0.7367 - val_loss: 0.6164 - val_acc: 0.7424\n",
|
||||
"Epoch 10/10\n",
|
||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6216 - acc: 0.7426 - val_loss: 0.6082 - val_acc: 0.7478\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<keras.callbacks.History at 0x7fa6850cf080>"
|
||||
]
|
||||
},
|
||||
"execution_count": 97,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"m2.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=10,validation_split=.2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Comparing this fit to the validation accuracy of the previous two models after 10 epochs, we observe that its accuracy is roughly 10% lower.\n",
|
||||
"\n",
|
||||
"It is reassuring that the neural modeling here reproduces what we know from the semantics of natural language!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
27
examples/pipeline/fix_space_entities.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
'''Demonstrate adding a rule-based component that forces some tokens not to
be entities, before the NER tagger is applied. This is used to hotfix the issue
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.
'''
import spacy
from spacy.attrs import ENT_IOB


def fix_space_tags(doc):
    ent_iobs = doc.to_array([ENT_IOB])
    for i, token in enumerate(doc):
        if token.is_space:
            # Set the 'O' tag (ENT_IOB values: 0 is unset, 1 is I, 2 is O, 3 is B)
            ent_iobs[i] = 2
    doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
    return doc


def main():
    nlp = spacy.load('en_core_web_sm')
    text = u'''This is some crazy test where I dont need an Apple Watch to make things bug'''
    doc = nlp(text)
    print('Before', doc.ents)
    nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
    doc = nlp(text)
    print('After', doc.ents)


if __name__ == '__main__':
    main()
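As a small aside (not part of the committed example), one way to confirm where the component lands is to inspect `nlp.pipe_names` after adding it. This sketch assumes the same `fix_space_tags` function and model as above:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
    # The custom component should sit directly before the entity recognizer,
    # e.g. ['tagger', 'parser', 'fix-ner', 'ner']
    print(nlp.pipe_names)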
|
|
@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
|
|||
|
||||
import plac
|
||||
import random
|
||||
import spacy
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
# training data: texts, heads and dependency labels
|
||||
|
@ -63,7 +64,7 @@ TRAIN_DATA = [
|
|||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
n_iter=("Number of training iterations", "option", "n", int))
|
||||
def main(model=None, output_dir=None, n_iter=5):
|
||||
def main(model=None, output_dir=None, n_iter=15):
|
||||
"""Load the model, set up the pipeline and train the parser."""
|
||||
if model is not None:
|
||||
nlp = spacy.load(model) # load existing spaCy model
|
||||
|
@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
|
|||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annotations in TRAIN_DATA:
|
||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
||||
print(losses)
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
print('Losses', losses)
|
||||
|
||||
# test the trained model
|
||||
test_model(nlp)
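The same minibatch/compounding change recurs in several of the example scripts below, so here is a hypothetical illustration of the batch-size schedule it sets up. It assumes spaCy's `spacy.util.compounding(start, stop, compound)` generator (which starts at `start`, multiplies by `compound` each step, and is capped at `stop`) and `spacy.util.minibatch`, which reads one size per batch:

    from spacy.util import minibatch, compounding

    sizes = compounding(4.0, 32.0, 1.001)
    # Early batches are small and grow slowly towards 32:
    print([round(next(sizes), 3) for _ in range(5)])   # [4.0, 4.004, 4.008, 4.012, 4.016]

    # minibatch() truncates each size to an int when slicing the training data:
    toy_data = ['example %d' % i for i in range(10)]
    for batch in minibatch(toy_data, size=compounding(4.0, 32.0, 1.001)):
        print(len(batch))   # 4, 4, 2 for this toy list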
|
||||
|
@ -135,7 +139,8 @@ if __name__ == '__main__':
|
|||
# [
|
||||
# ('find', 'ROOT', 'find'),
|
||||
# ('cheapest', 'QUALITY', 'gym'),
|
||||
# ('gym', 'PLACE', 'find')
|
||||
# ('gym', 'PLACE', 'find'),
|
||||
# ('near', 'ATTRIBUTE', 'gym'),
|
||||
# ('work', 'LOCATION', 'near')
|
||||
# ]
|
||||
# show me the best hotel in berlin
|
||||
|
|
|
@ -15,6 +15,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
# training data
|
||||
|
@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
|
|||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annotations in TRAIN_DATA:
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(
|
||||
[text], # batch of texts
|
||||
[annotations], # batch of annotations
|
||||
texts, # batch of texts
|
||||
annotations, # batch of annotations
|
||||
drop=0.5, # dropout - make it harder to memorise data
|
||||
sgd=optimizer, # callable to update weights
|
||||
losses=losses)
|
||||
print(losses)
|
||||
print('Losses', losses)
|
||||
|
||||
# test the trained model
|
||||
for text, _ in TRAIN_DATA:
|
||||
|
|
|
@ -31,6 +31,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
# new entity label
|
||||
|
@ -73,7 +74,7 @@ TRAIN_DATA = [
|
|||
new_model_name=("New model name for model meta.", "option", "nm", str),
|
||||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
n_iter=("Number of training iterations", "option", "n", int))
|
||||
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
||||
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
|
||||
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
||||
if model is not None:
|
||||
nlp = spacy.load(model) # load existing spaCy model
|
||||
|
@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
|||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annotations in TRAIN_DATA:
|
||||
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
|
||||
losses=losses)
|
||||
print(losses)
|
||||
print('Losses', losses)
|
||||
|
||||
# test the trained model
|
||||
test_text = 'Do you like horses?'
|
||||
|
|
|
@ -13,6 +13,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
# training data
|
||||
|
@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
|
|||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annotations in TRAIN_DATA:
|
||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
||||
print(losses)
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
print('Losses', losses)
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like securities."
|
||||
|
|
|
@ -16,6 +16,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
# You need to define a mapping from your data's part-of-speech tag names to the
|
||||
|
@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
|
|||
for i in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annotations in TRAIN_DATA:
|
||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
||||
print(losses)
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
print('Losses', losses)
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like blue eggs"
|
||||
|
|
|
@ -2,7 +2,7 @@ cython>=0.25
|
|||
numpy>=1.15.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=2.0.1,<2.1.0
|
||||
thinc==7.0.0.dev1
|
||||
thinc==7.0.0.dev2
|
||||
blis>=0.2.2,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cytoolz>=0.9.0,<0.10.0
|
||||
|
@ -11,7 +11,11 @@ ujson>=1.35
|
|||
dill>=0.2,<0.3
|
||||
regex==2018.01.10
|
||||
requests>=2.13.0,<3.0.0
|
||||
jsonschema>=2.6.0,<3.0.0
|
||||
wasabi>=0.0.8,<1.1.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
# Development dependencies
|
||||
pytest>=4.0.0,<5.0.0
|
||||
pytest-timeout>=1.3.0,<2.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
flake8>=3.5.0,<3.6.0
|
||||
|
|
4
setup.py
|
@ -200,13 +200,15 @@ def setup_package():
|
|||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=2.0.1,<2.1.0",
|
||||
"thinc==7.0.0.dev1",
|
||||
"thinc==7.0.0.dev2",
|
||||
"blis>=0.2.2,<0.3.0",
|
||||
"plac<1.0.0,>=0.9.6",
|
||||
"ujson>=1.35",
|
||||
"regex==2018.01.10",
|
||||
"dill>=0.2,<0.3",
|
||||
"requests>=2.13.0,<3.0.0",
|
||||
"jsonschema>=2.6.0,<3.0.0",
|
||||
"wasabi>=0.0.8,<1.1.0",
|
||||
'pathlib==1.0.1; python_version < "3.4"',
|
||||
],
|
||||
setup_requires=["wheel"],
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
|
||||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
||||
|
||||
# These are imported as part of the API
|
||||
from thinc.neural.util import prefer_gpu, require_gpu
|
||||
|
||||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .about import __version__
|
||||
|
@ -12,7 +16,7 @@ from . import util
|
|||
|
||||
|
||||
def load(name, **overrides):
|
||||
depr_path = overrides.get('path')
|
||||
depr_path = overrides.get("path")
|
||||
if depr_path not in (True, False, None):
|
||||
deprecation_warning(Warnings.W001.format(path=depr_path))
|
||||
return util.load_model(name, **overrides)
|
||||
|
|
|
@ -1,40 +1,41 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function
|
||||
|
||||
# NB! This breaks in plac on Python 2!!
|
||||
# from __future__ import unicode_literals
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import Printer
|
||||
from spacy.cli import download, link, info, package, train, pretrain, convert
|
||||
from spacy.cli import vocab, init_model, profile, evaluate, validate
|
||||
from spacy.cli import ud_train, ud_evaluate
|
||||
from spacy.util import prints
|
||||
from spacy.cli import init_model, profile, evaluate, validate
|
||||
from spacy.cli import ud_train, ud_evaluate, debug_data
|
||||
|
||||
msg = Printer()
|
||||
|
||||
commands = {
|
||||
'download': download,
|
||||
'link': link,
|
||||
'info': info,
|
||||
'train': train,
|
||||
'pretrain': pretrain,
|
||||
'ud-train': ud_train,
|
||||
'evaluate': evaluate,
|
||||
'ud-evaluate': ud_evaluate,
|
||||
'convert': convert,
|
||||
'package': package,
|
||||
'vocab': vocab,
|
||||
'init-model': init_model,
|
||||
'profile': profile,
|
||||
'validate': validate
|
||||
"download": download,
|
||||
"link": link,
|
||||
"info": info,
|
||||
"train": train,
|
||||
"pretrain": pretrain,
|
||||
"debug-data": debug_data,
|
||||
"ud-train": ud_train,
|
||||
"evaluate": evaluate,
|
||||
"ud-evaluate": ud_evaluate,
|
||||
"convert": convert,
|
||||
"package": package,
|
||||
"init-model": init_model,
|
||||
"profile": profile,
|
||||
"validate": validate,
|
||||
}
|
||||
if len(sys.argv) == 1:
|
||||
prints(', '.join(commands), title="Available commands", exits=1)
|
||||
msg.info("Available commands", ", ".join(commands), exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
sys.argv[0] = 'spacy %s' % command
|
||||
sys.argv[0] = "spacy %s" % command
|
||||
if command in commands:
|
||||
plac.call(commands[command], sys.argv[1:])
|
||||
else:
|
||||
prints(
|
||||
"Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command,
|
||||
exits=1)
|
||||
available = "Available: {}".format(", ".join(commands))
|
||||
msg.fail("Unknown command: {}".format(command), available, exits=1)
|
||||
|
|
303
spacy/_ml.py
|
@ -14,8 +14,7 @@ from thinc.api import uniqued, wrap, noop
|
|||
from thinc.api import with_square_sequences
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module, copy_array
|
||||
from thinc.neural._lsuv import svd_orthonormal
|
||||
from thinc.neural.util import get_array_module
|
||||
from thinc.neural.optimizers import Adam
|
||||
|
||||
from thinc import describe
|
||||
|
@ -30,39 +29,39 @@ from . import util
|
|||
try:
|
||||
import torch.nn
|
||||
from thinc.extra.wrappers import PyTorchWrapperRNN
|
||||
except:
|
||||
except ImportError:
|
||||
torch = None
|
||||
|
||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
||||
VECTORS_KEY = "spacy_pretrained_vectors"
|
||||
|
||||
|
||||
def cosine(vec1, vec2):
|
||||
xp = get_array_module(vec1)
|
||||
norm1 = xp.linalg.norm(vec1)
|
||||
norm2 = xp.linalg.norm(vec2)
|
||||
if norm1 == 0. or norm2 == 0.:
|
||||
if norm1 == 0.0 or norm2 == 0.0:
|
||||
return 0
|
||||
else:
|
||||
return vec1.dot(vec2) / (norm1 * norm2)
|
||||
|
||||
|
||||
def create_default_optimizer(ops, **cfg):
|
||||
learn_rate = util.env_opt('learn_rate', 0.001)
|
||||
beta1 = util.env_opt('optimizer_B1', 0.8)
|
||||
beta2 = util.env_opt('optimizer_B2', 0.8)
|
||||
eps = util.env_opt('optimizer_eps', 0.00001)
|
||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||
max_grad_norm = util.env_opt('grad_norm_clip', 5.)
|
||||
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
learn_rate = util.env_opt("learn_rate", 0.001)
|
||||
beta1 = util.env_opt("optimizer_B1", 0.8)
|
||||
beta2 = util.env_opt("optimizer_B2", 0.8)
|
||||
eps = util.env_opt("optimizer_eps", 0.00001)
|
||||
L2 = util.env_opt("L2_penalty", 1e-6)
|
||||
max_grad_norm = util.env_opt("grad_norm_clip", 5.0)
|
||||
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
|
||||
optimizer.max_grad_norm = max_grad_norm
|
||||
optimizer.device = ops.device
|
||||
return optimizer
|
||||
|
||||
|
||||
@layerize
|
||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||
|
||||
def finish_update(d_X, sgd=None):
|
||||
return ops.unflatten(d_X, lengths, pad=pad)
|
||||
|
@ -74,14 +73,15 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
|||
def _zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
self.W.fill(0)
|
||||
|
||||
model.on_data_hooks.append(_zero_init_impl)
|
||||
if model.W is not None:
|
||||
model.W.fill(0.)
|
||||
model.W.fill(0.0)
|
||||
return model
|
||||
|
||||
|
||||
@layerize
|
||||
def _preprocess_doc(docs, drop=0.):
|
||||
def _preprocess_doc(docs, drop=0.0):
|
||||
keys = [doc.to_array(LOWER) for doc in docs]
|
||||
ops = Model.ops
|
||||
# The dtype here matches what thinc is expecting -- which differs per
|
||||
|
@ -89,11 +89,12 @@ def _preprocess_doc(docs, drop=0.):
|
|||
# is fixed on Thinc's side.
|
||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||
keys = ops.xp.concatenate(keys)
|
||||
vals = ops.allocate(keys.shape) + 1.
|
||||
vals = ops.allocate(keys.shape) + 1.0
|
||||
return (keys, vals, lengths), None
|
||||
|
||||
|
||||
@layerize
|
||||
def _preprocess_doc_bigrams(docs, drop=0.):
|
||||
def _preprocess_doc_bigrams(docs, drop=0.0):
|
||||
unigrams = [doc.to_array(LOWER) for doc in docs]
|
||||
ops = Model.ops
|
||||
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
||||
|
@ -104,27 +105,29 @@ def _preprocess_doc_bigrams(docs, drop=0.):
|
|||
# is fixed on Thinc's side.
|
||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||
keys = ops.xp.concatenate(keys)
|
||||
vals = ops.asarray(ops.xp.concatenate(vals), dtype='f')
|
||||
vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
|
||||
return (keys, vals, lengths), None
|
||||
|
||||
|
||||
@describe.on_data(_set_dimensions_if_needed,
|
||||
lambda model, X, y: model.init_weights(model))
|
||||
@describe.on_data(
|
||||
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
|
||||
)
|
||||
@describe.attributes(
|
||||
nI=Dimension("Input size"),
|
||||
nF=Dimension("Number of features"),
|
||||
nO=Dimension("Output size"),
|
||||
nP=Dimension("Maxout pieces"),
|
||||
W=Synapses("Weights matrix",
|
||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
||||
b=Biases("Bias vector",
|
||||
lambda obj: (obj.nO, obj.nP)),
|
||||
pad=Synapses("Pad",
|
||||
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
||||
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
|
||||
pad=Synapses(
|
||||
"Pad",
|
||||
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
||||
lambda M, ops: ops.normal_init(M, 1.)),
|
||||
lambda M, ops: ops.normal_init(M, 1.0),
|
||||
),
|
||||
d_W=Gradient("W"),
|
||||
d_pad=Gradient("pad"),
|
||||
d_b=Gradient("b"))
|
||||
d_b=Gradient("b"),
|
||||
)
|
||||
class PrecomputableAffine(Model):
|
||||
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
||||
Model.__init__(self, **kwargs)
|
||||
|
@ -133,9 +136,10 @@ class PrecomputableAffine(Model):
|
|||
self.nI = nI
|
||||
self.nF = nF
|
||||
|
||||
def begin_update(self, X, drop=0.):
|
||||
Yf = self.ops.gemm(X,
|
||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
|
||||
def begin_update(self, X, drop=0.0):
|
||||
Yf = self.ops.gemm(
|
||||
X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
|
||||
)
|
||||
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
||||
Yf = self._add_padding(Yf)
|
||||
|
||||
|
@ -146,15 +150,16 @@ class PrecomputableAffine(Model):
|
|||
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
||||
|
||||
self.d_b += dY.sum(axis=0)
|
||||
dY = dY.reshape((dY.shape[0], self.nO*self.nP))
|
||||
dY = dY.reshape((dY.shape[0], self.nO * self.nP))
|
||||
|
||||
Wopfi = self.W.transpose((1, 2, 0, 3))
|
||||
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
||||
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
||||
Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
|
||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
|
||||
|
||||
# Reuse the buffer
|
||||
dWopfi = Wopfi; dWopfi.fill(0.)
|
||||
dWopfi = Wopfi
|
||||
dWopfi.fill(0.0)
|
||||
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
||||
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
||||
# (o, p, f, i) --> (f, o, p, i)
|
||||
|
@ -163,6 +168,7 @@ class PrecomputableAffine(Model):
|
|||
if sgd is not None:
|
||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
||||
|
||||
return Yf, backward
|
||||
|
||||
def _add_padding(self, Yf):
|
||||
|
@ -171,7 +177,7 @@ class PrecomputableAffine(Model):
|
|||
|
||||
def _backprop_padding(self, dY, ids):
|
||||
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
||||
mask = ids < 0.
|
||||
mask = ids < 0.0
|
||||
mask = mask.sum(axis=1)
|
||||
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
||||
self.d_pad += d_pad.sum(axis=0)
|
||||
|
@ -179,33 +185,36 @@ class PrecomputableAffine(Model):
|
|||
|
||||
@staticmethod
|
||||
def init_weights(model):
|
||||
'''This is like the 'layer sequential unit variance', but instead
|
||||
"""This is like the 'layer sequential unit variance', but instead
|
||||
of taking the actual inputs, we randomly generate whitened data.
|
||||
|
||||
Why's this all so complicated? We have a huge number of inputs,
|
||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||
we set the maxout weights to values that empirically result in
|
||||
whitened outputs given whitened inputs.
|
||||
'''
|
||||
if (model.W**2).sum() != 0.:
|
||||
"""
|
||||
if (model.W ** 2).sum() != 0.0:
|
||||
return
|
||||
ops = model.ops
|
||||
xp = ops.xp
|
||||
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
||||
|
||||
ids = ops.allocate((5000, model.nF), dtype='f')
|
||||
ids = ops.allocate((5000, model.nF), dtype="f")
|
||||
ids += xp.random.uniform(0, 1000, ids.shape)
|
||||
ids = ops.asarray(ids, dtype='i')
|
||||
tokvecs = ops.allocate((5000, model.nI), dtype='f')
|
||||
tokvecs += xp.random.normal(loc=0., scale=1.,
|
||||
size=tokvecs.size).reshape(tokvecs.shape)
|
||||
ids = ops.asarray(ids, dtype="i")
|
||||
tokvecs = ops.allocate((5000, model.nI), dtype="f")
|
||||
tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||
tokvecs.shape
|
||||
)
|
||||
|
||||
def predict(ids, tokvecs):
|
||||
# nS ids. nW tokvecs. Exclude the padding array.
|
||||
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
||||
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype='f')
|
||||
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
||||
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
|
||||
# need nS vectors
|
||||
hiddens = hiddens.reshape((hiddens.shape[0] * model.nF, model.nO * model.nP))
|
||||
hiddens = hiddens.reshape(
|
||||
(hiddens.shape[0] * model.nF, model.nO * model.nP)
|
||||
)
|
||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
||||
vectors += model.b
|
||||
|
@ -238,7 +247,8 @@ def link_vectors_to_models(vocab):
|
|||
if vectors.data.size != 0:
|
||||
print(
|
||||
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
||||
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
|
||||
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape
|
||||
)
|
||||
ops = Model.ops
|
||||
for word in vocab:
|
||||
if word.orth in vectors.key2row:
|
||||
|
@ -254,28 +264,31 @@ def link_vectors_to_models(vocab):
|
|||
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
||||
if depth == 0:
|
||||
return layerize(noop())
|
||||
model = torch.nn.LSTM(nI, nO//2, depth, bidirectional=True, dropout=dropout)
|
||||
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
|
||||
return with_square_sequences(PyTorchWrapperRNN(model))
|
||||
|
||||
|
||||
def Tok2Vec(width, embed_size, **kwargs):
|
||||
pretrained_vectors = kwargs.get('pretrained_vectors', None)
|
||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
||||
subword_features = kwargs.get('subword_features', True)
|
||||
conv_depth = kwargs.get('conv_depth', 4)
|
||||
bilstm_depth = kwargs.get('bilstm_depth', 0)
|
||||
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
||||
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 2)
|
||||
subword_features = kwargs.get("subword_features", True)
|
||||
conv_depth = kwargs.get("conv_depth", 4)
|
||||
bilstm_depth = kwargs.get("bilstm_depth", 0)
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
|
||||
'+': add, '*': reapply}):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
|
||||
name='embed_norm')
|
||||
with Model.define_operators(
|
||||
{">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
|
||||
):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
|
||||
if subword_features:
|
||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
|
||||
name='embed_prefix')
|
||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
|
||||
name='embed_suffix')
|
||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
|
||||
name='embed_shape')
|
||||
prefix = HashEmbed(
|
||||
width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
|
||||
)
|
||||
shape = HashEmbed(
|
||||
width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
|
||||
)
|
||||
else:
|
||||
prefix, suffix, shape = (None, None, None)
|
||||
if pretrained_vectors is not None:
|
||||
|
@ -284,28 +297,29 @@ def Tok2Vec(width, embed_size, **kwargs):
|
|||
if subword_features:
|
||||
embed = uniqued(
|
||||
(glove | norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
|
||||
>> LN(Maxout(width, width * 5, pieces=3)),
|
||||
column=cols.index(ORTH),
|
||||
)
|
||||
else:
|
||||
embed = uniqued(
|
||||
(glove | norm)
|
||||
>> LN(Maxout(width, width*2, pieces=3)), column=cols.index(ORTH))
|
||||
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
|
||||
column=cols.index(ORTH),
|
||||
)
|
||||
elif subword_features:
|
||||
embed = uniqued(
|
||||
(norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))
|
||||
>> LN(Maxout(width, width * 4, pieces=3)),
|
||||
column=cols.index(ORTH),
|
||||
)
|
||||
else:
|
||||
embed = norm
|
||||
|
||||
convolution = Residual(
|
||||
ExtractWindow(nW=1)
|
||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
||||
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
|
||||
)
|
||||
tok2vec = (
|
||||
FeatureExtracter(cols)
|
||||
>> with_flatten(
|
||||
embed
|
||||
>> convolution ** conv_depth, pad=conv_depth
|
||||
)
|
||||
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
||||
embed >> convolution ** conv_depth, pad=conv_depth
|
||||
)
|
||||
if bilstm_depth >= 1:
|
||||
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
||||
|
@ -316,7 +330,7 @@ def Tok2Vec(width, embed_size, **kwargs):
|
|||
|
||||
|
||||
def reapply(layer, n_times):
|
||||
def reapply_fwd(X, drop=0.):
|
||||
def reapply_fwd(X, drop=0.0):
|
||||
backprops = []
|
||||
for i in range(n_times):
|
||||
Y, backprop = layer.begin_update(X, drop=drop)
|
||||
|
@ -334,12 +348,14 @@ def reapply(layer, n_times):
|
|||
return dX
|
||||
|
||||
return Y, reapply_bwd
|
||||
|
||||
return wrap(reapply_fwd, layer)
|
||||
|
||||
|
||||
def asarray(ops, dtype):
|
||||
def forward(X, drop=0.):
|
||||
def forward(X, drop=0.0):
|
||||
return ops.asarray(X, dtype=dtype), None
|
||||
|
||||
return layerize(forward)
|
||||
|
||||
|
||||
|
@ -347,7 +363,7 @@ def _divide_array(X, size):
|
|||
parts = []
|
||||
index = 0
|
||||
while index < len(X):
|
||||
parts.append(X[index:index + size])
|
||||
parts.append(X[index : index + size])
|
||||
index += size
|
||||
return parts
|
||||
|
||||
|
@ -356,7 +372,7 @@ def get_col(idx):
|
|||
if idx < 0:
|
||||
raise IndexError(Errors.E066.format(value=idx))
|
||||
|
||||
def forward(X, drop=0.):
|
||||
def forward(X, drop=0.0):
|
||||
if isinstance(X, numpy.ndarray):
|
||||
ops = NumpyOps()
|
||||
else:
|
||||
|
@ -377,7 +393,7 @@ def doc2feats(cols=None):
|
|||
if cols is None:
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
|
||||
def forward(docs, drop=0.):
|
||||
def forward(docs, drop=0.0):
|
||||
feats = []
|
||||
for doc in docs:
|
||||
feats.append(doc.to_array(cols))
|
||||
|
@ -389,13 +405,14 @@ def doc2feats(cols=None):
|
|||
|
||||
|
||||
def print_shape(prefix):
|
||||
def forward(X, drop=0.):
|
||||
def forward(X, drop=0.0):
|
||||
return X, lambda dX, **kwargs: dX
|
||||
|
||||
return layerize(forward)
|
||||
|
||||
|
||||
@layerize
|
||||
def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
|
||||
tokens, attrs, vectors = tokens_attrs_vectors
|
||||
|
||||
def backward(d_output, sgd=None):
|
||||
|
@ -405,17 +422,17 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
|||
|
||||
|
||||
@layerize
|
||||
def logistic(X, drop=0.):
|
||||
def logistic(X, drop=0.0):
|
||||
xp = get_array_module(X)
|
||||
if not isinstance(X, xp.ndarray):
|
||||
X = xp.asarray(X)
|
||||
# Clip to range (-10, 10)
|
||||
X = xp.minimum(X, 10., X)
|
||||
X = xp.maximum(X, -10., X)
|
||||
Y = 1. / (1. + xp.exp(-X))
|
||||
X = xp.minimum(X, 10.0, X)
|
||||
X = xp.maximum(X, -10.0, X)
|
||||
Y = 1.0 / (1.0 + xp.exp(-X))
|
||||
|
||||
def logistic_bwd(dY, sgd=None):
|
||||
dX = dY * (Y * (1-Y))
|
||||
dX = dY * (Y * (1 - Y))
|
||||
return dX
|
||||
|
||||
return Y, logistic_bwd
|
||||
|
@ -424,12 +441,13 @@ def logistic(X, drop=0.):
|
|||
def zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
self.W.fill(0)
|
||||
|
||||
model.on_data_hooks.append(_zero_init_impl)
|
||||
return model
|
||||
|
||||
|
||||
@layerize
|
||||
def preprocess_doc(docs, drop=0.):
|
||||
def preprocess_doc(docs, drop=0.0):
|
||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
||||
|
@ -439,31 +457,32 @@ def preprocess_doc(docs, drop=0.):
|
|||
|
||||
|
||||
def getitem(i):
|
||||
def getitem_fwd(X, drop=0.):
|
||||
def getitem_fwd(X, drop=0.0):
|
||||
return X[i], None
|
||||
|
||||
return layerize(getitem_fwd)
|
||||
|
||||
|
||||
def build_tagger_model(nr_class, **cfg):
|
||||
embed_size = util.env_opt('embed_size', 2000)
|
||||
if 'token_vector_width' in cfg:
|
||||
token_vector_width = cfg['token_vector_width']
|
||||
embed_size = util.env_opt("embed_size", 2000)
|
||||
if "token_vector_width" in cfg:
|
||||
token_vector_width = cfg["token_vector_width"]
|
||||
else:
|
||||
token_vector_width = util.env_opt('token_vector_width', 96)
|
||||
pretrained_vectors = cfg.get('pretrained_vectors')
|
||||
subword_features = cfg.get('subword_features', True)
|
||||
with Model.define_operators({'>>': chain, '+': add}):
|
||||
if 'tok2vec' in cfg:
|
||||
tok2vec = cfg['tok2vec']
|
||||
token_vector_width = util.env_opt("token_vector_width", 96)
|
||||
pretrained_vectors = cfg.get("pretrained_vectors")
|
||||
subword_features = cfg.get("subword_features", True)
|
||||
with Model.define_operators({">>": chain, "+": add}):
|
||||
if "tok2vec" in cfg:
|
||||
tok2vec = cfg["tok2vec"]
|
||||
else:
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||
subword_features=subword_features,
|
||||
pretrained_vectors=pretrained_vectors)
|
||||
tok2vec = Tok2Vec(
|
||||
token_vector_width,
|
||||
embed_size,
|
||||
subword_features=subword_features,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
)
|
||||
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
||||
model = (
|
||||
tok2vec
|
||||
>> softmax
|
||||
)
|
||||
model = tok2vec >> softmax
|
||||
model.nI = None
|
||||
model.tok2vec = tok2vec
|
||||
model.softmax = softmax
|
||||
|
@ -471,10 +490,10 @@ def build_tagger_model(nr_class, **cfg):
|
|||
|
||||
|
||||
@layerize
|
||||
def SpacyVectors(docs, drop=0.):
|
||||
def SpacyVectors(docs, drop=0.0):
|
||||
batch = []
|
||||
for doc in docs:
|
||||
indices = numpy.zeros((len(doc),), dtype='i')
|
||||
indices = numpy.zeros((len(doc),), dtype="i")
|
||||
for i, word in enumerate(doc):
|
||||
if word.orth in doc.vocab.vectors.key2row:
|
||||
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
||||
|
@ -486,12 +505,11 @@ def SpacyVectors(docs, drop=0.):
|
|||
|
||||
|
||||
def build_text_classifier(nr_class, width=64, **cfg):
|
||||
depth = cfg.get('depth', 2)
|
||||
nr_vector = cfg.get('nr_vector', 5000)
|
||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
||||
'**': clone}):
|
||||
if cfg.get('low_data') and pretrained_dims:
|
||||
depth = cfg.get("depth", 2)
|
||||
nr_vector = cfg.get("nr_vector", 5000)
|
||||
pretrained_dims = cfg.get("pretrained_dims", 0)
|
||||
with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
|
||||
if cfg.get("low_data") and pretrained_dims:
|
||||
model = (
|
||||
SpacyVectors
|
||||
>> flatten_add_lengths
|
||||
|
@ -505,41 +523,35 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
return model
|
||||
|
||||
lower = HashEmbed(width, nr_vector, column=1)
|
||||
prefix = HashEmbed(width//2, nr_vector, column=2)
|
||||
suffix = HashEmbed(width//2, nr_vector, column=3)
|
||||
shape = HashEmbed(width//2, nr_vector, column=4)
|
||||
prefix = HashEmbed(width // 2, nr_vector, column=2)
|
||||
suffix = HashEmbed(width // 2, nr_vector, column=3)
|
||||
shape = HashEmbed(width // 2, nr_vector, column=4)
|
||||
|
||||
trained_vectors = (
|
||||
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
|
||||
>> with_flatten(
|
||||
uniqued(
|
||||
(lower | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width+(width//2)*3)),
|
||||
column=0
|
||||
)
|
||||
trained_vectors = FeatureExtracter(
|
||||
[ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||
) >> with_flatten(
|
||||
uniqued(
|
||||
(lower | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width + (width // 2) * 3)),
|
||||
column=0,
|
||||
)
|
||||
)
|
||||
|
||||
if pretrained_dims:
|
||||
static_vectors = (
|
||||
SpacyVectors
|
||||
>> with_flatten(Affine(width, pretrained_dims))
|
||||
static_vectors = SpacyVectors >> with_flatten(
|
||||
Affine(width, pretrained_dims)
|
||||
)
|
||||
# TODO Make concatenate support lists
|
||||
vectors = concatenate_lists(trained_vectors, static_vectors)
|
||||
vectors_width = width*2
|
||||
vectors_width = width * 2
|
||||
else:
|
||||
vectors = trained_vectors
|
||||
vectors_width = width
|
||||
static_vectors = None
|
||||
tok2vec = (
|
||||
vectors
|
||||
>> with_flatten(
|
||||
LN(Maxout(width, vectors_width))
|
||||
>> Residual(
|
||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
||||
) ** depth, pad=depth
|
||||
)
|
||||
tok2vec = vectors >> with_flatten(
|
||||
LN(Maxout(width, vectors_width))
|
||||
>> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
|
||||
pad=depth,
|
||||
)
|
||||
cnn_model = (
|
||||
tok2vec
|
||||
|
@ -550,13 +562,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
||||
)
|
||||
|
||||
linear_model = (
|
||||
_preprocess_doc
|
||||
>> LinearModel(nr_class)
|
||||
)
|
||||
linear_model = _preprocess_doc >> LinearModel(nr_class)
|
||||
model = (
|
||||
(linear_model | cnn_model)
|
||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
||||
>> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
|
||||
>> logistic
|
||||
)
|
||||
model.tok2vec = tok2vec
|
||||
|
@ -566,9 +575,9 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
|
||||
|
||||
@layerize
|
||||
def flatten(seqs, drop=0.):
|
||||
def flatten(seqs, drop=0.0):
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||
|
||||
def finish_update(d_X, sgd=None):
|
||||
return ops.unflatten(d_X, lengths, pad=0)
|
||||
|
@ -583,14 +592,14 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
|
|||
"""
|
||||
if not layers:
|
||||
return noop()
|
||||
drop_factor = kwargs.get('drop_factor', 1.0)
|
||||
drop_factor = kwargs.get("drop_factor", 1.0)
|
||||
ops = layers[0].ops
|
||||
layers = [chain(layer, flatten) for layer in layers]
|
||||
concat = concatenate(*layers)
|
||||
|
||||
def concatenate_lists_fwd(Xs, drop=0.):
|
||||
def concatenate_lists_fwd(Xs, drop=0.0):
|
||||
drop *= drop_factor
|
||||
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
|
||||
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
|
||||
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
||||
ys = ops.unflatten(flat_y, lengths)
|
||||
|
||||
|
|
|
@ -1,16 +1,17 @@
|
|||
# inspired from:
|
||||
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||
# fmt: off
|
||||
|
||||
__title__ = 'spacy-nightly'
|
||||
__version__ = '2.1.0a3'
|
||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||
__uri__ = 'https://spacy.io'
|
||||
__author__ = 'Explosion AI'
|
||||
__email__ = 'contact@explosion.ai'
|
||||
__license__ = 'MIT'
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "2.1.0a3"
|
||||
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
|
||||
__uri__ = "https://spacy.io"
|
||||
__author__ = "Explosion AI"
|
||||
__email__ = "contact@explosion.ai"
|
||||
__license__ = "MIT"
|
||||
__release__ = False
|
||||
|
||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
from .download import download
|
||||
from .info import info
|
||||
from .link import link
|
||||
from .package import package
|
||||
from .profile import profile
|
||||
from .train import train
|
||||
from .pretrain import pretrain
|
||||
from .evaluate import evaluate
|
||||
from .convert import convert
|
||||
from .vocab import make_vocab as vocab
|
||||
from .init_model import init_model
|
||||
from .validate import validate
|
||||
from .ud_train import main as ud_train
|
||||
from .conll17_ud_eval import main as ud_evaluate
|
||||
from .download import download # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .link import link # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
from .profile import profile # noqa: F401
|
||||
from .train import train # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_model import init_model # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
from .ud import ud_train, ud_evaluate # noqa: F401
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# fmt: off
|
||||
|
||||
class Messages(object):
|
||||
M001 = ("Download successful but linking failed")
|
||||
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
|
@ -73,3 +75,31 @@ class Messages(object):
|
|||
M052 = ("Not a valid meta.json format")
|
||||
M053 = ("Expected dict but got: {meta_type}")
|
||||
M054 = ("No --lang specified, but tokenization required.")
|
||||
M055 = ("Training pipeline: {pipeline}")
|
||||
M056 = ("Starting with base model '{model}'")
|
||||
M057 = ("Starting with blank model '{model}'")
|
||||
M058 = ("Loading vector from model '{model}'")
|
||||
M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
|
||||
M060 = ("Counting training words (limit={limit})")
|
||||
M061 = ("\nSaving model...")
|
||||
M062 = ("Output directory is not empty.")
|
||||
M063 = ("Incompatible arguments")
|
||||
M064 = ("The -f and -c arguments are deprecated, and not compatible with "
|
||||
"the -j argument, which should specify the same information. "
|
||||
"Either merge the frequencies and clusters data into the "
|
||||
"JSONL-formatted file (recommended), or use only the -f and -c "
|
||||
"files, without the other lexical attributes.")
|
||||
M065 = ("This can lead to unintended side effects when saving the model. "
|
||||
"Please use an empty directory or a different path instead. If "
|
||||
"the specified output path doesn't exist, the directory will be "
|
||||
"created for you.")
|
||||
M066 = ("Saved model to output directory")
|
||||
M067 = ("Can't find lexical data")
|
||||
M068 = ("Sucessfully compiled vocab and vectors, and saved model")
|
||||
M069 = ("Unknown file type: '{name}'")
|
||||
M070 = ("Supported file types: '{options}'")
|
||||
M071 = ("Loaded pretrained tok2vec for: {components}")
|
||||
M072 = ("Model language ('{model_lang}') doesn't match language specified "
|
||||
"as `lang` argument ('{lang}') ")
|
||||
|
||||
# fmt: on
|
||||
|
|
|
@ -3,49 +3,91 @@ from __future__ import unicode_literals
|
|||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
||||
from ..util import write_jsonl, write_json
|
||||
from ..compat import json_dumps, path2str
|
||||
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
|
||||
from .converters import ner_jsonl2json
|
||||
from ._messages import Messages
|
||||
from ..util import prints
|
||||
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new
|
||||
# entry to this dict with the file extension mapped to the converter function
|
||||
# imported from /converters.
|
||||
CONVERTERS = {
|
||||
'conllubio': conllubio2json,
|
||||
'conllu': conllu2json,
|
||||
'conll': conllu2json,
|
||||
'ner': conll_ner2json,
|
||||
'iob': iob2json,
|
||||
'jsonl': ner_jsonl2json
|
||||
"conllubio": conllubio2json,
|
||||
"conllu": conllu2json,
|
||||
"conll": conllu2json,
|
||||
"ner": conll_ner2json,
|
||||
"iob": iob2json,
|
||||
"jsonl": ner_jsonl2json,
|
||||
}
|
||||
|
||||
# File types
|
||||
FILE_TYPES = ("json", "jsonl")
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("input file", "positional", None, str),
|
||||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory for converted file", "positional", None, str),
|
||||
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
||||
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
|
||||
lang=None):
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
)
|
||||
def convert(
|
||||
input_file,
|
||||
output_dir="-",
|
||||
file_type="jsonl",
|
||||
n_sents=1,
|
||||
morphology=False,
|
||||
converter="auto",
|
||||
lang=None,
|
||||
):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
experiment management functions. If no output_dir is specified, the data
|
||||
is written to stdout, so you can pipe it forward to a JSONL file:
|
||||
$ spacy convert some_file.conllu > some_file.jsonl
|
||||
"""
|
||||
msg = Printer()
|
||||
input_path = Path(input_file)
|
||||
output_path = Path(output_dir)
|
||||
if file_type not in FILE_TYPES:
|
||||
msg.fail(
|
||||
Messages.M069.format(name=file_type),
|
||||
Messages.M070.format(options=", ".join(FILE_TYPES)),
|
||||
exits=1,
|
||||
)
|
||||
if not input_path.exists():
|
||||
prints(input_path, title=Messages.M028, exits=1)
|
||||
if not output_path.exists():
|
||||
prints(output_path, title=Messages.M029, exits=1)
|
||||
if converter == 'auto':
|
||||
msg.fail(Messages.M028, input_path, exits=1)
|
||||
if output_dir != "-" and not Path(output_dir).exists():
|
||||
msg.fail(Messages.M029, output_dir, exits=1)
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter not in CONVERTERS:
|
||||
prints(Messages.M031.format(converter=converter),
|
||||
title=Messages.M030, exits=1)
|
||||
msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
func(input_path, output_path,
|
||||
n_sents=n_sents, use_morphology=morphology, lang=lang)
|
||||
input_data = input_path.open("r", encoding="utf-8").read()
|
||||
data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||
if file_type == "json":
|
||||
write_json(output_file, data)
|
||||
elif file_type == "jsonl":
|
||||
write_jsonl(output_file, data)
|
||||
msg.good(
|
||||
Messages.M032.format(name=path2str(output_file)),
|
||||
Messages.M033.format(n_docs=len(data)),
|
||||
)
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
print(json_dumps(data))
|
||||
elif file_type == "jsonl":
|
||||
for line in data:
|
||||
print(json_dumps(line))
|
||||
|
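For orientation, the refactored convert() above resolves the converter from the input file's extension when "auto" is passed and fails on unknown names. The helper below is a hypothetical, stripped-down sketch of that dispatch (the file name is invented), not the actual CLI code.

from pathlib import Path

KNOWN_CONVERTERS = ("conllubio", "conllu", "conll", "ner", "iob", "jsonl")

def pick_converter(input_file, converter="auto"):
    # "auto" falls back to the input file's extension, e.g. ".conllu" -> "conllu"
    if converter == "auto":
        converter = Path(input_file).suffix[1:]
    if converter not in KNOWN_CONVERTERS:
        raise ValueError("Can't find converter for: {}".format(converter))
    return converter

print(pick_converter("train.conllu"))  # conllu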
|
|
@ -1,5 +1,5 @@
|
|||
from .conllu2json import conllu2json
|
||||
from .conllubio2json import conllubio2json
|
||||
from .iob2json import iob2json
|
||||
from .conll_ner2json import conll_ner2json
|
||||
from .jsonl2json import ner_jsonl2json
|
||||
from .conllu2json import conllu2json # noqa: F401
|
||||
from .conllubio2json import conllubio2json # noqa: F401
|
||||
from .iob2json import iob2json # noqa: F401
|
||||
from .conll_ner2json import conll_ner2json # noqa: F401
|
||||
from .jsonl2json import ner_jsonl2json # noqa: F401
|
||||
|
|
|
@ -1,52 +1,38 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||
def conll_ner2json(input_data, **kwargs):
|
||||
"""
|
||||
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||
train cli.
|
||||
"""
|
||||
docs = read_conll_ner(input_path)
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
|
||||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
|
||||
|
||||
def read_conll_ner(input_path):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
i = 0
|
||||
delimit_docs = '-DOCSTART- -X- O O'
|
||||
delimit_docs = "-DOCSTART- -X- O O"
|
||||
output_docs = []
|
||||
for doc in text.strip().split(delimit_docs):
|
||||
for doc in input_data.strip().split(delimit_docs):
|
||||
doc = doc.strip()
|
||||
if not doc:
|
||||
continue
|
||||
output_doc = []
|
||||
for sent in doc.split('\n\n'):
|
||||
for sent in doc.split("\n\n"):
|
||||
sent = sent.strip()
|
||||
if not sent:
|
||||
continue
|
||||
lines = [line.strip() for line in sent.split('\n') if line.strip()]
|
||||
lines = [line.strip() for line in sent.split("\n") if line.strip()]
|
||||
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
|
||||
biluo_ents = iob_to_biluo(iob_ents)
|
||||
output_doc.append({'tokens': [
|
||||
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
|
||||
zip(words, tags, biluo_ents)
|
||||
]})
|
||||
output_docs.append({
|
||||
'id': len(output_docs),
|
||||
'paragraphs': [{'sentences': output_doc}]
|
||||
})
|
||||
output_doc.append(
|
||||
{
|
||||
"tokens": [
|
||||
{"orth": w, "tag": tag, "ner": ent}
|
||||
for (w, tag, ent) in zip(words, tags, biluo_ents)
|
||||
]
|
||||
}
|
||||
)
|
||||
output_docs.append(
|
||||
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
|
||||
)
|
||||
output_doc = []
|
||||
return output_docs
|
||||
|
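To make the target format concrete: the converters above assemble plain dicts in spaCy's training JSON layout. The snippet below is only a hand-written illustration of that shape (the tokens are invented), matching the "tokens" / "sentences" / "paragraphs" nesting built by read_conll_ner().

example_doc = {
    "id": 0,
    "paragraphs": [
        {
            "sentences": [
                {
                    "tokens": [
                        {"orth": "London", "tag": "NNP", "ner": "U-LOC"},
                        {"orth": "calling", "tag": "VBG", "ner": "O"},
                    ]
                }
            ]
        }
    ],
}
print(len(example_doc["paragraphs"][0]["sentences"]))  # 1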
|
|
@ -1,34 +1,27 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
import re
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
|
||||
"""
|
||||
Extract NER tags if available and convert them so that they follow
|
||||
BILUO and the Wikipedia scheme
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
# by @katarkor
|
||||
|
||||
docs = []
|
||||
sentences = []
|
||||
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
|
||||
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||
checked_for_ner = False
|
||||
has_ner_tags = False
|
||||
|
||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||
sentence, brackets = tokens[0]
|
||||
if not checked_for_ner:
|
||||
|
@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
|
|||
sentences.append(generate_sentence(sentence, has_ner_tags))
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
|
||||
if(len(sentences) % n_sents == 0):
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_doc(sentences, i)
|
||||
docs.append(doc)
|
||||
sentences = []
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
return docs
|
||||
|
||||
|
||||
def is_ner(tag):
|
||||
|
||||
"""
|
||||
Check the 10th column of the first token to determine if the file contains
|
||||
NER tags
|
||||
"""
|
||||
|
||||
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
|
||||
Check the 10th column of the first token to determine if the file contains
|
||||
NER tags
|
||||
"""
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
return True
|
||||
elif tag == "O":
|
||||
|
@ -67,29 +50,29 @@ def is_ner(tag):
|
|||
else:
|
||||
return False
|
||||
|
||||
def read_conllx(input_path, use_morphology=False, n=0):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
|
||||
def read_conllx(input_data, use_morphology=False, n=0):
|
||||
i = 0
|
||||
for sent in text.strip().split('\n\n'):
|
||||
lines = sent.strip().split('\n')
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith('#'):
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
|
||||
if '-' in id_ or '.' in id_:
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tag = pos if tag == '_' else tag
|
||||
tag = tag+'__'+morph if use_morphology else tag
|
||||
head = (int(head) - 1) if head != "0" else id_
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
tag = pos if tag == "_" else tag
|
||||
tag = tag + "__" + morph if use_morphology else tag
|
||||
tokens.append((id_, word, tag, head, dep, iob))
|
||||
except:
|
||||
except: # noqa: E722
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
|
@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
|||
if n >= 1 and i >= n:
|
||||
break
|
||||
|
||||
|
||||
def simplify_tags(iob):
|
||||
|
||||
"""
|
||||
Simplify tags obtained from the dataset in order to follow Wikipedia
|
||||
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
|
||||
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
|
||||
'MISC'.
|
||||
'MISC'.
|
||||
"""
|
||||
|
||||
new_iob = []
|
||||
for tag in iob:
|
||||
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
prefix = tag_match.group(1)
|
||||
suffix = tag_match.group(2)
|
||||
if suffix == 'GPE_LOC':
|
||||
suffix = 'LOC'
|
||||
elif suffix == 'GPE_ORG':
|
||||
suffix = 'ORG'
|
||||
elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG':
|
||||
suffix = 'MISC'
|
||||
tag = prefix + '-' + suffix
|
||||
if suffix == "GPE_LOC":
|
||||
suffix = "LOC"
|
||||
elif suffix == "GPE_ORG":
|
||||
suffix = "ORG"
|
||||
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
|
||||
suffix = "MISC"
|
||||
tag = prefix + "-" + suffix
|
||||
new_iob.append(tag)
|
||||
return new_iob
|
||||
|
||||
|
||||
def generate_sentence(sent, has_ner_tags):
|
||||
(id_, word, tag, head, dep, iob) = sent
|
||||
sentence = {}
|
||||
|
@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
|
|||
return sentence
|
||||
|
||||
|
||||
def create_doc(sentences,id):
|
||||
def create_doc(sentences, id):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id
|
||||
|
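The simplify_tags() change above maps dataset-specific entity suffixes onto the Wikipedia scheme (PER, LOC, ORG, MISC). A self-contained sketch of that mapping, assuming the same regex and suffix rules as the diff:

import re

def simplify(tag):
    # GPE_LOC -> LOC, GPE_ORG -> ORG, anything outside PER/LOC/ORG -> MISC
    match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
    if not match:
        return tag
    prefix, suffix = match.group(1), match.group(2)
    if suffix == "GPE_LOC":
        suffix = "LOC"
    elif suffix == "GPE_ORG":
        suffix = "ORG"
    elif suffix not in ("PER", "LOC", "ORG"):
        suffix = "MISC"
    return prefix + "-" + suffix

assert simplify("B-GPE_LOC") == "B-LOC"
assert simplify("U-EVENT") == "U-MISC"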
|
|
@ -1,65 +1,54 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||
|
||||
def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
|
||||
docs = []
|
||||
sentences = []
|
||||
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
|
||||
|
||||
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||
sentence, brackets = tokens[0]
|
||||
sentences.append(generate_sentence(sentence))
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
if(len(sentences) % n_sents == 0):
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_doc(sentences, i)
|
||||
docs.append(doc)
|
||||
sentences = []
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
return docs
|
||||
|
||||
|
||||
def read_conllx(input_path, use_morphology=False, n=0):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
def read_conllx(input_data, use_morphology=False, n=0):
|
||||
i = 0
|
||||
for sent in text.strip().split('\n\n'):
|
||||
lines = sent.strip().split('\n')
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith('#'):
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
|
||||
if '-' in id_ or '.' in id_:
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tag = pos if tag == '_' else tag
|
||||
tag = tag+'__'+morph if use_morphology else tag
|
||||
ner = ner if ner else 'O'
|
||||
head = (int(head) - 1) if head != "0" else id_
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
tag = pos if tag == "_" else tag
|
||||
tag = tag + "__" + morph if use_morphology else tag
|
||||
ner = ner if ner else "O"
|
||||
tokens.append((id_, word, tag, head, dep, ner))
|
||||
except:
|
||||
except: # noqa: E722
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
|
@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
|||
if n >= 1 and i >= n:
|
||||
break
|
||||
|
||||
|
||||
def generate_sentence(sent):
|
||||
(id_, word, tag, head, dep, ner) = sent
|
||||
sentence = {}
|
||||
|
@ -85,7 +75,7 @@ def generate_sentence(sent):
|
|||
return sentence
|
||||
|
||||
|
||||
def create_doc(sentences,id):
|
||||
def create_doc(sentences, id):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id
|
||||
|
|
|
@ -1,26 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from cytoolz import partition_all, concat
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from cytoolz import partition_all
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
||||
def iob2json(input_data, n_sents=10, *args, **kwargs):
|
||||
"""
|
||||
Convert IOB files into JSON format for use with train cli.
|
||||
"""
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
sentences = read_iob(file_)
|
||||
docs = merge_sentences(sentences, n_sents)
|
||||
output_filename = input_path.parts[-1].replace(".iob", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
docs = []
|
||||
for group in partition_all(n_sents, docs):
|
||||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first["paragraphs"][0]["sentences"]
|
||||
for sent in group[1:]:
|
||||
to_extend.extend(sent["paragraphs"][0]["sentences"])
|
||||
docs.append(first)
|
||||
return docs
|
||||
|
||||
|
||||
def read_iob(raw_sents):
|
||||
|
@ -28,30 +26,20 @@ def read_iob(raw_sents):
|
|||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split('|') for t in line.split()]
|
||||
tokens = [t.split("|") for t in line.split()]
|
||||
if len(tokens[0]) == 3:
|
||||
words, pos, iob = zip(*tokens)
|
||||
else:
|
||||
words, iob = zip(*tokens)
|
||||
pos = ['-'] * len(words)
|
||||
pos = ["-"] * len(words)
|
||||
biluo = iob_to_biluo(iob)
|
||||
sentences.append([
|
||||
{'orth': w, 'tag': p, 'ner': ent}
|
||||
for (w, p, ent) in zip(words, pos, biluo)
|
||||
])
|
||||
sentences = [{'tokens': sent} for sent in sentences]
|
||||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
||||
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
|
||||
sentences.append(
|
||||
[
|
||||
{"orth": w, "tag": p, "ner": ent}
|
||||
for (w, p, ent) in zip(words, pos, biluo)
|
||||
]
|
||||
)
|
||||
sentences = [{"tokens": sent} for sent in sentences]
|
||||
paragraphs = [{"sentences": [sent]} for sent in sentences]
|
||||
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
|
||||
return docs
|
||||
|
||||
def merge_sentences(docs, n_sents):
|
||||
counter = 0
|
||||
merged = []
|
||||
for group in partition_all(n_sents, docs):
|
||||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first['paragraphs'][0]['sentences']
|
||||
for sent in group[1:]:
|
||||
to_extend.extend(sent['paragraphs'][0]['sentences'])
|
||||
merged.append(first)
|
||||
return merged
|
||||
|
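As a reminder of the input handled above, read_iob() expects whitespace-separated tokens where each token is pipe-delimited as word|tag|iob (or just word|iob). The line below is an invented example of that parsing:

line = "I|PRP|O like|VBP|O London|NNP|B-GPE"
tokens = [t.split("|") for t in line.split()]
words, pos, iob = zip(*tokens)
print(words)  # ('I', 'like', 'London')
print(iob)    # ('O', 'O', 'B-GPE')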
|
|
@ -1,33 +1,21 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
import ujson as json
|
||||
|
||||
import ujson
|
||||
|
||||
from ...util import get_lang_class
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints, get_lang_class
|
||||
from ...gold import docs_to_json
|
||||
|
||||
|
||||
def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
|
||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
|
||||
if lang is None:
|
||||
prints(Messages.M054, exits=True)
|
||||
raise ValueError(Messages.M054)
|
||||
json_docs = []
|
||||
input_tuples = list(read_jsonl(input_path))
|
||||
input_tuples = [ujson.loads(line) for line in input_data]
|
||||
nlp = get_lang_class(lang)()
|
||||
for i, (raw_text, ents) in enumerate(input_tuples):
|
||||
doc = nlp.make_doc(raw_text)
|
||||
doc[0].is_sent_start = True
|
||||
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
|
||||
json_docs.append(docs_to_json(i, [doc]))
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".jsonl", ".json")
|
||||
output_loc = output_path / output_filename
|
||||
with (output_loc).open('w', encoding='utf8') as file_:
|
||||
file_.write(json_dumps(json_docs))
|
||||
prints(Messages.M033.format(n_docs=len(json_docs)),
|
||||
title=Messages.M032.format(name=path2str(output_loc)))
|
||||
|
||||
def read_jsonl(input_path):
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
for line in file_:
|
||||
yield json.loads(line)
|
||||
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
|
||||
json_docs.append(doc.to_json())
|
||||
return json_docs
|
||||
|
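For reference, the rewritten ner_jsonl2json() above consumes one JSON record per line in the shape [raw_text, {"entities": [[start, end, label], ...]}]. The record below is invented purely to show that layout (the converter itself uses ujson; stdlib json parses it the same way):

import json

line = '["I like London", {"entities": [[7, 13, "GPE"]]}]'
raw_text, ents = json.loads(line)
start, end, label = ents["entities"][0]
print(raw_text[start:end], label)  # London GPE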
|
398
spacy/cli/debug_data.py
Normal file
|
@ -0,0 +1,398 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import Printer, MESSAGES
|
||||
|
||||
from ..gold import GoldCorpus, read_json_object
|
||||
from ..util import load_model, get_lang_class, read_json, read_jsonl
|
||||
|
||||
# from .schemas import get_schema, validate_json
|
||||
from ._messages import Messages
|
||||
|
||||
|
||||
# Minimum number of expected occurrences of label in data to train new label
|
||||
NEW_LABEL_THRESHOLD = 50
|
||||
# Minimum number of expected examples to train a blank model
|
||||
BLANK_MODEL_MIN_THRESHOLD = 100
|
||||
BLANK_MODEL_THRESHOLD = 2000
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||
base_model=("name of model to update (optional)", "option", "b", str),
|
||||
pipeline=(
|
||||
"Comma-separated names of pipeline components to train",
|
||||
"option",
|
||||
"p",
|
||||
str,
|
||||
),
|
||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||
ignore_validation=(
|
||||
"Don't exit if JSON format validation fails",
|
||||
"flag",
|
||||
"IV",
|
||||
bool,
|
||||
),
|
||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||
)
|
||||
def debug_data(
|
||||
lang,
|
||||
train_path,
|
||||
dev_path,
|
||||
base_model=None,
|
||||
pipeline="tagger,parser,ner",
|
||||
ignore_warnings=False,
|
||||
ignore_validation=False,
|
||||
verbose=False,
|
||||
no_format=False,
|
||||
):
|
||||
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
|
||||
|
||||
# Make sure all files and paths exists if they are needed
|
||||
if not train_path.exists():
|
||||
msg.fail(Messages.M050, train_path, exits=1)
|
||||
if not dev_path.exists():
|
||||
msg.fail(Messages.M051, dev_path, exits=1)
|
||||
|
||||
# Initialize the model and pipeline
|
||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||
if base_model:
|
||||
nlp = load_model(base_model)
|
||||
else:
|
||||
lang_cls = get_lang_class(lang)
|
||||
nlp = lang_cls()
|
||||
|
||||
msg.divider("Data format validation")
|
||||
# Load the data in one go – might take a while but okay in this case
|
||||
with msg.loading("Loading {}...".format(train_path.parts[-1])):
|
||||
train_data = _load_file(train_path, msg)
|
||||
with msg.loading("Loading {}...".format(dev_path.parts[-1])):
|
||||
dev_data = _load_file(dev_path, msg)
|
||||
|
||||
# Validate data format using the JSON schema
|
||||
# TODO: update once the new format is ready
|
||||
# schema = get_schema("training")
|
||||
train_data_errors = [] # TODO: validate_json(train_data, schema)
|
||||
dev_data_errors = [] # TODO: validate_json(dev_data, schema)
|
||||
if not train_data_errors:
|
||||
msg.good("Training data JSON format is valid")
|
||||
if not dev_data_errors:
|
||||
msg.good("Development data JSON format is valid")
|
||||
for error in train_data_errors:
|
||||
msg.fail("Training data: {}".format(error))
|
||||
for error in dev_data_errors:
|
||||
msg.fail("Develoment data: {}".format(error))
|
||||
if (train_data_errors or dev_data_errors) and not ignore_validation:
|
||||
sys.exit(1)
|
||||
|
||||
# Create the gold corpus to be able to better analyze data
|
||||
with msg.loading("Analyzing corpus..."):
|
||||
train_data = read_json_object(train_data)
|
||||
dev_data = read_json_object(dev_data)
|
||||
corpus = GoldCorpus(train_data, dev_data)
|
||||
train_docs = list(corpus.train_docs(nlp))
|
||||
dev_docs = list(corpus.dev_docs(nlp))
|
||||
msg.good("Corpus is loadable")
|
||||
|
||||
# Create all gold data here to avoid iterating over the train_docs constantly
|
||||
gold_data = _compile_gold(train_docs, pipeline)
|
||||
train_texts = gold_data["texts"]
|
||||
dev_texts = set([doc.text for doc, gold in dev_docs])
|
||||
|
||||
msg.divider("Training stats")
|
||||
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
|
||||
for pipe in [p for p in pipeline if p not in nlp.factories]:
|
||||
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
|
||||
if base_model:
|
||||
msg.text("Starting with base model '{}'".format(base_model))
|
||||
else:
|
||||
msg.text("Starting with blank model '{}'".format(lang))
|
||||
msg.text("{} training docs".format(len(train_docs)))
|
||||
msg.text("{} evaluation docs".format(len(dev_docs)))
|
||||
|
||||
overlap = len(train_texts.intersection(dev_texts))
|
||||
if overlap:
|
||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||
else:
|
||||
msg.good("No overlap between training and evaluation data")
|
||||
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
|
||||
text = "Low number of examples to train from a blank model ({})".format(
|
||||
len(train_docs)
|
||||
)
|
||||
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
msg.fail(text)
|
||||
else:
|
||||
msg.warn(text)
|
||||
msg.text(
|
||||
"It's recommended to use at least {} examples (minimum {})".format(
|
||||
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
|
||||
),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
msg.divider("Vocab & Vectors")
|
||||
n_words = gold_data["n_words"]
|
||||
msg.info(
|
||||
"{} total {} in the data ({} unique)".format(
|
||||
n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
|
||||
)
|
||||
)
|
||||
most_common_words = gold_data["words"].most_common(10)
|
||||
msg.text(
|
||||
"10 most common words: {}".format(
|
||||
_format_labels(most_common_words, counts=True)
|
||||
),
|
||||
show=verbose,
|
||||
)
|
||||
if len(nlp.vocab.vectors):
|
||||
msg.info(
|
||||
"{} vectors ({} unique keys, {} dimensions)".format(
|
||||
len(nlp.vocab.vectors),
|
||||
nlp.vocab.vectors.n_keys,
|
||||
nlp.vocab.vectors_length,
|
||||
)
|
||||
)
|
||||
else:
|
||||
msg.info("No word vectors present in the model")
|
||||
|
||||
if "ner" in pipeline:
|
||||
# Get all unique NER labels present in the data
|
||||
labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
|
||||
label_counts = gold_data["ner"]
|
||||
model_labels = _get_labels_from_model(nlp, "ner")
|
||||
new_labels = [l for l in labels if l not in model_labels]
|
||||
existing_labels = [l for l in labels if l in model_labels]
|
||||
has_low_data_warning = False
|
||||
has_no_neg_warning = False
|
||||
|
||||
msg.divider("Named Entity Recognition")
|
||||
msg.info(
|
||||
"{} new {}, {} existing {}".format(
|
||||
len(new_labels),
|
||||
"label" if len(new_labels) == 1 else "labels",
|
||||
len(existing_labels),
|
||||
"label" if len(existing_labels) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
missing_values = label_counts["-"]
|
||||
msg.text(
|
||||
"{} missing {} (tokens with '-' label)".format(
|
||||
missing_values, "value" if missing_values == 1 else "values"
|
||||
)
|
||||
)
|
||||
if new_labels:
|
||||
labels_with_counts = [
|
||||
(label, count)
|
||||
for label, count in label_counts.most_common()
|
||||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
|
||||
for label in new_labels:
|
||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for new label '{}' ({})".format(
|
||||
label, label_counts[label]
|
||||
)
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
with msg.loading("Analyzing label distribution..."):
|
||||
neg_docs = _get_examples_without_label(train_docs, label)
|
||||
if neg_docs == 0:
|
||||
msg.warn(
|
||||
"No examples for texts WITHOUT new label '{}'".format(label)
|
||||
)
|
||||
has_no_neg_warning = True
|
||||
|
||||
if not has_low_data_warning:
|
||||
msg.good("Good amount of examples for all labels")
|
||||
if not has_no_neg_warning:
|
||||
msg.good("Examples without occurences available for all labels")
|
||||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a new entity type, your data should include at "
|
||||
"least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
|
||||
show=verbose,
|
||||
)
|
||||
if has_no_neg_warning:
|
||||
msg.text(
|
||||
"Training data should always include examples of entities "
|
||||
"in context, as well as examples without a given entity "
|
||||
"type.",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
if "textcat" in pipeline:
|
||||
msg.divider("Text Classification")
|
||||
labels = [label for label in gold_data["textcat"]]
|
||||
model_labels = _get_labels_from_model(nlp, "textcat")
|
||||
new_labels = [l for l in labels if l not in model_labels]
|
||||
existing_labels = [l for l in labels if l in model_labels]
|
||||
msg.info(
|
||||
"Text Classification: {} new label(s), {} existing label(s)".format(
|
||||
len(new_labels), len(existing_labels)
|
||||
)
|
||||
)
|
||||
if new_labels:
|
||||
labels_with_counts = _format_labels(
|
||||
gold_data["textcat"].most_common(), counts=True
|
||||
)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
|
||||
if "tagger" in pipeline:
|
||||
msg.divider("Part-of-speech Tagging")
|
||||
labels = [label for label in gold_data["tags"]]
|
||||
tag_map = nlp.Defaults.tag_map
|
||||
msg.info(
|
||||
"{} {} in data ({} {} in tag map)".format(
|
||||
len(labels),
|
||||
"label" if len(labels) == 1 else "labels",
|
||||
len(tag_map),
|
||||
"label" if len(tag_map) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
labels_with_counts = _format_labels(
|
||||
gold_data["tags"].most_common(), counts=True
|
||||
)
|
||||
msg.text(labels_with_counts, show=verbose)
|
||||
non_tagmap = [l for l in labels if l not in tag_map]
|
||||
if not non_tagmap:
|
||||
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
|
||||
for label in non_tagmap:
|
||||
msg.fail(
|
||||
"Label '{}' not found in tag map for language '{}'".format(
|
||||
label, nlp.lang
|
||||
)
|
||||
)
|
||||
|
||||
if "parser" in pipeline:
|
||||
msg.divider("Dependency Parsing")
|
||||
labels = [label for label in gold_data["deps"]]
|
||||
msg.info(
|
||||
"{} {} in data".format(
|
||||
len(labels), "label" if len(labels) == 1 else "labels"
|
||||
)
|
||||
)
|
||||
labels_with_counts = _format_labels(
|
||||
gold_data["deps"].most_common(), counts=True
|
||||
)
|
||||
msg.text(labels_with_counts, show=verbose)
|
||||
|
||||
msg.divider("Summary")
|
||||
good_counts = msg.counts[MESSAGES.GOOD]
|
||||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
fail_counts = msg.counts[MESSAGES.FAIL]
|
||||
if good_counts:
|
||||
msg.good(
|
||||
"{} {} passed".format(
|
||||
good_counts, "check" if good_counts == 1 else "checks"
|
||||
)
|
||||
)
|
||||
if warn_counts:
|
||||
msg.warn(
|
||||
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
|
||||
)
|
||||
if fail_counts:
|
||||
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
|
||||
|
||||
if fail_counts:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _load_file(file_path, msg):
|
||||
file_name = file_path.parts[-1]
|
||||
if file_path.suffix == ".json":
|
||||
data = read_json(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
return data
|
||||
elif file_path.suffix == ".jsonl":
|
||||
data = read_jsonl(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
return data
|
||||
msg.fail(
|
||||
"Can't load file extension {}".format(file_path.suffix),
|
||||
"Expected .json or .jsonl",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def _compile_gold(train_docs, pipeline):
|
||||
data = {
|
||||
"ner": Counter(),
|
||||
"cats": Counter(),
|
||||
"tags": Counter(),
|
||||
"deps": Counter(),
|
||||
"words": Counter(),
|
||||
"n_words": 0,
|
||||
"texts": set(),
|
||||
}
|
||||
for doc, gold in train_docs:
|
||||
data["words"].update(gold.words)
|
||||
data["n_words"] += len(gold.words)
|
||||
data["texts"].add(doc.text)
|
||||
if "ner" in pipeline:
|
||||
for label in gold.ner:
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
data["ner"][combined_label] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
if "textcat" in pipeline:
|
||||
data["cats"].update(gold.cats)
|
||||
if "tagger" in pipeline:
|
||||
data["tags"].update(gold.tags)
|
||||
if "parser" in pipeline:
|
||||
data["deps"].update(gold.labels)
|
||||
return data
|
||||
|
||||
|
||||
def _format_labels(labels, counts=False):
|
||||
if counts:
|
||||
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
||||
return ", ".join(["'{}'".format(l) for l in labels])
|
||||
|
||||
|
||||
def _get_ner_counts(data):
|
||||
counter = Counter()
|
||||
for doc, gold in data:
|
||||
for label in gold.ner:
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
counter[combined_label] += 1
|
||||
elif label == "-":
|
||||
counter["-"] += 1
|
||||
return counter
|
||||
|
||||
|
||||
def _get_examples_without_label(data, label):
|
||||
count = 0
|
||||
for doc, gold in data:
|
||||
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
|
||||
if label not in labels:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def _get_labels_from_model(nlp, pipe_name):
|
||||
if pipe_name not in nlp.pipe_names:
|
||||
return set()
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
return pipe.labels
|
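A toy version (labels invented) of the per-entity bookkeeping that _compile_gold() above performs: B- and U- prefixes mark the start of an entity, so only those are counted per type, and "-" tracks tokens with missing annotations.

from collections import Counter

ner_tags = ["B-PERSON", "I-PERSON", "U-GPE", "O", "-", "U-GPE"]
counts = Counter()
for label in ner_tags:
    if label.startswith(("B-", "U-")):
        counts[label.split("-")[1]] += 1
    elif label == "-":
        counts["-"] += 1
print(counts)  # e.g. Counter({'GPE': 2, 'PERSON': 1, '-': 1})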
|
@ -6,34 +6,37 @@ import requests
|
|||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from .link import link
|
||||
from ..util import prints, get_package_path
|
||||
from ..util import get_package_path
|
||||
from .. import about
|
||||
|
||||
|
||||
msg = Printer()
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("model to download, shortcut or name)", "positional", None, str),
|
||||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool),
|
||||
pip_args=("additional arguments to be passed to `pip install` when "
|
||||
"installing the model"))
|
||||
model=("Model to download (shortcut or name)", "positional", None, str),
|
||||
direct=("Force direct download of name + version", "flag", "d", bool),
|
||||
pip_args=("additional arguments to be passed to `pip install` on model install"),
|
||||
)
|
||||
def download(model, direct=False, *pip_args):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version.
|
||||
with version. For direct downloads, the compatibility check will be skipped.
|
||||
"""
|
||||
if direct:
|
||||
dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
|
||||
dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
|
||||
else:
|
||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||
model_name = shortcuts.get(model, model)
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
|
||||
.format(m=model_name, v=version), pip_args)
|
||||
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
|
||||
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
if dl != 0: # if download subprocess doesn't return 0, exit
|
||||
sys.exit(dl)
|
||||
try:
|
||||
|
@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
|
|||
# subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(model_name, model, force=True, model_path=package_path)
|
||||
except:
|
||||
except: # noqa: E722
|
||||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
prints(Messages.M001, title=Messages.M002.format(name=model_name))
|
||||
msg.warn(Messages.M002.format(name=model_name), Messages.M001)
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
prints(Messages.M004.format(desc=desc, version=about.__version__),
|
||||
title=Messages.M003.format(code=r.status_code), exits=1)
|
||||
msg.fail(
|
||||
Messages.M003.format(code=r.status_code),
|
||||
Messages.M004.format(desc=desc, version=about.__version__),
|
||||
exits=1,
|
||||
)
|
||||
return r.json()
|
||||
|
||||
|
||||
def get_compatibility():
|
||||
version = about.__version__
|
||||
version = version.rsplit('.dev', 1)[0]
|
||||
version = version.rsplit(".dev", 1)[0]
|
||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||
comp = comp_table['spacy']
|
||||
comp = comp_table["spacy"]
|
||||
if version not in comp:
|
||||
prints(Messages.M006.format(version=version), title=Messages.M005,
|
||||
exits=1)
|
||||
msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
def get_version(model, comp):
|
||||
model = model.rsplit('.dev', 1)[0]
|
||||
model = model.rsplit(".dev", 1)[0]
|
||||
if model not in comp:
|
||||
prints(Messages.M007.format(name=model, version=about.__version__),
|
||||
title=Messages.M005, exits=1)
|
||||
msg.fail(
|
||||
Messages.M005,
|
||||
Messages.M007.format(name=model, version=about.__version__),
|
||||
exits=1,
|
||||
)
|
||||
return comp[model][0]
|
||||
|
||||
|
||||
def download_model(filename, user_pip_args=None):
|
||||
download_url = about.__download_url__ + '/' + filename
|
||||
pip_args = ['--no-cache-dir', '--no-deps']
|
||||
download_url = about.__download_url__ + "/" + filename
|
||||
pip_args = ["--no-cache-dir", "--no-deps"]
|
||||
if user_pip_args:
|
||||
pip_args.extend(user_pip_args)
|
||||
cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url]
|
||||
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
|
||||
return subprocess.call(cmd, env=os.environ.copy())
|
||||
|
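Putting the pieces of download_model() above together: it shells out to pip with --no-cache-dir --no-deps and a release URL built from the filename template. A hedged sketch of the command that gets assembled, where the model name and version are hypothetical:

import sys

download_url = (
    "https://github.com/explosion/spacy-models/releases/download/"
    "en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0"
)
cmd = [sys.executable, "-m", "pip", "install", "--no-cache-dir", "--no-deps", download_url]
print(" ".join(cmd))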
|
|
@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function
|
|||
|
||||
import plac
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("model name or path", "positional", None, str),
|
||||
data_path=("location of JSON-formatted evaluation data", "positional",
|
||||
None, str),
|
||||
gold_preproc=("use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("use GPU", "option", "g", int),
|
||||
displacy_path=("directory to output rendered parses as HTML", "option",
|
||||
"dp", str),
|
||||
displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
|
||||
def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None,
|
||||
displacy_limit=25):
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
|
||||
)
|
||||
def evaluate(
|
||||
model,
|
||||
data_path,
|
||||
gpu_id=-1,
|
||||
gold_preproc=False,
|
||||
displacy_path=None,
|
||||
displacy_limit=25,
|
||||
):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
output directory as the displacy_path argument.
|
||||
"""
|
||||
|
||||
msg = Printer()
|
||||
util.fix_random_seed()
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
|
@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
|||
data_path = util.ensure_path(data_path)
|
||||
displacy_path = util.ensure_path(displacy_path)
|
||||
if not data_path.exists():
|
||||
prints(data_path, title=Messages.M034, exits=1)
|
||||
msg.fail(Messages.M034, data_path, exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
prints(displacy_path, title=Messages.M035, exits=1)
|
||||
msg.fail(Messages.M035, displacy_path, exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
|
@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
|||
scorer = nlp.evaluate(dev_docs, verbose=False)
|
||||
end = timer()
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
print_results(scorer, time=end - begin, words=nwords,
|
||||
wps=nwords / (end - begin))
|
||||
results = {
|
||||
"Time": "%.2f s" % end - begin,
|
||||
"Words": nwords,
|
||||
"Words/s": "%.0f" % nwords / (end - begin),
|
||||
"TOK": "%.2f" % scorer.token_acc,
|
||||
"POS": "%.2f" % scorer.tags_acc,
|
||||
"UAS": "%.2f" % scorer.uas,
|
||||
"LAS": "%.2f" % scorer.las,
|
||||
"NER P": "%.2f" % scorer.ents_p,
|
||||
"NER R": "%.2f" % scorer.ents_r,
|
||||
"NER F": "%.2f" % scorer.ents_f,
|
||||
}
|
||||
msg.table(results, title="Results")
|
||||
|
||||
if displacy_path:
|
||||
docs, golds = zip(*dev_docs)
|
||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||
render_parses(docs, displacy_path, model_name=model,
|
||||
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||
prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
|
||||
render_deps = "parser" in nlp.meta.get("pipeline", [])
|
||||
render_ents = "ner" in nlp.meta.get("pipeline", [])
|
||||
render_parses(
|
||||
docs,
|
||||
displacy_path,
|
||||
model_name=model,
|
||||
limit=displacy_limit,
|
||||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
)
|
||||
msg.good(Messages.M036.format(n=displacy_limit), displacy_path)
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
||||
ents=True):
|
||||
docs[0].user_data['title'] = model_name
|
||||
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
|
||||
docs[0].user_data["title"] = model_name
|
||||
if ents:
|
||||
with (output_path / 'entities.html').open('w') as file_:
|
||||
html = displacy.render(docs[:limit], style='ent', page=True)
|
||||
with (output_path / "entities.html").open("w") as file_:
|
||||
html = displacy.render(docs[:limit], style="ent", page=True)
|
||||
file_.write(html)
|
||||
if deps:
|
||||
with (output_path / 'parses.html').open('w') as file_:
|
||||
html = displacy.render(docs[:limit], style='dep', page=True,
|
||||
options={'compact': True})
|
||||
with (output_path / "parses.html").open("w") as file_:
|
||||
html = displacy.render(
|
||||
docs[:limit], style="dep", page=True, options={"compact": True}
|
||||
)
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, wps=0.0):
|
||||
scores = {}
|
||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
||||
'ents_p', 'ents_r', 'ents_f', 'wps']:
|
||||
for col in [
|
||||
"dep_loss",
|
||||
"tag_loss",
|
||||
"uas",
|
||||
"tags_acc",
|
||||
"token_acc",
|
||||
"ents_p",
|
||||
"ents_r",
|
||||
"ents_f",
|
||||
"wps",
|
||||
]:
|
||||
scores[col] = 0.0
|
||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
||||
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||
scores.update(dev_scores)
|
||||
scores['wps'] = wps
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.3f}',
|
||||
'{ner_loss:.3f}',
|
||||
'{uas:.3f}',
|
||||
'{ents_p:.3f}',
|
||||
'{ents_r:.3f}',
|
||||
'{ents_f:.3f}',
|
||||
'{tags_acc:.3f}',
|
||||
'{token_acc:.3f}',
|
||||
'{wps:.1f}'))
|
||||
scores["wps"] = wps
|
||||
tpl = "\t".join(
|
||||
(
|
||||
"{:d}",
|
||||
"{dep_loss:.3f}",
|
||||
"{ner_loss:.3f}",
|
||||
"{uas:.3f}",
|
||||
"{ents_p:.3f}",
|
||||
"{ents_r:.3f}",
|
||||
"{ents_f:.3f}",
|
||||
"{tags_acc:.3f}",
|
||||
"{token_acc:.3f}",
|
||||
"{wps:.1f}",
|
||||
)
|
||||
)
|
||||
print(tpl.format(itn, **scores))
|
||||
|
||||
|
||||
def print_results(scorer, time, words, wps):
|
||||
results = {
|
||||
'Time': '%.2f s' % time,
|
||||
'Words': words,
|
||||
'Words/s': '%.0f' % wps,
|
||||
'TOK': '%.2f' % scorer.token_acc,
|
||||
'POS': '%.2f' % scorer.tags_acc,
|
||||
'UAS': '%.2f' % scorer.uas,
|
||||
'LAS': '%.2f' % scorer.las,
|
||||
'NER P': '%.2f' % scorer.ents_p,
|
||||
'NER R': '%.2f' % scorer.ents_r,
|
||||
'NER F': '%.2f' % scorer.ents_f}
|
||||
util.print_table(results, title="Results")
|
||||
|
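One detail worth calling out in the results dict above: the "%" string-formatting operator binds tighter than "-" and "/", so the elapsed time and words-per-second expressions are wrapped in their own parentheses; without them Python would try to subtract a float from a formatted string. A minimal demonstration with made-up numbers:

end, begin, nwords = 12.5, 10.0, 3000
print("%.2f s" % (end - begin))           # 2.50 s
print("%.0f" % (nwords / (end - begin)))  # 1200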
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str
|
||||
|
@ -12,56 +13,65 @@ from .. import about
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
model=("optional: shortcut link of model", "positional", None, str),
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str),
|
||||
silent=("don't print anything (just return)", "flag", "s"))
|
||||
model=("Optional shortcut link of model", "positional", None, str),
|
||||
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
|
||||
silent=("Don't print anything (just return)", "flag", "s"),
|
||||
)
|
||||
def info(model=None, markdown=False, silent=False):
|
||||
"""Print info about spaCy installation. If a model shortcut link is
|
||||
"""
|
||||
Print info about spaCy installation. If a model shortcut link is
|
||||
specified as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
"""
|
||||
msg = Printer()
|
||||
if model:
|
||||
if util.is_package(model):
|
||||
model_path = util.get_package_path(model)
|
||||
else:
|
||||
model_path = util.get_data_path() / model
|
||||
meta_path = model_path / 'meta.json'
|
||||
meta_path = model_path / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
util.prints(meta_path, title=Messages.M020, exits=1)
|
||||
msg.fail(Messages.M020, meta_path, exits=1)
|
||||
meta = util.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta['link'] = path2str(model_path)
|
||||
meta['source'] = path2str(model_path.resolve())
|
||||
meta["link"] = path2str(model_path)
|
||||
meta["source"] = path2str(model_path.resolve())
|
||||
else:
|
||||
meta['source'] = path2str(model_path)
|
||||
meta["source"] = path2str(model_path)
|
||||
if not silent:
|
||||
print_info(meta, 'model %s' % model, markdown)
|
||||
title = "Info about model '{}'".format(model)
|
||||
model_meta = {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
||||
}
|
||||
if markdown:
|
||||
util.print_markdown(model_meta, title=title)
|
||||
else:
|
||||
msg.table(model_meta, title=title)
|
||||
return meta
|
||||
data = {'spaCy version': about.__version__,
|
||||
'Location': path2str(Path(__file__).parent.parent),
|
||||
'Platform': platform.platform(),
|
||||
'Python version': platform.python_version(),
|
||||
'Models': list_models()}
|
||||
data = {
|
||||
"spaCy version": about.__version__,
|
||||
"Location": path2str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": list_models(),
|
||||
}
|
||||
if not silent:
|
||||
print_info(data, 'spaCy', markdown)
|
||||
title = "Info about spaCy"
|
||||
if markdown:
|
||||
util.print_markdown(data, title=title)
|
||||
else:
|
||||
msg.table(data, title=title)
|
||||
return data
|
||||
|
||||
|
||||
def print_info(data, title, markdown):
|
||||
title = 'Info about %s' % title
|
||||
if markdown:
|
||||
util.print_markdown(data, title=title)
|
||||
else:
|
||||
util.print_table(data, title=title)
|
||||
|
||||
|
||||
def list_models():
|
||||
def exclude_dir(dir_name):
|
||||
# exclude common cache directories and hidden directories
|
||||
exclude = ['cache', 'pycache', '__pycache__']
|
||||
return dir_name in exclude or dir_name.startswith('.')
|
||||
exclude = ("cache", "pycache", "__pycache__")
|
||||
return dir_name in exclude or dir_name.startswith(".")
|
||||
|
||||
data_path = util.get_data_path()
|
||||
if data_path:
|
||||
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
||||
return ', '.join([m for m in models if not exclude_dir(m)])
|
||||
return '-'
|
||||
return ", ".join([m for m in models if not exclude_dir(m)])
|
||||
return "-"
|
||||
|
|
|
@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
|
|||
import tarfile
|
||||
import gzip
|
||||
import zipfile
|
||||
import ujson as json
|
||||
from spacy.lexeme import intify_attrs
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Errors, Warnings, user_warning
|
||||
from ..util import prints, ensure_path, get_lang_class
|
||||
from ..util import ensure_path, get_lang_class, read_jsonl
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
|
@ -25,121 +24,133 @@ except ImportError:
|
|||
ftfy = None
|
||||
|
||||
|
||||
msg = Printer()
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("model output directory", "positional", None, Path),
|
||||
freqs_loc=("location of words frequencies file", "option", "f", Path),
|
||||
jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path),
|
||||
clusters_loc=("optional: location of brown clusters data",
|
||||
"option", "c", str),
|
||||
vectors_loc=("optional: location of vectors file in Word2Vec format "
|
||||
"(either as .txt or zipped as .zip or .tar.gz)", "option",
|
||||
"v", str),
|
||||
prune_vectors=("optional: number of vectors to prune to",
|
||||
"option", "V", int)
|
||||
lang=("Model language", "positional", None, str),
|
||||
output_dir=("Model output directory", "positional", None, Path),
|
||||
freqs_loc=("Location of words frequencies file", "option", "f", Path),
|
||||
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||
vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
|
||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||
)
|
||||
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None,
|
||||
vectors_loc=None, prune_vectors=-1):
|
||||
def init_model(
|
||||
lang,
|
||||
output_dir,
|
||||
freqs_loc=None,
|
||||
clusters_loc=None,
|
||||
jsonl_loc=None,
|
||||
vectors_loc=None,
|
||||
prune_vectors=-1,
|
||||
):
|
||||
"""
|
||||
Create a new model from raw data, like word frequencies, Brown clusters
|
||||
and word vectors.
|
||||
and word vectors. If vectors are provided in Word2Vec format, they can
|
||||
be either a .txt or zipped as a .zip or .tar.gz.
|
||||
"""
|
||||
if jsonl_loc is not None:
|
||||
if freqs_loc is not None or clusters_loc is not None:
|
||||
settings = ['-j']
|
||||
settings = ["-j"]
|
||||
if freqs_loc:
|
||||
settings.append('-f')
|
||||
settings.append("-f")
|
||||
if clusters_loc:
|
||||
settings.append('-c')
|
||||
prints(' '.join(settings),
|
||||
title=(
|
||||
"The -f and -c arguments are deprecated, and not compatible "
|
||||
"with the -j argument, which should specify the same information. "
|
||||
"Either merge the frequencies and clusters data into the "
|
||||
"jsonl-formatted file (recommended), or use only the -f and "
|
||||
"-c files, without the other lexical attributes."))
|
||||
settings.append("-c")
|
||||
msg.warn(Messages.M063, Messages.M064)
|
||||
jsonl_loc = ensure_path(jsonl_loc)
|
||||
lex_attrs = (json.loads(line) for line in jsonl_loc.open())
|
||||
lex_attrs = read_jsonl(jsonl_loc)
|
||||
else:
|
||||
clusters_loc = ensure_path(clusters_loc)
|
||||
freqs_loc = ensure_path(freqs_loc)
|
||||
if freqs_loc is not None and not freqs_loc.exists():
|
||||
prints(freqs_loc, title=Messages.M037, exits=1)
|
||||
msg.fail(Messages.M037, freqs_loc, exits=1)
|
||||
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
||||
|
||||
nlp = create_model(lang, lex_attrs)
|
||||
with msg.loading("Creating model..."):
|
||||
nlp = create_model(lang, lex_attrs)
|
||||
msg.good("Successfully created model")
|
||||
if vectors_loc is not None:
|
||||
add_vectors(nlp, vectors_loc, prune_vectors)
|
||||
vec_added = len(nlp.vocab.vectors)
|
||||
lex_added = len(nlp.vocab)
|
||||
prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
|
||||
title=Messages.M038)
|
||||
msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
return nlp
|
||||
|
||||
|
||||
def open_file(loc):
|
||||
'''Handle .gz, .tar.gz or unzipped files'''
|
||||
"""Handle .gz, .tar.gz or unzipped files"""
|
||||
loc = ensure_path(loc)
|
||||
print("Open loc")
|
||||
if tarfile.is_tarfile(str(loc)):
|
||||
return tarfile.open(str(loc), 'r:gz')
|
||||
elif loc.parts[-1].endswith('gz'):
|
||||
return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
|
||||
elif loc.parts[-1].endswith('zip'):
|
||||
return tarfile.open(str(loc), "r:gz")
|
||||
elif loc.parts[-1].endswith("gz"):
|
||||
return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
|
||||
elif loc.parts[-1].endswith("zip"):
|
||||
zip_file = zipfile.ZipFile(str(loc))
|
||||
names = zip_file.namelist()
|
||||
file_ = zip_file.open(names[0])
|
||||
return (line.decode('utf8') for line in file_)
|
||||
return (line.decode("utf8") for line in file_)
|
||||
else:
|
||||
return loc.open('r', encoding='utf8')
|
||||
return loc.open("r", encoding="utf8")
|
||||
|
||||
|
||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
||||
with msg.loading("Counting frequencies..."):
|
||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
||||
msg.good("Counted frequencies")
|
||||
with msg.loading("Reading clusters..."):
|
||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
||||
msg.good("Read clusters")
|
||||
lex_attrs = []
|
||||
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
||||
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
||||
attrs = {'orth': word, 'id': i, 'prob': prob}
|
||||
attrs = {"orth": word, "id": i, "prob": prob}
|
||||
# Decode as a little-endian string, so that we can do & 15 to get
|
||||
# the first 4 bits. See _parse_features.pyx
|
||||
if word in clusters:
|
||||
attrs['cluster'] = int(clusters[word][::-1], 2)
|
||||
attrs["cluster"] = int(clusters[word][::-1], 2)
|
||||
else:
|
||||
attrs['cluster'] = 0
|
||||
attrs["cluster"] = 0
|
||||
lex_attrs.append(attrs)
|
||||
return lex_attrs
|
||||
|
||||
|
||||
def create_model(lang, lex_attrs):
|
||||
print("Creating model...")
|
||||
lang_class = get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
for lexeme in nlp.vocab:
|
||||
lexeme.rank = 0
|
||||
lex_added = 0
|
||||
for attrs in lex_attrs:
|
||||
if 'settings' in attrs:
|
||||
if "settings" in attrs:
|
||||
continue
|
||||
lexeme = nlp.vocab[attrs['orth']]
|
||||
lexeme = nlp.vocab[attrs["orth"]]
|
||||
lexeme.set_attrs(**attrs)
|
||||
lexeme.is_oov = False
|
||||
lex_added += 1
|
||||
lex_added += 1
|
||||
oov_prob = min(lex.prob for lex in nlp.vocab)
|
||||
nlp.vocab.cfg.update({'oov_prob': oov_prob-1})
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
|
||||
return nlp
|
||||
|
||||
|
||||
def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith('.npz'):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb')))
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||
for lex in nlp.vocab:
|
||||
if lex.rank:
|
||||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||
else:
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
|
||||
if vectors_loc:
|
||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None:
|
||||
for word in vector_keys:
|
||||
if word not in nlp.vocab:
|
||||
|
@@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
|
|||
lexeme.is_oov = False
|
||||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang']
|
||||
nlp.meta['vectors']['name'] = nlp.vocab.vectors.name
|
||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
if prune_vectors >= 1:
|
||||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
|
||||
|
||||
def read_vectors(vectors_loc):
|
||||
print("Reading vectors from %s" % vectors_loc)
|
||||
f = open_file(vectors_loc)
|
||||
shape = tuple(int(size) for size in next(f).split())
|
||||
vectors_data = numpy.zeros(shape=shape, dtype='f')
|
||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||
vectors_keys = []
|
||||
for i, line in enumerate(tqdm(f)):
|
||||
line = line.rstrip()
|
||||
pieces = line.rsplit(' ', vectors_data.shape[1]+1)
|
||||
pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
|
||||
word = pieces.pop(0)
|
||||
if len(pieces) != vectors_data.shape[1]:
|
||||
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
|
||||
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
||||
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||
vectors_keys.append(word)
|
||||
return vectors_data, vectors_keys
|
||||
|
||||
|
||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||
print("Counting frequencies...")
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
with freqs_loc.open() as f:
|
||||
for i, line in enumerate(f):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||
freq = int(freq)
|
||||
counts.inc(i + 1, freq)
|
||||
total += freq
|
||||
|
@@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
probs = {}
|
||||
with freqs_loc.open() as f:
|
||||
for line in tqdm(f):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||
|
@@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
|
||||
|
||||
def read_clusters(clusters_loc):
|
||||
print("Reading clusters...")
|
||||
clusters = {}
|
||||
if ftfy is None:
|
||||
user_warning(Warnings.W004)
|
||||
|
@@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
|
|||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
else:
|
||||
clusters[word] = '0'
|
||||
clusters[word] = "0"
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
|
|
|
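For reference, the JSONL attributes file passed to init_model via -j holds one lexeme per line, using the same keys that read_attrs_from_deprecated builds above ("orth", "id", "prob", "cluster"). A hedged sketch of producing such a file; the path, values and the CLI invocation in the final comment are illustrative assumptions:

# Illustrative only: write a JSONL lexical-attributes file of the shape
# init_model expects for its -j/--jsonl-loc option. Values are invented.
import json

lex_attrs = [
    {"orth": "the", "id": 0, "prob": -3.5, "cluster": 0},
    {"orth": "spacy", "id": 1, "prob": -12.1, "cluster": 0},
]
with open("/tmp/lex_attrs.jsonl", "w", encoding="utf8") as f:
    for attrs in lex_attrs:
        f.write(json.dumps(attrs) + "\n")
# Assumed CLI usage: python -m spacy init-model en /tmp/model -j /tmp/lex_attrs.jsonl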
@@ -3,51 +3,54 @@ from __future__ import unicode_literals
|
|||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import symlink_to, path2str
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool))
|
||||
force=("force overwriting of existing link", "flag", "f", bool),
|
||||
)
|
||||
def link(origin, link_name, force=False, model_path=None):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
msg = Printer()
|
||||
if util.is_package(origin):
|
||||
model_path = util.get_package_path(origin)
|
||||
else:
|
||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||
if not model_path.exists():
|
||||
prints(Messages.M009.format(path=path2str(model_path)),
|
||||
title=Messages.M008, exits=1)
|
||||
msg.fail(
|
||||
Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
|
||||
)
|
||||
data_path = util.get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
spacy_loc = Path(__file__).parent.parent
|
||||
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
|
||||
msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.is_symlink() and not force:
|
||||
prints(Messages.M013, title=Messages.M012.format(name=link_name),
|
||||
exits=1)
|
||||
msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
|
||||
elif link_path.is_symlink(): # does a symlink exist?
|
||||
# NB: It's important to check for is_symlink here and not for exists,
|
||||
# because invalid/outdated symlinks would return False otherwise.
|
||||
link_path.unlink()
|
||||
elif link_path.exists(): # does it exist otherwise?
|
||||
elif link_path.exists(): # does it exist otherwise?
|
||||
# NB: Check this last because valid symlinks also "exist".
|
||||
prints(Messages.M015, link_path,
|
||||
title=Messages.M014.format(name=link_name), exits=1)
|
||||
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
|
||||
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
try:
|
||||
symlink_to(link_path, model_path)
|
||||
except:
|
||||
except: # noqa: E722
|
||||
# This is quite dirty, but just making sure other errors are caught.
|
||||
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
|
||||
msg.fail(Messages.M016.format(name=link_name), Messages.M017)
|
||||
msg.text(details)
|
||||
raise
|
||||
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
|
||||
msg.good(Messages.M018, details)
|
||||
msg.text(Messages.M019.format(name=link_name))
|
||||
|
|
|
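The ordering of the is_symlink()/exists() checks in link() above matters because Path.exists() follows symlinks and returns False for a dangling link. A small standalone sketch of that behaviour; the paths are made up:

# Why link() checks is_symlink() before exists(): a dangling symlink still
# "exists" as a link, but exists() follows it and reports False.
import os
import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
target = tmp / "model-data"              # does not exist yet
link_path = tmp / "en_test_link"
os.symlink(str(target), str(link_path))  # creates a broken symlink

print(link_path.is_symlink())  # True  - the link itself is there
print(link_path.exists())      # False - exists() follows the dangling link
target.mkdir()
print(link_path.exists())      # True once the target exists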
@@ -4,109 +4,106 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, get_raw_input
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str, json_dumps
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta_path=("path to meta.json", "option", "m", str),
|
||||
create_meta=("create meta.json, even if one exists in directory – if "
|
||||
"existing meta is found, entries are shown as defaults in "
|
||||
"the command line prompt", "flag", "c", bool),
|
||||
force=("force overwriting of existing model directory in output directory",
|
||||
"flag", "f", bool))
|
||||
def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
||||
force=False):
|
||||
input_dir=("Directory with model data", "positional", None, str),
|
||||
output_dir=("Output parent directory", "positional", None, str),
|
||||
meta_path=("Path to meta.json", "option", "m", str),
|
||||
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||
)
|
||||
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over.
|
||||
output directory, and model data will be copied over. If --create-meta is
|
||||
set and a meta.json already exists in the output directory, the existing
|
||||
values will be used as the defaults in the command-line prompt.
|
||||
"""
|
||||
msg = Printer()
|
||||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not input_path or not input_path.exists():
|
||||
prints(input_path, title=Messages.M008, exits=1)
|
||||
msg.fail(Messages.M008, input_path, exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
prints(output_path, title=Messages.M040, exits=1)
|
||||
msg.fail(Messages.M040, output_path, exits=1)
|
||||
if meta_path and not meta_path.exists():
|
||||
prints(meta_path, title=Messages.M020, exits=1)
|
||||
msg.fail(Messages.M020, meta_path, exits=1)
|
||||
|
||||
meta_path = meta_path or input_path / 'meta.json'
|
||||
meta_path = meta_path or input_path / "meta.json"
|
||||
if meta_path.is_file():
|
||||
meta = util.read_json(meta_path)
|
||||
if not create_meta: # only print this if user doesn't want to overwrite
|
||||
prints(meta_path, title=Messages.M041)
|
||||
if not create_meta: # only print if user doesn't want to overwrite
|
||||
msg.good(Messages.M041, meta_path)
|
||||
else:
|
||||
meta = generate_meta(input_dir, meta)
|
||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
||||
model_name = meta['lang'] + '_' + meta['name']
|
||||
model_name_v = model_name + '-' + meta['version']
|
||||
meta = generate_meta(input_dir, meta, msg)
|
||||
for key in ("lang", "name", "version"):
|
||||
if key not in meta or meta[key] == "":
|
||||
msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
|
||||
model_name = meta["lang"] + "_" + meta["name"]
|
||||
model_name_v = model_name + "-" + meta["version"]
|
||||
main_path = output_path / model_name_v
|
||||
package_path = main_path / model_name
|
||||
|
||||
create_dirs(package_path, force)
|
||||
shutil.copytree(path2str(input_path),
|
||||
path2str(package_path / model_name_v))
|
||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
|
||||
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
|
||||
create_file(package_path / '__init__.py', TEMPLATE_INIT)
|
||||
prints(main_path, Messages.M043,
|
||||
title=Messages.M042.format(name=model_name_v))
|
||||
|
||||
|
||||
def create_dirs(package_path, force):
|
||||
if package_path.exists():
|
||||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
else:
|
||||
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
|
||||
msg.fail(
|
||||
Messages.M044,
|
||||
Messages.M045.format(path=path2str(package_path)),
|
||||
exits=1,
|
||||
)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
create_file(main_path / "meta.json", json_dumps(meta))
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||
msg.good(Messages.M042.format(name=model_name_v), main_path)
|
||||
msg.text(Messages.M043)
|
||||
|
||||
|
||||
def create_file(file_path, contents):
|
||||
file_path.touch()
|
||||
file_path.open('w', encoding='utf-8').write(contents)
|
||||
file_path.open("w", encoding="utf-8").write(contents)
|
||||
|
||||
|
||||
def generate_meta(model_path, existing_meta):
|
||||
def generate_meta(model_path, existing_meta, msg):
|
||||
meta = existing_meta or {}
|
||||
settings = [('lang', 'Model language', meta.get('lang', 'en')),
|
||||
('name', 'Model name', meta.get('name', 'model')),
|
||||
('version', 'Model version', meta.get('version', '0.0.0')),
|
||||
('spacy_version', 'Required spaCy version',
|
||||
'>=%s,<3.0.0' % about.__version__),
|
||||
('description', 'Model description',
|
||||
meta.get('description', False)),
|
||||
('author', 'Author', meta.get('author', False)),
|
||||
('email', 'Author email', meta.get('email', False)),
|
||||
('url', 'Author website', meta.get('url', False)),
|
||||
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
|
||||
settings = [
|
||||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||
("description", "Model description", meta.get("description", False)),
|
||||
("author", "Author", meta.get("author", False)),
|
||||
("email", "Author email", meta.get("email", False)),
|
||||
("url", "Author website", meta.get("url", False)),
|
||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||
]
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta['pipeline'] = nlp.pipe_names
|
||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
||||
'vectors': len(nlp.vocab.vectors),
|
||||
'keys': nlp.vocab.vectors.n_keys}
|
||||
prints(Messages.M047, title=Messages.M046)
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
"keys": nlp.vocab.vectors.n_keys,
|
||||
}
|
||||
msg.divider(Messages.M046)
|
||||
msg.text(Messages.M047)
|
||||
for setting, desc, default in settings:
|
||||
response = util.get_raw_input(desc, default)
|
||||
meta[setting] = default if response == '' and default else response
|
||||
if about.__title__ != 'spacy':
|
||||
meta['parent_package'] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
def validate_meta(meta, keys):
|
||||
for key in keys:
|
||||
if key not in meta or meta[key] == '':
|
||||
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
|
||||
response = get_raw_input(desc, default)
|
||||
meta[setting] = default if response == "" and default else response
|
||||
if about.__title__ != "spacy":
|
||||
meta["parent_package"] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
|
|
|
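Putting generate_meta together with the vectors block added above, the meta.json written into the generated package ends up roughly like the dict below. This is a hedged example: every value is invented, and "pipeline"/"vectors" are filled in from the loaded model as shown in the code.

# Illustrative shape of the meta.json assembled by generate_meta(); all values invented.
meta = {
    "lang": "en",
    "name": "core_example",
    "version": "1.0.0",
    "spacy_version": ">=2.1.0,<3.0.0",
    "description": "Example model",
    "author": "Jane Doe",
    "email": "jane@example.com",
    "url": "https://example.com",
    "license": "CC BY-SA 3.0",
    "pipeline": ["tagger", "parser", "ner"],                     # from nlp.pipe_names
    "vectors": {"width": 300, "vectors": 20000, "keys": 60000},  # from nlp.vocab.vectors
}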
@@ -1,66 +1,148 @@
|
|||
'''This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pre-trained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
'''
|
||||
# coding: utf8
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import plac
|
||||
import random
|
||||
import numpy
|
||||
import time
|
||||
import ujson as json
|
||||
from pathlib import Path
|
||||
import ujson
|
||||
import sys
|
||||
from collections import Counter
|
||||
|
||||
import spacy
|
||||
from spacy.tokens import Doc
|
||||
from spacy.attrs import ID, HEAD
|
||||
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
|
||||
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||
from pathlib import Path
|
||||
from thinc.v2v import Affine, Maxout
|
||||
from thinc.api import wrap
|
||||
from thinc.misc import LayerNorm as LN
|
||||
from thinc.neural.util import prefer_gpu
|
||||
from wasabi import Printer
|
||||
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from ..compat import json_dumps
|
||||
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||
from .. import util
|
||||
|
||||
|
||||
def prefer_gpu():
|
||||
used = spacy.util.use_gpu(0)
|
||||
if used is None:
|
||||
return False
|
||||
else:
|
||||
import cupy.random
|
||||
cupy.random.seed(0)
|
||||
return True
|
||||
@plac.annotations(
|
||||
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
||||
vectors_model=("Name or path to vectors model to learn from"),
|
||||
output_dir=("Directory to write models each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
depth=("Depth of CNN layers", "option", "cd", int),
|
||||
embed_rows=("Embedding rows", "option", "er", int),
|
||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||
dropout=("Dropout", "option", "d", float),
|
||||
seed=("Seed for random number generators", "option", "s", float),
|
||||
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||
)
|
||||
def pretrain(
|
||||
texts_loc,
|
||||
vectors_model,
|
||||
output_dir,
|
||||
width=96,
|
||||
depth=4,
|
||||
embed_rows=2000,
|
||||
use_vectors=False,
|
||||
dropout=0.2,
|
||||
nr_iter=1000,
|
||||
seed=0,
|
||||
):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
using an approximate language-modelling objective. Specifically, we load
|
||||
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||
vectors which match the pre-trained ones. The weights are saved to a directory
|
||||
after each epoch. You can then pass a path to one of these pre-trained weights
|
||||
files to the 'spacy train' command.
|
||||
|
||||
This technique may be especially helpful if you have little labelled data.
|
||||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
def load_texts(path):
|
||||
'''Load inputs from a jsonl file.
|
||||
|
||||
Each line should be a dict like {"text": "..."}
|
||||
'''
|
||||
path = ensure_path(path)
|
||||
with path.open('r', encoding='utf8') as file_:
|
||||
texts = [json.loads(line) for line in file_]
|
||||
random.shuffle(texts)
|
||||
return texts
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
"""
|
||||
config = dict(locals())
|
||||
msg = Printer()
|
||||
util.fix_random_seed(seed)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good("Created output directory")
|
||||
util.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
|
||||
# Load texts from file or stdin
|
||||
if texts_loc != "-": # reading from a file
|
||||
texts_loc = Path(texts_loc)
|
||||
if not texts_loc.exists():
|
||||
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||
with msg.loading("Loading input texts..."):
|
||||
texts = list(util.read_jsonl(texts_loc))
|
||||
msg.good("Loaded input texts")
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.text("Reading input text from stdin...")
|
||||
texts = stream_texts()
|
||||
|
||||
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good("Loaded model '{}'".format(vectors_model))
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
Tok2Vec(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||
cnn_maxout_pieces=2, # You can try setting this higher
|
||||
subword_features=True,
|
||||
),
|
||||
) # Set to False for character models, e.g. Chinese
|
||||
optimizer = create_default_optimizer(model.ops)
|
||||
tracker = ProgressTracker()
|
||||
msg.divider("Pre-training tok2vec layer")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
for epoch in range(nr_iter):
|
||||
for batch in util.minibatch_by_words(
|
||||
((text, None) for text in texts), size=5000
|
||||
):
|
||||
docs = make_docs(nlp, [text for (text, _) in batch])
|
||||
loss = make_update(model, docs, optimizer, drop=dropout)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
|
||||
file_.write(model.tok2vec.to_bytes())
|
||||
log = {
|
||||
"nr_word": tracker.nr_word,
|
||||
"loss": tracker.loss,
|
||||
"epoch_loss": tracker.epoch_loss,
|
||||
"epoch": epoch,
|
||||
}
|
||||
with (output_dir / "log.jsonl").open("a") as file_:
|
||||
file_.write(json_dumps(log) + "\n")
|
||||
tracker.epoch_loss = 0.0
|
||||
if texts_loc != "-":
|
||||
# Reshuffle the texts if texts were loaded from a file
|
||||
random.shuffle(texts)
|
||||
|
||||
|
||||
def stream_texts():
|
||||
for line in sys.stdin:
|
||||
yield json.loads(line)
|
||||
yield ujson.loads(line)
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, drop=0.):
|
||||
def make_update(model, docs, optimizer, drop=0.0):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
|
@@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
|
|||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
loss = float((gradients**2).mean())
|
||||
loss = float((gradients ** 2).mean())
|
||||
return loss
|
||||
|
||||
|
||||
|
@@ -98,7 +180,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
|
|||
|
||||
def get_vectors_loss(ops, docs, prediction):
|
||||
"""Compute a mean-squared error loss between the documents' vectors and
|
||||
the prediction.
|
||||
the prediction.
|
||||
|
||||
Note that this is ripe for customization! We could compute the vectors
|
||||
# in some other way, e.g. with an LSTM language model, or use some other
|
||||
|
@@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
|
|||
|
||||
|
||||
def create_pretraining_model(nlp, tok2vec):
|
||||
'''Define a network for the pretraining. We simply add an output layer onto
|
||||
"""Define a network for the pretraining. We simply add an output layer onto
|
||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||
Each array in the output needs to have one row per token in the doc.
|
||||
'''
|
||||
"""
|
||||
output_size = nlp.vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
LN(Maxout(300, pieces=3)),
|
||||
zero_init(Affine(output_size, drop_factor=0.0))
|
||||
LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
|
||||
)
|
||||
# This is annoying, but the parser etc have the flatten step after
|
||||
# the tok2vec. To load the weights in cleanly, we need to match
|
||||
# the shape of the models' components exactly. So what we call
|
||||
# "tok2vec" has to be the same set of processes as what the components do.
|
||||
tok2vec = chain(tok2vec, flatten)
|
||||
model = chain(
|
||||
tok2vec,
|
||||
output_layer
|
||||
)
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = masked_language_model(nlp.vocab, model)
|
||||
model.tok2vec = tok2vec
|
||||
model.output_layer = output_layer
|
||||
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
|
||||
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return model
|
||||
|
||||
|
||||
def masked_language_model(vocab, model, mask_prob=0.15):
|
||||
'''Convert a model into a BERT-style masked language model'''
|
||||
"""Convert a model into a BERT-style masked language model"""
|
||||
|
||||
random_words = RandomWords(vocab)
|
||||
def mlm_forward(docs, drop=0.):
|
||||
|
||||
def mlm_forward(docs, drop=0.0):
|
||||
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
||||
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
||||
output, backprop = model.begin_update(docs, drop=drop)
|
||||
|
||||
def mlm_backward(d_output, sgd=None):
|
||||
d_output *= 1-mask
|
||||
d_output *= 1 - mask
|
||||
return backprop(d_output, sgd=sgd)
|
||||
|
||||
return output, mlm_backward
|
||||
|
@@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
|
|||
|
||||
def apply_mask(docs, random_words, mask_prob=0.15):
|
||||
N = sum(len(doc) for doc in docs)
|
||||
mask = numpy.random.uniform(0., 1.0, (N,))
|
||||
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
||||
mask = mask >= mask_prob
|
||||
i = 0
|
||||
masked_docs = []
|
||||
|
@@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
|
|||
return mask, masked_docs
|
||||
|
||||
|
||||
def replace_word(word, random_words, mask='[MASK]'):
|
||||
def replace_word(word, random_words, mask="[MASK]"):
|
||||
roll = random.random()
|
||||
if roll < 0.8:
|
||||
return mask
|
||||
|
@@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'):
|
|||
else:
|
||||
return word
|
||||
|
||||
|
||||
class RandomWords(object):
|
||||
def __init__(self, vocab):
|
||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||
self.words = self.words[:10000]
|
||||
self.probs = self.probs[:10000]
|
||||
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
|
||||
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
|
||||
self.probs /= self.probs.sum()
|
||||
self._cache = []
|
||||
|
||||
def next(self):
|
||||
if not self._cache:
|
||||
self._cache.extend(numpy.random.choice(len(self.words), 10000,
|
||||
p=self.probs))
|
||||
self._cache.extend(
|
||||
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||
)
|
||||
index = self._cache.pop()
|
||||
return self.words[index]
|
||||
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
def __init__(self, frequency=1000000):
|
||||
|
@@ -245,76 +326,3 @@ class ProgressTracker(object):
|
|||
return status
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
||||
vectors_model=("Name or path to vectors model to learn from"),
|
||||
output_dir=("Directory to write models each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
depth=("Depth of CNN layers", "option", "cd", int),
|
||||
embed_rows=("Embedding rows", "option", "er", int),
|
||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||
dropout=("Dropout", "option", "d", float),
|
||||
seed=("Seed for random number generators", "option", "s", float),
|
||||
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||
)
|
||||
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
|
||||
embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
using an approximate language-modelling objective. Specifically, we load
|
||||
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||
vectors which match the pre-trained ones. The weights are saved to a directory
|
||||
after each epoch. You can then pass a path to one of these pre-trained weights
|
||||
files to the 'spacy train' command.
|
||||
|
||||
This technique may be especially helpful if you have little labelled data.
|
||||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
"""
|
||||
config = dict(locals())
|
||||
output_dir = ensure_path(output_dir)
|
||||
random.seed(seed)
|
||||
numpy.random.seed(seed)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
with (output_dir / 'config.json').open('w') as file_:
|
||||
file_.write(json.dumps(config))
|
||||
has_gpu = prefer_gpu()
|
||||
print("Use GPU?", has_gpu)
|
||||
nlp = spacy.load(vectors_model)
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||
model = create_pretraining_model(nlp,
|
||||
Tok2Vec(width, embed_rows,
|
||||
conv_depth=depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||
cnn_maxout_pieces=2, # You can try setting this higher
|
||||
subword_features=True)) # Set to False for character models, e.g. Chinese
|
||||
optimizer = create_default_optimizer(model.ops)
|
||||
tracker = ProgressTracker()
|
||||
print('Epoch', '#Words', 'Loss', 'w/s')
|
||||
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
|
||||
for epoch in range(nr_iter):
|
||||
for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
|
||||
docs = make_docs(nlp, [text for (text, _) in batch])
|
||||
loss = make_update(model, docs, optimizer, drop=dropout)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
print(*progress)
|
||||
if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
|
||||
break
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
|
||||
file_.write(model.tok2vec.to_bytes())
|
||||
with (output_dir / 'log.jsonl').open('a') as file_:
|
||||
file_.write(json.dumps({'nr_word': tracker.nr_word,
|
||||
'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
|
||||
'epoch': epoch}) + '\n')
|
||||
tracker.epoch_loss = 0.0
|
||||
if texts_loc != '-':
|
||||
texts = load_texts(texts_loc)
|
||||
|
|
|
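The pretraining objective used by get_vectors_loss above is a plain squared-error loss between each token's pre-trained vector and the model's prediction, and make_update reports (gradients ** 2).mean() as the loss. A hedged numpy sketch of that computation; the shapes and values are invented:

# Sketch of the L2 pretraining objective; the arrays below are random stand-ins
# for the pre-trained vectors (target) and the tok2vec+output prediction.
import numpy

target = numpy.random.uniform(-1, 1, (12, 300)).astype("f")
prediction = numpy.random.uniform(-1, 1, (12, 300)).astype("f")

d_target = prediction - target        # gradient of the squared error (up to a constant)
loss = float((d_target ** 2).mean())  # the number reported in the progress row
print(loss)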
@@ -6,45 +6,64 @@ from pathlib import Path
|
|||
import ujson
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import spacy
|
||||
import sys
|
||||
import tqdm
|
||||
import cytoolz
|
||||
import thinc.extra.datasets
|
||||
from wasabi import Printer
|
||||
|
||||
|
||||
def read_inputs(loc):
|
||||
if loc is None:
|
||||
file_ = sys.stdin
|
||||
file_ = (line.encode('utf8') for line in file_)
|
||||
else:
|
||||
file_ = Path(loc).open()
|
||||
for line in file_:
|
||||
data = ujson.loads(line)
|
||||
text = data['text']
|
||||
yield text
|
||||
from ..util import load_model
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model/language", "positional", None, str),
|
||||
inputs=("Location of input file", "positional", None, read_inputs))
|
||||
def profile(lang, inputs=None):
|
||||
model=("Model to load", "positional", None, str),
|
||||
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||
)
|
||||
def profile(model, inputs=None, n_texts=10000):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
Input should be formatted as one JSON object per line with a key "text".
|
||||
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||
"""
|
||||
msg = Printer()
|
||||
if inputs is not None:
|
||||
inputs = _read_inputs(inputs, msg)
|
||||
if inputs is None:
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
inputs = inputs[:25000]
|
||||
nlp = spacy.load(lang)
|
||||
texts = list(cytoolz.take(10000, inputs))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
||||
"Profile.prof")
|
||||
n_inputs = 25000
|
||||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||
inputs = inputs[:n_inputs]
|
||||
with msg.loading("Loading model '{}'...".format(model)):
|
||||
nlp = load_model(model)
|
||||
msg.good("Loaded model '{}'".format(model))
|
||||
texts = list(cytoolz.take(n_texts, inputs))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
msg.divider("Profile stats")
|
||||
s.strip_dirs().sort_stats("time").print_stats()
|
||||
|
||||
|
||||
def parse_texts(nlp, texts):
|
||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||
pass
|
||||
|
||||
|
||||
def _read_inputs(loc, msg):
|
||||
if loc == "-":
|
||||
msg.info("Reading input from sys.stdin")
|
||||
file_ = sys.stdin
|
||||
file_ = (line.encode("utf8") for line in file_)
|
||||
else:
|
||||
input_path = Path(loc)
|
||||
if not input_path.exists() or not input_path.is_file():
|
||||
msg.fail("Not a valid input data file", loc, exits=1)
|
||||
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||
file_ = input_path.open()
|
||||
for line in file_:
|
||||
data = ujson.loads(line)
|
||||
text = data["text"]
|
||||
yield text
|
||||
|
|
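As the docstring above says, the profile command expects JSONL input with one {"text": ...} object per line and writes its stats to Profile.prof. A hedged sketch of preparing an input file and re-reading the saved stats afterwards; the paths and the exact CLI invocation are assumptions:

# Illustrative only: build a JSONL input file for the profile command and
# inspect the Profile.prof file it writes. Paths are invented.
import json
import pstats

with open("/tmp/profile_inputs.jsonl", "w", encoding="utf8") as f:
    for text in ["This is a sentence.", "And another one."]:
        f.write(json.dumps({"text": text}) + "\n")

# Assumed invocation: python -m spacy profile en_core_web_sm /tmp/profile_inputs.jsonl
stats = pstats.Stats("Profile.prof")   # run the command first so this file exists
stats.strip_dirs().sort_stats("time").print_stats(10)  # top 10 entries by time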
51
spacy/cli/schemas/__init__.py
Normal file
|
@@ -0,0 +1,51 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
from jsonschema import Draft4Validator
|
||||
|
||||
from ...errors import Errors
|
||||
from ...util import read_json
|
||||
|
||||
|
||||
SCHEMAS = {}
|
||||
|
||||
|
||||
def get_schema(name):
|
||||
"""Get the JSON schema for a given name. Looks for a .json file in
|
||||
spacy.cli.schemas, validates the schema and raises ValueError if not found.
|
||||
|
||||
EXAMPLE:
|
||||
>>> schema = get_schema('training')
|
||||
|
||||
name (unicode): The name of the schema.
|
||||
RETURNS (dict): The JSON schema.
|
||||
"""
|
||||
if name not in SCHEMAS:
|
||||
schema_path = Path(__file__).parent / "{}.json".format(name)
|
||||
if not schema_path.exists():
|
||||
raise ValueError(Errors.E104.format(name=name))
|
||||
schema = read_json(schema_path)
|
||||
# TODO: replace with (stable) Draft6Validator, if available
|
||||
validator = Draft4Validator(schema)
|
||||
validator.check_schema(schema)
|
||||
SCHEMAS[name] = schema
|
||||
return SCHEMAS[name]
|
||||
|
||||
|
||||
def validate_json(data, schema):
|
||||
"""Validate data against a given JSON schema (see https://json-schema.org).
|
||||
|
||||
data: JSON-serializable data to validate.
|
||||
schema (dict): The JSON schema.
|
||||
RETURNS (list): A list of error messages, if available.
|
||||
"""
|
||||
validator = Draft4Validator(schema)
|
||||
errors = []
|
||||
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
|
||||
if err.path:
|
||||
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
|
||||
else:
|
||||
err_path = ""
|
||||
errors.append(err.message + " " + err_path)
|
||||
return errors
|
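A small usage sketch for get_schema/validate_json above; the toy schema and data below are invented and are not the shipped meta/training schemas:

# Toy example of the validation pattern used by validate_json(); schema invented.
from jsonschema import Draft4Validator

schema = {
    "type": "object",
    "properties": {"lang": {"type": "string", "minLength": 2, "maxLength": 2}},
    "required": ["lang"],
}
Draft4Validator.check_schema(schema)   # same sanity check get_schema() performs

def errors_for(data):
    validator = Draft4Validator(schema)
    return [err.message for err in validator.iter_errors(data)]

print(errors_for({"lang": "en"}))       # []
print(errors_for({"lang": "english"}))  # one "is too long" error
print(errors_for({}))                   # one "is a required property" error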
128
spacy/cli/schemas/meta.json
Normal file
|
@@ -0,0 +1,128 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lang": {
|
||||
"title": "Two-letter language code, e.g. 'en'",
|
||||
"type": "string",
|
||||
"minLength": 2,
|
||||
"maxLength": 2,
|
||||
"pattern": "^[a-z]*$"
|
||||
},
|
||||
"name": {
|
||||
"title": "Model name",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[a-z_]*$"
|
||||
},
|
||||
"version": {
|
||||
"title": "Model version",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-]*$"
|
||||
},
|
||||
"spacy_version": {
|
||||
"title": "Compatible spaCy version identifier",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-><=]*$"
|
||||
},
|
||||
"parent_package": {
|
||||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"default": "spacy"
|
||||
},
|
||||
"pipeline": {
|
||||
"title": "Names of pipeline components",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"description": {
|
||||
"title": "Model description",
|
||||
"type": "string"
|
||||
},
|
||||
"license": {
|
||||
"title": "Model license",
|
||||
"type": "string"
|
||||
},
|
||||
"author": {
|
||||
"title": "Model author name",
|
||||
"type": "string"
|
||||
},
|
||||
"email": {
|
||||
"title": "Model author email",
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
},
|
||||
"url": {
|
||||
"title": "Model author URL",
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"sources": {
|
||||
"title": "Training data sources",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Included word vectors",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"keys": {
|
||||
"title": "Number of unique keys",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Number of unique vectors",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"width": {
|
||||
"title": "Number of dimensions",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"accuracy": {
|
||||
"title": "Accuracy numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"type": "number",
|
||||
"minimum": 0.0
|
||||
}
|
||||
}
|
||||
},
|
||||
"speed": {
|
||||
"title": "Speed evaluation numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "number",
|
||||
"minimum": 0.0
|
||||
},
|
||||
{
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"lang",
|
||||
"name",
|
||||
"version"
|
||||
]
|
||||
}
|
146
spacy/cli/schemas/training.json
Normal file
|
@@ -0,0 +1,146 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"title": "Training data for spaCy models",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"title": "The text of the training example",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"ents": {
|
||||
"title": "Named entity spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"label": {
|
||||
"title": "Entity label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[A-Z0-9]*$"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end",
|
||||
"label"
|
||||
]
|
||||
}
|
||||
},
|
||||
"sents": {
|
||||
"title": "Sentence spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end"
|
||||
]
|
||||
}
|
||||
},
|
||||
"cats": {
|
||||
"title": "Text categories for the text classifier",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"title": "A text category",
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"propertyNames": {
|
||||
"pattern": "^[A-Z0-9]*$",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"tokens": {
|
||||
"title": "The tokens in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"minProperties": 1,
|
||||
"properties": {
|
||||
"id": {
|
||||
"title": "Token ID, usually token index",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"start": {
|
||||
"title": "Start character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"pos": {
|
||||
"title": "Coarse-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"tag": {
|
||||
"title": "Fine-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"dep": {
|
||||
"title": "Dependency label",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"head": {
|
||||
"title": "Index of the token's head",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_": {
|
||||
"title": "Custom user space",
|
||||
"type": "object"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"text"
|
||||
]
|
||||
}
|
||||
}
|
|
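A minimal document that satisfies the training schema above, written as a Python dict so it could be passed to validate_json. Only "text" is required; the text, offsets and labels below are an invented example:

# Invented example matching spacy/cli/schemas/training.json.
training_data = [
    {
        "text": "Apple is looking at buying U.K. startup",
        "ents": [
            {"start": 0, "end": 5, "label": "ORG"},
            {"start": 27, "end": 31, "label": "GPE"},
        ],
        "cats": {"BUSINESS": 1.0},
    }
]
# e.g. validate_json(training_data, get_schema("training")) -> [] when valid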
@@ -6,213 +6,296 @@ from pathlib import Path
|
|||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from timeit import default_timer as timer
|
||||
import json
|
||||
import shutil
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from .._ml import create_default_optimizer
|
||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints, minibatch, minibatch_by_words
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||
# at the beginning of training.
|
||||
dropout_rates = util.decaying(
|
||||
util.env_opt("dropout_from", 0.2),
|
||||
util.env_opt("dropout_to", 0.2),
|
||||
util.env_opt("dropout_decay", 0.0),
|
||||
)
|
||||
batch_sizes = util.compounding(
|
||||
util.env_opt("batch_from", 1000),
|
||||
util.env_opt("batch_to", 1000),
|
||||
util.env_opt("batch_compound", 1.001),
|
||||
)
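The comment above only describes util.decaying and util.compounding in words. A hedged re-implementation of the idea (not the actual spaCy utilities) makes the schedules concrete:

# Illustrative re-implementation of the dropout/batch-size schedules described
# in the comment above; the real generators live in spacy.util.
import itertools

def compounding(start, stop, compound):
    # grow from start by a constant factor, capped at stop
    value = float(start)
    while True:
        yield min(value, stop) if start <= stop else max(value, stop)
        value *= compound

def decaying(start, stop, decay):
    # shrink from start by a constant step, floored at stop
    value = float(start)
    while True:
        yield max(value, stop)
        value -= decay

print(list(itertools.islice(compounding(1.0, 32.0, 1.5), 6)))  # 1.0, 1.5, 2.25, ...
print(list(itertools.islice(decaying(0.5, 0.2, 0.1), 5)))      # ~0.5, 0.4, 0.3, 0.2, 0.2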
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("output directory to store model in", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional",
|
||||
None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)",
|
||||
"positional", None, str),
|
||||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
lang=("Model language", "positional", None, str),
|
||||
output_path=("Output directory to store model in", "positional", None, Path),
|
||||
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||
base_model=("Name of model to update (optional)", "option", "b", str),
|
||||
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
||||
vectors=("Model to load vectors from", "option", "v", str),
|
||||
n_iter=("Number of iterations", "option", "n", int),
|
||||
n_examples=("Number of examples", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
vectors=("Model to load vectors from", "option", "v"),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
|
||||
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
|
||||
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||
"overwritten.", "option", "m", Path),
|
||||
init_tok2vec=("Path to pretrained weights for the token-to-vector parts "
|
||||
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
|
||||
verbose=("Display more information for debug", "option", None, bool))
|
||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
parser_multitasks='', entity_multitasks='', init_tok2vec=None,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0,
|
||||
no_parser=False, no_entities=False, gold_preproc=False,
|
||||
version="0.0.0", meta_path=None, verbose=False):
|
||||
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
||||
init_tok2vec=(
|
||||
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||
"option",
|
||||
"t2v",
|
||||
Path,
|
||||
),
|
||||
parser_multitasks=(
|
||||
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
|
||||
"option",
|
||||
"pt",
|
||||
str,
|
||||
),
|
||||
entity_multitasks=(
|
||||
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
|
||||
"option",
|
||||
"et",
|
||||
str,
|
||||
),
|
||||
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
|
||||
verbose=("Display more information for debug", "flag", "VV", bool),
|
||||
debug=("Run data diagnostics before training", "flag", "D", bool),
|
||||
)
|
||||
def train(
|
||||
lang,
|
||||
output_path,
|
||||
train_path,
|
||||
dev_path,
|
||||
base_model=None,
|
||||
pipeline="tagger,parser,ner",
|
||||
vectors=None,
|
||||
n_iter=30,
|
||||
n_examples=0,
|
||||
use_gpu=-1,
|
||||
version="0.0.0",
|
||||
meta_path=None,
|
||||
init_tok2vec=None,
|
||||
parser_multitasks="",
|
||||
entity_multitasks="",
|
||||
noise_level=0.0,
|
||||
gold_preproc=False,
|
||||
learn_tokens=False,
|
||||
verbose=False,
|
||||
debug=False,
|
||||
):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||
JSON format. To convert data from other formats, use the `spacy convert`
|
||||
command.
|
||||
"""
|
||||
msg = Printer()
|
||||
util.fix_random_seed()
|
||||
util.set_env_log(True)
|
||||
n_sents = n_sents or None
|
||||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
dev_path = util.ensure_path(dev_data)
|
||||
util.set_env_log(verbose)
|
||||
|
||||
# Make sure all files and paths exists if they are needed
|
||||
train_path = util.ensure_path(train_path)
|
||||
dev_path = util.ensure_path(dev_path)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not train_path.exists():
|
||||
prints(train_path, title=Messages.M050, exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
prints(dev_path, title=Messages.M051, exits=1)
|
||||
if not train_path or not train_path.exists():
|
||||
msg.fail(Messages.M050, train_path, exits=1)
|
||||
if not dev_path or not dev_path.exists():
|
||||
msg.fail(Messages.M051, dev_path, exits=1)
|
||||
if meta_path is not None and not meta_path.exists():
|
||||
prints(meta_path, title=Messages.M020, exits=1)
|
||||
msg.fail(Messages.M020, meta_path, exits=1)
|
||||
meta = util.read_json(meta_path) if meta_path else {}
|
||||
if not isinstance(meta, dict):
|
||||
prints(Messages.M053.format(meta_type=type(meta)),
|
||||
title=Messages.M052, exits=1)
|
||||
meta.setdefault('lang', lang)
|
||||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
||||
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||
msg.fail(Messages.M062, Messages.M065)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
|
||||
print("Counting training words (limit=%s" % n_sents)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
print(n_train_words)
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline:
|
||||
pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline:
|
||||
pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline:
|
||||
pipeline.remove('ner')
|
||||
# Set up the base model and pipeline. If a base model is specified, load
|
||||
# the model and make sure the pipeline matches the pipeline setting. If
|
||||
# training starts from a blank model, initialize the language class.
|
||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||
msg.text(Messages.M055.format(pipeline=pipeline))
|
||||
if base_model:
|
||||
msg.text(Messages.M056.format(model=base_model))
|
||||
nlp = util.load_model(base_model)
|
||||
if nlp.lang != lang:
|
||||
msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
|
||||
nlp.disable_pipes(*other_pipes)
|
||||
for pipe in pipeline:
|
||||
if pipe not in nlp.pipe_names:
|
||||
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||
else:
|
||||
msg.text(Messages.M057.format(model=lang))
|
||||
lang_cls = util.get_lang_class(lang)
|
||||
nlp = lang_cls()
|
||||
for pipe in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||
|
||||
if learn_tokens:
|
||||
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||
# at the beginning of training.
|
||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
|
||||
util.env_opt('dropout_to', 0.1),
|
||||
util.env_opt('dropout_decay', 0.0))
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 750),
|
||||
util.env_opt('batch_to', 750),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
dropout_rates = util.decaying(
|
||||
util.env_opt("dropout_from", 0.1),
|
||||
util.env_opt("dropout_to", 0.1),
|
||||
util.env_opt("dropout_decay", 0.0),
|
||||
)
|
||||
batch_sizes = util.compounding(
|
||||
util.env_opt("batch_from", 750),
|
||||
util.env_opt("batch_to", 750),
|
||||
util.env_opt("batch_compound", 1.001),
|
||||
)
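# Illustrative sketch (not part of this diff): decaying() and compounding() return
# infinite generators, so each update pulls a fresh value with next(). With the
# defaults above both schedules are flat (dropout 0.1, batch size 750); a growing
# batch schedule would look roughly like this:
from spacy.util import compounding, decaying

batch_sizes_demo = compounding(1.0, 32.0, 1.001)  # multiplied by 1.001 each step, capped at 32
dropout_demo = decaying(0.6, 0.2, 1e-4)           # decays from 0.6 towards 0.2
print([round(next(batch_sizes_demo), 3) for _ in range(3)])  # [1.0, 1.001, 1.002]
print(next(dropout_demo))                                    # close to 0.6 on the first step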
|
||||
lang_class = util.get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
meta['pipeline'] = pipeline
|
||||
meta["pipeline"] = pipeline
|
||||
nlp.meta.update(meta)
|
||||
if vectors:
|
||||
print("Load vectors model", vectors)
|
||||
util.load_model(vectors, vocab=nlp.vocab)
|
||||
for lex in nlp.vocab:
|
||||
values = {}
|
||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||
# These attrs are expected to be set by data. Others should
|
||||
# be set by calling the language functions.
|
||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||
lex.set_attrs(**values)
|
||||
lex.is_oov = False
|
||||
for name in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
||||
if parser_multitasks:
|
||||
for objective in parser_multitasks.split(','):
|
||||
nlp.parser.add_multitask_objective(objective)
|
||||
if entity_multitasks:
|
||||
for objective in entity_multitasks.split(','):
|
||||
nlp.entity.add_multitask_objective(objective)
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
if init_tok2vec is not None:
|
||||
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
print("Loaded pretrained tok2vec for:", loaded)
|
||||
msg.text(Messages.M058.format(model=vectors))
|
||||
_load_vectors(nlp, vectors)
|
||||
|
||||
# Multitask objectives
|
||||
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
||||
for pipe_name, multitasks in multitask_options:
|
||||
if multitasks:
|
||||
if pipe_name not in pipeline:
|
||||
msg.fail(Messages.M059.format(pipe=pipe_name))
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
for objective in multitasks.split(","):
|
||||
pipe.add_multitask_objective(objective)
|
||||
|
||||
# Prepare training corpus
|
||||
msg.text(Messages.M060.format(limit=n_examples))
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
|
||||
n_train_words = corpus.count_train()
|
||||
|
||||
if base_model:
|
||||
# Start with an existing model, use default optimizer
|
||||
optimizer = create_default_optimizer(Model.ops)
|
||||
else:
|
||||
# Start with a blank model, call begin_training
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
nlp._optimizer = None
|
||||
|
||||
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
|
||||
# Load in pre-trained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text(Messages.M071.format(components=components))
|
||||
|
||||
print(
|
||||
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
|
||||
)
|
||||
try:
|
||||
for i in range(n_iter):
|
||||
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
|
||||
gold_preproc=gold_preproc, max_length=0)
|
||||
train_docs = corpus.train_docs(
|
||||
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
|
||||
)
|
||||
words_seen = 0
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
losses = {}
|
||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
||||
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
|
||||
if not batch:
|
||||
continue
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer,
|
||||
drop=next(dropout_rates), losses=losses)
|
||||
nlp.update(
|
||||
docs,
|
||||
golds,
|
||||
sgd=optimizer,
|
||||
drop=next(dropout_rates),
|
||||
losses=losses,
|
||||
)
|
||||
pbar.update(sum(len(doc) for doc in docs))
|
||||
words_seen += sum(len(doc) for doc in docs)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
epoch_model_path = output_path / ("model%d" % i)
|
||||
nlp.to_disk(epoch_model_path)
|
||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||
dev_docs = list(corpus.dev_docs(
|
||||
nlp_loaded,
|
||||
gold_preproc=gold_preproc))
|
||||
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
||||
scorer = nlp_loaded.evaluate(dev_docs, debug)
|
||||
end_time = timer()
|
||||
if use_gpu < 0:
|
||||
gpu_wps = None
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
cpu_wps = nwords / (end_time - start_time)
|
||||
else:
|
||||
gpu_wps = nwords/(end_time-start_time)
|
||||
with Model.use_device('cpu'):
|
||||
gpu_wps = nwords / (end_time - start_time)
|
||||
with Model.use_device("cpu"):
|
||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||
dev_docs = list(corpus.dev_docs(
|
||||
nlp_loaded, gold_preproc=gold_preproc))
|
||||
dev_docs = list(
|
||||
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
|
||||
)
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs)
|
||||
end_time = timer()
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
||||
with acc_loc.open('w') as file_:
|
||||
file_.write(json_dumps(scorer.scores))
|
||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||
meta['accuracy'] = scorer.scores
|
||||
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
|
||||
'gpu': gpu_wps}
|
||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
||||
'vectors': len(nlp.vocab.vectors),
|
||||
'keys': nlp.vocab.vectors.n_keys}
|
||||
meta['lang'] = nlp.lang
|
||||
meta['pipeline'] = pipeline
|
||||
meta['spacy_version'] = '>=%s' % about.__version__
|
||||
meta.setdefault('name', 'model%d' % i)
|
||||
meta.setdefault('version', version)
|
||||
cpu_wps = nwords / (end_time - start_time)
|
||||
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
||||
util.write_json(acc_loc, scorer.scores)
|
||||
|
||||
with meta_loc.open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
||||
gpu_wps=gpu_wps)
|
||||
# Update model meta.json
|
||||
meta["lang"] = nlp.lang
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["spacy_version"] = ">=%s" % about.__version__
|
||||
meta["accuracy"] = scorer.scores
|
||||
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
"keys": nlp.vocab.vectors.n_keys,
|
||||
}
|
||||
meta.setdefault("name", "model%d" % i)
|
||||
meta.setdefault("version", version)
|
||||
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
||||
util.write_json(meta_loc, meta)
|
||||
|
||||
util.set_env_log(verbose)
|
||||
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
with nlp.use_params(optimizer.averages):
|
||||
final_model_path = output_path / 'model-final'
|
||||
nlp.to_disk(final_model_path)
|
||||
components = []
|
||||
if not no_parser:
|
||||
components.append('parser')
|
||||
if not no_tagger:
|
||||
components.append('tagger')
|
||||
if not no_entities:
|
||||
components.append('ner')
|
||||
_collate_best_model(meta, output_path, components)
|
||||
with msg.loading(Messages.M061):
|
||||
with nlp.use_params(optimizer.averages):
|
||||
final_model_path = output_path / "model-final"
|
||||
nlp.to_disk(final_model_path)
|
||||
msg.good(Messages.M066, util.path2str(final_model_path))
|
||||
|
||||
_collate_best_model(meta, output_path, nlp.pipe_names)
|
||||
|
||||
|
||||
def _load_vectors(nlp, vectors):
|
||||
util.load_model(vectors, vocab=nlp.vocab)
|
||||
for lex in nlp.vocab:
|
||||
values = {}
|
||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||
# These attrs are expected to be set by data. Others should
|
||||
# be set by calling the language functions.
|
||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||
lex.set_attrs(**values)
|
||||
lex.is_oov = False
|
||||
|
||||
|
||||
def _load_pretrained_tok2vec(nlp, loc):
|
||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||
"""
|
||||
with loc.open('rb') as file_:
|
||||
with loc.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
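# Illustrative usage (not part of this diff); the weights path is hypothetical and
# would normally come from the init_tok2vec CLI option, pointing at output written
# by `spacy pretrain`:
#
#   weights_path = Path("/tmp/pretrain/model999.bin")        # hypothetical
#   loaded_names = _load_pretrained_tok2vec(nlp, weights_path)
#   print("Loaded pretrained tok2vec weights into:", loaded_names)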
|
||||
|
@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
|
|||
bests = {}
|
||||
for component in components:
|
||||
bests[component] = _find_best(output_path, component)
|
||||
best_dest = output_path / 'model-best'
|
||||
shutil.copytree(output_path / 'model-final', best_dest)
|
||||
best_dest = output_path / "model-best"
|
||||
shutil.copytree(output_path / "model-final", best_dest)
|
||||
for component, best_component_src in bests.items():
|
||||
shutil.rmtree(best_dest / component)
|
||||
shutil.copytree(best_component_src / component, best_dest / component)
|
||||
with (best_component_src / 'accuracy.json').open() as file_:
|
||||
accs = json.load(file_)
|
||||
accs = util.read_json(best_component_src / "accuracy.json")
|
||||
for metric in _get_metrics(component):
|
||||
meta['accuracy'][metric] = accs[metric]
|
||||
with (best_dest / 'meta.json').open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
meta["accuracy"][metric] = accs[metric]
|
||||
util.write_json(best_dest / "meta.json", meta)
|
||||
|
||||
|
||||
def _find_best(experiment_dir, component):
|
||||
accuracies = []
|
||||
for epoch_model in experiment_dir.iterdir():
|
||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||
accs = json.load((epoch_model / "accuracy.json").open())
|
||||
accs = util.read_json(epoch_model / "accuracy.json")
|
||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||
accuracies.append((scores, epoch_model))
|
||||
if accuracies:
|
||||
|
@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
|
|||
else:
|
||||
return None
|
||||
|
||||
|
||||
def _get_metrics(component):
|
||||
if component == "parser":
|
||||
return ("las", "uas", "token_acc")
|
||||
|
@ -257,50 +339,40 @@ def _get_metrics(component):
|
|||
return ("token_acc",)
|
||||
|
||||
|
||||
def _render_parses(i, to_render):
|
||||
to_render[0].user_data['title'] = "Batch %d" % i
|
||||
with Path('/tmp/entities.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='ent', page=True)
|
||||
file_.write(html)
|
||||
with Path('/tmp/parses.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
||||
scores = {}
|
||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
||||
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
|
||||
for col in [
|
||||
"dep_loss",
|
||||
"tag_loss",
|
||||
"uas",
|
||||
"tags_acc",
|
||||
"token_acc",
|
||||
"ents_p",
|
||||
"ents_r",
|
||||
"ents_f",
|
||||
"cpu_wps",
|
||||
"gpu_wps",
|
||||
]:
|
||||
scores[col] = 0.0
|
||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
||||
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||
scores.update(dev_scores)
|
||||
scores['cpu_wps'] = cpu_wps
|
||||
scores['gpu_wps'] = gpu_wps or 0.0
|
||||
tpl = ''.join((
|
||||
'{:<6d}',
|
||||
'{dep_loss:<10.3f}',
|
||||
'{ner_loss:<10.3f}',
|
||||
'{uas:<8.3f}',
|
||||
'{ents_p:<8.3f}',
|
||||
'{ents_r:<8.3f}',
|
||||
'{ents_f:<8.3f}',
|
||||
'{tags_acc:<8.3f}',
|
||||
'{token_acc:<9.3f}',
|
||||
'{cpu_wps:<9.1f}',
|
||||
'{gpu_wps:.1f}',
|
||||
))
|
||||
scores["cpu_wps"] = cpu_wps
|
||||
scores["gpu_wps"] = gpu_wps or 0.0
|
||||
tpl = "".join(
|
||||
(
|
||||
"{:<6d}",
|
||||
"{dep_loss:<10.3f}",
|
||||
"{ner_loss:<10.3f}",
|
||||
"{uas:<8.3f}",
|
||||
"{ents_p:<8.3f}",
|
||||
"{ents_r:<8.3f}",
|
||||
"{ents_f:<8.3f}",
|
||||
"{tags_acc:<8.3f}",
|
||||
"{token_acc:<9.3f}",
|
||||
"{cpu_wps:<9.1f}",
|
||||
"{gpu_wps:.1f}",
|
||||
)
|
||||
)
|
||||
print(tpl.format(itn, **scores))
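# Example of a resulting row (illustrative numbers only), lining up with the
# "Itn.  Dep Loss  NER Loss  UAS  NER P.  NER R.  NER F.  Tag %  Token %  CPU WPS  GPU WPS"
# header printed before the training loop:
#
#   12    8514.305  3204.118  91.212  84.531  83.902  84.215  96.801  99.120  12842.3  0.0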
|
||||
|
||||
|
||||
def print_results(scorer):
|
||||
results = {
|
||||
'TOK': '%.2f' % scorer.token_acc,
|
||||
'POS': '%.2f' % scorer.tags_acc,
|
||||
'UAS': '%.2f' % scorer.uas,
|
||||
'LAS': '%.2f' % scorer.las,
|
||||
'NER P': '%.2f' % scorer.ents_p,
|
||||
'NER R': '%.2f' % scorer.ents_r,
|
||||
'NER F': '%.2f' % scorer.ents_f}
|
||||
util.print_table(results, title="Results")
|
||||
|
|
2
spacy/cli/ud/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
|
||||
from .ud_train import main as ud_train # noqa: F401
|
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
# flake8: noqa
|
||||
|
||||
# CoNLL 2017 UD Parsing evaluation script.
|
||||
#
|
||||
|
@ -214,7 +215,7 @@ def load_conllu(file):
|
|||
start, end = map(int, columns[ID].split("-"))
|
||||
except:
|
||||
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
||||
|
||||
|
||||
for _ in range(start, end + 1):
|
||||
word_line = file.readline().rstrip("\r\n")
|
||||
word_columns = word_line.split("\t")
|
|
@ -1,7 +1,9 @@
|
|||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
# flake8: noqa
|
||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
.conllu format for development data, allowing the official scorer to be used.
|
||||
'''
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
|
@ -11,15 +13,17 @@ import json
|
|||
|
||||
import spacy
|
||||
import spacy.util
|
||||
from ..tokens import Token, Doc
|
||||
from ..gold import GoldParse
|
||||
from ..util import compounding, minibatch_by_words
|
||||
from ..syntax.nonproj import projectivize
|
||||
from ..matcher import Matcher
|
||||
#from ..morphology import Fused_begin, Fused_inside
|
||||
from .. import displacy
|
||||
from ...tokens import Token, Doc
|
||||
from ...gold import GoldParse
|
||||
from ...util import compounding, minibatch_by_words
|
||||
from ...syntax.nonproj import projectivize
|
||||
from ...matcher import Matcher
|
||||
|
||||
# from ...morphology import Fused_begin, Fused_inside
|
||||
from ... import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
|
||||
Fused_begin = None
|
||||
Fused_inside = None
|
||||
|
||||
|
@ -30,43 +34,45 @@ import cytoolz
|
|||
|
||||
from . import conll17_ud_eval
|
||||
|
||||
from .. import lang
|
||||
from .. import lang
|
||||
from ..lang import zh
|
||||
from ..lang import ja
|
||||
from ..lang import ru
|
||||
from ... import lang
|
||||
from ...lang import zh
|
||||
from ...lang import ja
|
||||
from ...lang import ru
|
||||
|
||||
|
||||
################
|
||||
# Data reading #
|
||||
################
|
||||
|
||||
space_re = re.compile('\s+')
|
||||
space_re = re.compile("\s+")
|
||||
|
||||
|
||||
def split_text(text):
|
||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
||||
|
||||
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||
|
||||
|
||||
##############
|
||||
# Evaluation #
|
||||
##############
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
docs = []
|
||||
sent = []
|
||||
doc = []
|
||||
for line in file_:
|
||||
if line.startswith('# newdoc'):
|
||||
if line.startswith("# newdoc"):
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
doc = []
|
||||
elif line.startswith('#'):
|
||||
elif line.startswith("#"):
|
||||
continue
|
||||
elif not line.strip():
|
||||
if sent:
|
||||
doc.append(sent)
|
||||
sent = []
|
||||
else:
|
||||
sent.append(list(line.strip().split('\t')))
|
||||
sent.append(list(line.strip().split("\t")))
|
||||
if len(sent[-1]) != 10:
|
||||
print(repr(line))
|
||||
raise ValueError
|
||||
|
@ -78,7 +84,7 @@ def read_conllu(file_):
|
|||
|
||||
|
||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||
if text_loc.parts[-1].endswith('.conllu'):
|
||||
if text_loc.parts[-1].endswith(".conllu"):
|
||||
docs = []
|
||||
with text_loc.open() as file_:
|
||||
for conllu_doc in read_conllu(file_):
|
||||
|
@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
for name, component in nlp.pipeline:
|
||||
docs = list(component.pipe(docs))
|
||||
else:
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
with text_loc.open("r", encoding="utf8") as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
||||
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||
write_conllu(docs, out_file)
|
||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
||||
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
||||
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||
return docs, scores
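# Illustrative call (not part of this diff); all paths are hypothetical, and the
# first path may be either raw .txt input or a pre-segmented .conllu file:
#
#   docs, scores = evaluate(
#       nlp,
#       Path("UD_English/en-ud-test.txt"),      # system input (raw text here)
#       Path("UD_English/en-ud-test.conllu"),   # gold CoNLL-U
#       Path("/tmp/en-ud-test-sys.conllu"),     # where the system parses are written
#   )
#   print(scores["LAS"].f1, scores["UAS"].f1)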
|
||||
|
@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
|
||||
def write_conllu(docs, file_):
|
||||
merger = Matcher(docs[0].vocab)
|
||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
||||
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||
for i, doc in enumerate(docs):
|
||||
matches = merger(doc)
|
||||
spans = [doc[start:end+1] for _, start, end in matches]
|
||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||
for start_char, end_char in offsets:
|
||||
doc.merge(start_char, end_char)
|
||||
# TODO: This shouldn't be necessary? Should be handled in merge
|
||||
for word in doc:
|
||||
if word.i == word.head.i:
|
||||
word.dep_ = 'ROOT'
|
||||
word.dep_ = "ROOT"
|
||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||
for j, sent in enumerate(doc.sents):
|
||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||
file_.write("# text = {text}\n".format(text=sent.text))
|
||||
for k, token in enumerate(sent):
|
||||
file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
|
||||
file_.write('\n')
|
||||
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
|
||||
file_.write("\n")
|
||||
for word in sent:
|
||||
if word.head.i == word.i and word.dep_ == 'ROOT':
|
||||
if word.head.i == word.i and word.dep_ == "ROOT":
|
||||
break
|
||||
else:
|
||||
print("Rootless sentence!")
|
||||
|
@ -134,24 +140,34 @@ def write_conllu(docs, file_):
|
|||
|
||||
|
||||
def _get_token_conllu(token, k, sent_len):
|
||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
||||
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||
n = 1
|
||||
text = [token.text]
|
||||
while token.nbor(n).check_morph(Fused_inside):
|
||||
text.append(token.nbor(n).text)
|
||||
n += 1
|
||||
id_ = '%d-%d' % (k+1, (k+n))
|
||||
fields = [id_, ''.join(text)] + ['_'] * 8
|
||||
lines = ['\t'.join(fields)]
|
||||
id_ = "%d-%d" % (k + 1, (k + n))
|
||||
fields = [id_, "".join(text)] + ["_"] * 8
|
||||
lines = ["\t".join(fields)]
|
||||
else:
|
||||
lines = []
|
||||
if token.head.i == token.i:
|
||||
head = 0
|
||||
else:
|
||||
head = k + (token.head.i - token.i) + 1
|
||||
fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
||||
str(head), token.dep_.lower(), '_', '_']
|
||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
||||
fields = [
|
||||
str(k + 1),
|
||||
token.text,
|
||||
token.lemma_,
|
||||
token.pos_,
|
||||
token.tag_,
|
||||
"_",
|
||||
str(head),
|
||||
token.dep_.lower(),
|
||||
"_",
|
||||
"_",
|
||||
]
|
||||
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||
if k == 0:
|
||||
fields[1] = token.norm_[0].upper() + token.norm_[1:]
|
||||
else:
|
||||
|
@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):
|
|||
split_end = token._.split_end
|
||||
split_len = (split_end.i - split_start.i) + 1
|
||||
n_in_split = token.i - split_start.i
|
||||
subtokens = guess_fused_orths(split_start.text, [''] * split_len)
|
||||
subtokens = guess_fused_orths(split_start.text, [""] * split_len)
|
||||
fields[1] = subtokens[n_in_split]
|
||||
|
||||
lines.append('\t'.join(fields))
|
||||
return '\n'.join(lines)
|
||||
lines.append("\t".join(fields))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def guess_fused_orths(word, ud_forms):
|
||||
'''The UD data 'fused tokens' don't necessarily expand to keys that match
|
||||
"""The UD data 'fused tokens' don't necessarily expand to keys that match
|
||||
the form. We need orths that exactly match the string. Here we make a best
|
||||
effort to divide up the word.'''
|
||||
if word == ''.join(ud_forms):
|
||||
effort to divide up the word."""
|
||||
if word == "".join(ud_forms):
|
||||
# Happy case: we get a perfect split, with each letter accounted for.
|
||||
return ud_forms
|
||||
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
|
||||
|
@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):
|
|||
remain = word
|
||||
for subtoken in ud_forms:
|
||||
assert len(subtoken) >= 1
|
||||
output.append(remain[:len(subtoken)])
|
||||
remain = remain[len(subtoken):]
|
||||
output.append(remain[: len(subtoken)])
|
||||
remain = remain[len(subtoken) :]
|
||||
assert len(remain) == 0, (word, ud_forms, remain)
|
||||
return output
|
||||
else:
|
||||
# Let's say word is 6 long, and there are three subtokens. The orths
|
||||
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
|
||||
first = word[:len(word)-(len(ud_forms)-1)]
|
||||
first = word[: len(word) - (len(ud_forms) - 1)]
|
||||
output = [first]
|
||||
remain = word[len(first):]
|
||||
remain = word[len(first) :]
|
||||
for i in range(1, len(ud_forms)):
|
||||
assert remain
|
||||
output.append(remain[:1])
|
||||
|
@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms):
|
|||
return output
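# Worked examples (illustrative, not part of this diff):
#   guess_fused_orths("don't", ["do", "n't"])  -> ["do", "n't"]   (forms join to the word)
#   guess_fused_orths("abcde", ["abx", "de"])  -> ["abc", "de"]   (lengths match, re-sliced)
#   guess_fused_orths("abcde", ["ab", "c"])    -> ["abcd", "e"]   (fallback: front-loaded split)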
|
||||
|
||||
|
||||
|
||||
def print_results(name, ud_scores):
|
||||
fields = {}
|
||||
if ud_scores is not None:
|
||||
fields.update({
|
||||
'words': ud_scores['Words'].f1 * 100,
|
||||
'sents': ud_scores['Sentences'].f1 * 100,
|
||||
'tags': ud_scores['XPOS'].f1 * 100,
|
||||
'uas': ud_scores['UAS'].f1 * 100,
|
||||
'las': ud_scores['LAS'].f1 * 100,
|
||||
})
|
||||
fields.update(
|
||||
{
|
||||
"words": ud_scores["Words"].f1 * 100,
|
||||
"sents": ud_scores["Sentences"].f1 * 100,
|
||||
"tags": ud_scores["XPOS"].f1 * 100,
|
||||
"uas": ud_scores["UAS"].f1 * 100,
|
||||
"las": ud_scores["LAS"].f1 * 100,
|
||||
}
|
||||
)
|
||||
else:
|
||||
fields.update({
|
||||
'words': 0.0,
|
||||
'sents': 0.0,
|
||||
'tags': 0.0,
|
||||
'uas': 0.0,
|
||||
'las': 0.0
|
||||
})
|
||||
tpl = '\t'.join((
|
||||
name,
|
||||
'{las:.1f}',
|
||||
'{uas:.1f}',
|
||||
'{tags:.1f}',
|
||||
'{sents:.1f}',
|
||||
'{words:.1f}',
|
||||
))
|
||||
fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0})
|
||||
tpl = "\t".join(
|
||||
(name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
|
||||
)
|
||||
print(tpl.format(**fields))
|
||||
return fields
|
||||
|
||||
|
||||
def get_token_split_start(token):
|
||||
if token.text == '':
|
||||
if token.text == "":
|
||||
assert token.i != 0
|
||||
i = -1
|
||||
while token.nbor(i).text == '':
|
||||
while token.nbor(i).text == "":
|
||||
i -= 1
|
||||
return token.nbor(i)
|
||||
elif (token.i+1) < len(token.doc) and token.nbor(1).text == '':
|
||||
elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "":
|
||||
return token
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def get_token_split_end(token):
|
||||
if (token.i+1) == len(token.doc):
|
||||
return token if token.text == '' else None
|
||||
elif token.text != '' and token.nbor(1).text != '':
|
||||
if (token.i + 1) == len(token.doc):
|
||||
return token if token.text == "" else None
|
||||
elif token.text != "" and token.nbor(1).text != "":
|
||||
return None
|
||||
i = 1
|
||||
while (token.i+i) < len(token.doc) and token.nbor(i).text == '':
|
||||
while (token.i + i) < len(token.doc) and token.nbor(i).text == "":
|
||||
i += 1
|
||||
return token.nbor(i-1)
|
||||
|
||||
return token.nbor(i - 1)
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
|
@ -262,54 +268,73 @@ def get_token_split_end(token):
|
|||
|
||||
|
||||
def load_nlp(experiments_dir, corpus):
|
||||
nlp = spacy.load(experiments_dir / corpus / 'best-model')
|
||||
nlp = spacy.load(experiments_dir / corpus / "best-model")
|
||||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
return nlp
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path),
|
||||
test_data_dir=(
|
||||
"Path to Universal Dependencies test data",
|
||||
"positional",
|
||||
None,
|
||||
Path,
|
||||
),
|
||||
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
||||
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
|
||||
corpus=(
|
||||
"UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
)
|
||||
def main(test_data_dir, experiment_dir, corpus):
|
||||
Token.set_extension('split_start', getter=get_token_split_start)
|
||||
Token.set_extension('split_end', getter=get_token_split_end)
|
||||
Token.set_extension('begins_fused', default=False)
|
||||
Token.set_extension('inside_fused', default=False)
|
||||
Token.set_extension("split_start", getter=get_token_split_start)
|
||||
Token.set_extension("split_end", getter=get_token_split_end)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
||||
|
||||
nlp = load_nlp(experiment_dir, corpus)
|
||||
|
||||
treebank_code = nlp.meta['treebank']
|
||||
for section in ('test', 'dev'):
|
||||
if section == 'dev':
|
||||
section_dir = 'conll17-ud-development-2017-03-19'
|
||||
else:
|
||||
section_dir = 'conll17-ud-test-2017-05-09'
|
||||
text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt')
|
||||
udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu')
|
||||
gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu')
|
||||
|
||||
header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
||||
print('\t'.join(header))
|
||||
inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path}
|
||||
for input_type in ('udp', 'raw'):
|
||||
treebank_code = nlp.meta["treebank"]
|
||||
for section in ("test", "dev"):
|
||||
if section == "dev":
|
||||
section_dir = "conll17-ud-development-2017-03-19"
|
||||
else:
|
||||
section_dir = "conll17-ud-test-2017-05-09"
|
||||
text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
|
||||
udpipe_path = (
|
||||
test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
|
||||
)
|
||||
gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
|
||||
|
||||
header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||
print("\t".join(header))
|
||||
inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
|
||||
for input_type in ("udp", "raw"):
|
||||
input_path = inputs[input_type]
|
||||
output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section)
|
||||
output_path = (
|
||||
experiment_dir / corpus / "{section}.conllu".format(section=section)
|
||||
)
|
||||
|
||||
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
||||
|
||||
accuracy = print_results(input_type, test_scores)
|
||||
acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section)
|
||||
with open(acc_path, 'w') as file_:
|
||||
acc_path = (
|
||||
experiment_dir
|
||||
/ corpus
|
||||
/ "{section}-accuracy.json".format(section=section)
|
||||
)
|
||||
with open(acc_path, "w") as file_:
|
||||
file_.write(json.dumps(accuracy, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@ -1,7 +1,9 @@
|
|||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
# flake8: noqa
|
||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
.conllu format for development data, allowing the official scorer to be used.
|
||||
'''
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
|
@ -11,12 +13,12 @@ import json
|
|||
|
||||
import spacy
|
||||
import spacy.util
|
||||
from ..tokens import Token, Doc
|
||||
from ..gold import GoldParse
|
||||
from ..util import compounding, minibatch, minibatch_by_words
|
||||
from ..syntax.nonproj import projectivize
|
||||
from ..matcher import Matcher
|
||||
from .. import displacy
|
||||
from ...tokens import Token, Doc
|
||||
from ...gold import GoldParse
|
||||
from ...util import compounding, minibatch, minibatch_by_words
|
||||
from ...syntax.nonproj import projectivize
|
||||
from ...matcher import Matcher
|
||||
from ... import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
|
||||
|
@ -27,10 +29,9 @@ import cytoolz
|
|||
|
||||
from . import conll17_ud_eval
|
||||
|
||||
from .. import lang
|
||||
from .. import lang
|
||||
from ..lang import zh
|
||||
from ..lang import ja
|
||||
from ... import lang
|
||||
from ...lang import zh
|
||||
from ...lang import ja
|
||||
|
||||
try:
|
||||
import torch
|
||||
|
@ -42,17 +43,26 @@ except ImportError:
|
|||
# Data reading #
|
||||
################
|
||||
|
||||
space_re = re.compile('\s+')
|
||||
def split_text(text):
|
||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
||||
|
||||
space_re = re.compile("\s+")
|
||||
|
||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
||||
max_doc_length=None, limit=None):
|
||||
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
|
||||
def split_text(text):
|
||||
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||
|
||||
|
||||
def read_data(
|
||||
nlp,
|
||||
conllu_file,
|
||||
text_file,
|
||||
raw_text=True,
|
||||
oracle_segments=False,
|
||||
max_doc_length=None,
|
||||
limit=None,
|
||||
):
|
||||
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True.'''
|
||||
created from the gold-standard segments. At least one must be True."""
|
||||
if not raw_text and not oracle_segments:
|
||||
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
||||
paragraphs = split_text(text_file.read())
|
||||
|
@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
|||
for cs in cd:
|
||||
sent = defaultdict(list)
|
||||
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
||||
if '.' in id_:
|
||||
if "." in id_:
|
||||
continue
|
||||
if '-' in id_:
|
||||
if "-" in id_:
|
||||
continue
|
||||
id_ = int(id_)-1
|
||||
head = int(head)-1 if head != '0' else id_
|
||||
sent['words'].append(word)
|
||||
sent['tags'].append(tag)
|
||||
sent['heads'].append(head)
|
||||
sent['deps'].append('ROOT' if dep == 'root' else dep)
|
||||
sent['spaces'].append(space_after == '_')
|
||||
sent['entities'] = ['-'] * len(sent['words'])
|
||||
sent['heads'], sent['deps'] = projectivize(sent['heads'],
|
||||
sent['deps'])
|
||||
id_ = int(id_) - 1
|
||||
head = int(head) - 1 if head != "0" else id_
|
||||
sent["words"].append(word)
|
||||
sent["tags"].append(tag)
|
||||
sent["heads"].append(head)
|
||||
sent["deps"].append("ROOT" if dep == "root" else dep)
|
||||
sent["spaces"].append(space_after == "_")
|
||||
sent["entities"] = ["-"] * len(sent["words"])
|
||||
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
|
||||
if oracle_segments:
|
||||
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
|
||||
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||
golds.append(GoldParse(docs[-1], **sent))
|
||||
|
||||
sent_annots.append(sent)
|
||||
|
@ -107,18 +116,18 @@ def read_conllu(file_):
|
|||
sent = []
|
||||
doc = []
|
||||
for line in file_:
|
||||
if line.startswith('# newdoc'):
|
||||
if line.startswith("# newdoc"):
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
doc = []
|
||||
elif line.startswith('#'):
|
||||
elif line.startswith("#"):
|
||||
continue
|
||||
elif not line.strip():
|
||||
if sent:
|
||||
doc.append(sent)
|
||||
sent = []
|
||||
else:
|
||||
sent.append(list(line.strip().split('\t')))
|
||||
sent.append(list(line.strip().split("\t")))
|
||||
if len(sent[-1]) != 10:
|
||||
print(repr(line))
|
||||
raise ValueError
|
||||
|
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
|||
flat = defaultdict(list)
|
||||
sent_starts = []
|
||||
for sent in sent_annots:
|
||||
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
|
||||
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
|
||||
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
|
||||
for field in ["words", "tags", "deps", "entities", "spaces"]:
|
||||
flat[field].extend(sent[field])
|
||||
sent_starts.append(True)
|
||||
sent_starts.extend([False] * (len(sent['words'])-1))
|
||||
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||
# Construct text if necessary
|
||||
assert len(flat['words']) == len(flat['spaces'])
|
||||
assert len(flat["words"]) == len(flat["spaces"])
|
||||
if text is None:
|
||||
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
|
||||
text = "".join(
|
||||
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||
)
|
||||
doc = nlp.make_doc(text)
|
||||
flat.pop('spaces')
|
||||
flat.pop("spaces")
|
||||
gold = GoldParse(doc, **flat)
|
||||
gold.sent_starts = sent_starts
|
||||
for i in range(len(gold.heads)):
|
||||
|
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
|||
|
||||
return doc, gold
|
||||
|
||||
|
||||
#############################
|
||||
# Data transforms for spaCy #
|
||||
#############################
|
||||
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
'''Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects.'''
|
||||
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects."""
|
||||
tuples = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
|
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
|
|||
# Evaluation #
|
||||
##############
|
||||
|
||||
|
||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||
if text_loc.parts[-1].endswith('.conllu'):
|
||||
if text_loc.parts[-1].endswith(".conllu"):
|
||||
docs = []
|
||||
with text_loc.open() as file_:
|
||||
for conllu_doc in read_conllu(file_):
|
||||
|
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
for name, component in nlp.pipeline:
|
||||
docs = list(component.pipe(docs))
|
||||
else:
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
with text_loc.open("r", encoding="utf8") as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
||||
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||
write_conllu(docs, out_file)
|
||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
||||
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
||||
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||
return docs, scores
|
||||
|
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
|
||||
def write_conllu(docs, file_):
|
||||
merger = Matcher(docs[0].vocab)
|
||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
||||
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||
for i, doc in enumerate(docs):
|
||||
matches = merger(doc)
|
||||
spans = [doc[start:end+1] for _, start, end in matches]
|
||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||
for start_char, end_char in offsets:
|
||||
doc.merge(start_char, end_char)
|
||||
|
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
|
|||
file_.write("# text = {text}\n".format(text=sent.text))
|
||||
for k, token in enumerate(sent):
|
||||
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
||||
for word in doc[sent[0].i-10 : sent[0].i]:
|
||||
for word in doc[sent[0].i - 10 : sent[0].i]:
|
||||
print(word.i, word.head.i, word.text, word.dep_)
|
||||
for word in sent:
|
||||
print(word.i, word.head.i, word.text, word.dep_)
|
||||
for word in doc[sent[-1].i : sent[-1].i+10]:
|
||||
for word in doc[sent[-1].i : sent[-1].i + 10]:
|
||||
print(word.i, word.head.i, word.text, word.dep_)
|
||||
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text)
|
||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
||||
file_.write('\n')
|
||||
raise ValueError(
|
||||
"Invalid parse: head outside sentence (%s)" % token.text
|
||||
)
|
||||
file_.write(token._.get_conllu_lines(k) + "\n")
|
||||
file_.write("\n")
|
||||
|
||||
|
||||
def print_progress(itn, losses, ud_scores):
|
||||
fields = {
|
||||
'dep_loss': losses.get('parser', 0.0),
|
||||
'tag_loss': losses.get('tagger', 0.0),
|
||||
'words': ud_scores['Words'].f1 * 100,
|
||||
'sents': ud_scores['Sentences'].f1 * 100,
|
||||
'tags': ud_scores['XPOS'].f1 * 100,
|
||||
'uas': ud_scores['UAS'].f1 * 100,
|
||||
'las': ud_scores['LAS'].f1 * 100,
|
||||
"dep_loss": losses.get("parser", 0.0),
|
||||
"tag_loss": losses.get("tagger", 0.0),
|
||||
"words": ud_scores["Words"].f1 * 100,
|
||||
"sents": ud_scores["Sentences"].f1 * 100,
|
||||
"tags": ud_scores["XPOS"].f1 * 100,
|
||||
"uas": ud_scores["UAS"].f1 * 100,
|
||||
"las": ud_scores["LAS"].f1 * 100,
|
||||
}
|
||||
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
||||
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||
if itn == 0:
|
||||
print('\t'.join(header))
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.1f}',
|
||||
'{las:.1f}',
|
||||
'{uas:.1f}',
|
||||
'{tags:.1f}',
|
||||
'{sents:.1f}',
|
||||
'{words:.1f}',
|
||||
))
|
||||
print("\t".join(header))
|
||||
tpl = "\t".join(
|
||||
(
|
||||
"{:d}",
|
||||
"{dep_loss:.1f}",
|
||||
"{las:.1f}",
|
||||
"{uas:.1f}",
|
||||
"{tags:.1f}",
|
||||
"{sents:.1f}",
|
||||
"{words:.1f}",
|
||||
)
|
||||
)
|
||||
print(tpl.format(itn, **fields))
|
||||
|
||||
#def get_sent_conllu(sent, sent_id):
|
||||
|
||||
# def get_sent_conllu(sent, sent_id):
|
||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||
|
||||
|
||||
def get_token_conllu(token, i):
|
||||
if token._.begins_fused:
|
||||
n = 1
|
||||
while token.nbor(n)._.inside_fused:
|
||||
n += 1
|
||||
id_ = '%d-%d' % (i, i+n)
|
||||
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
|
||||
id_ = "%d-%d" % (i, i + n)
|
||||
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
|
||||
else:
|
||||
lines = []
|
||||
if token.head.i == token.i:
|
||||
head = 0
|
||||
else:
|
||||
head = i + (token.head.i - token.i) + 1
|
||||
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
||||
str(head), token.dep_.lower(), '_', '_']
|
||||
lines.append('\t'.join(fields))
|
||||
return '\n'.join(lines)
|
||||
fields = [
|
||||
str(i + 1),
|
||||
token.text,
|
||||
token.lemma_,
|
||||
token.pos_,
|
||||
token.tag_,
|
||||
"_",
|
||||
str(head),
|
||||
token.dep_.lower(),
|
||||
"_",
|
||||
"_",
|
||||
]
|
||||
lines.append("\t".join(fields))
|
||||
return "\n".join(lines)
|
||||
|
||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
||||
Token.set_extension('begins_fused', default=False)
|
||||
Token.set_extension('inside_fused', default=False)
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
|
||||
##################
|
||||
|
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
|
|||
|
||||
|
||||
def load_nlp(corpus, config, vectors=None):
|
||||
lang = corpus.split('_')[0]
|
||||
lang = corpus.split("_")[0]
|
||||
nlp = spacy.blank(lang)
|
||||
if config.vectors:
|
||||
if not vectors:
|
||||
raise ValueError("config asks for vectors, but no vectors "
|
||||
"directory set on command line (use -v)")
|
||||
if not vectors:
|
||||
raise ValueError(
|
||||
"config asks for vectors, but no vectors "
|
||||
"directory set on command line (use -v)"
|
||||
)
|
||||
if (Path(vectors) / corpus).exists():
|
||||
nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab')
|
||||
nlp.meta['treebank'] = corpus
|
||||
nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
|
||||
nlp.meta["treebank"] = corpus
|
||||
return nlp
|
||||
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
if config.multitask_tag:
|
||||
nlp.parser.add_multitask_objective('tag')
|
||||
nlp.parser.add_multitask_objective("tag")
|
||||
if config.multitask_sent:
|
||||
nlp.parser.add_multitask_objective('sent_start')
|
||||
nlp.parser.add_multitask_objective("sent_start")
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
if tag is not None:
|
||||
nlp.tagger.add_label(tag)
|
||||
if torch is not None and device != -1:
|
||||
torch.set_default_tensor_type('torch.cuda.FloatTensor')
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
optimizer = nlp.begin_training(
|
||||
lambda: golds_to_gold_tuples(docs, golds), device=device,
|
||||
subword_features=config.subword_features, conv_depth=config.conv_depth,
|
||||
bilstm_depth=config.bilstm_depth)
|
||||
lambda: golds_to_gold_tuples(docs, golds),
|
||||
device=device,
|
||||
subword_features=config.subword_features,
|
||||
conv_depth=config.conv_depth,
|
||||
bilstm_depth=config.bilstm_depth,
|
||||
)
|
||||
if config.pretrained_tok2vec:
|
||||
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
||||
return optimizer
|
||||
|
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
|
|||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||
"""
|
||||
with Path(loc).open('rb') as file_:
|
||||
with Path(loc).open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
|
||||
|
||||
|
||||
|
||||
########################
|
||||
# Command line helpers #
|
||||
########################
|
||||
|
||||
|
||||
class Config(object):
|
||||
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False,
|
||||
multitask_sent=False, multitask_dep=False, multitask_vectors=None,
|
||||
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750,
|
||||
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True,
|
||||
vectors_dir=None, pretrained_tok2vec=None):
|
||||
def __init__(
|
||||
self,
|
||||
vectors=None,
|
||||
max_doc_length=10,
|
||||
multitask_tag=False,
|
||||
multitask_sent=False,
|
||||
multitask_dep=False,
|
||||
multitask_vectors=None,
|
||||
bilstm_depth=0,
|
||||
nr_epoch=30,
|
||||
min_batch_size=100,
|
||||
max_batch_size=1000,
|
||||
batch_by_words=True,
|
||||
dropout=0.2,
|
||||
conv_depth=4,
|
||||
subword_features=True,
|
||||
vectors_dir=None,
|
||||
pretrained_tok2vec=None,
|
||||
):
|
||||
if vectors_dir is not None:
|
||||
if vectors is None:
|
||||
vectors = True
|
||||
|
@ -346,13 +396,13 @@ class Config(object):
|
|||
multitask_vectors = True
|
||||
for key, value in locals().items():
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
@classmethod
|
||||
def load(cls, loc, vectors_dir=None):
|
||||
with Path(loc).open('r', encoding='utf8') as file_:
|
||||
with Path(loc).open("r", encoding="utf8") as file_:
|
||||
cfg = json.load(file_)
|
||||
if vectors_dir is not None:
|
||||
cfg['vectors_dir'] = vectors_dir
|
||||
cfg["vectors_dir"] = vectors_dir
|
||||
return cls(**cfg)
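# Illustrative config file (hypothetical values) that Config.load() would accept;
# the JSON keys mirror the keyword arguments of Config.__init__ above:
#
#   {
#       "max_doc_length": 10,
#       "multitask_tag": true,
#       "multitask_sent": true,
#       "nr_epoch": 30,
#       "dropout": 0.2,
#       "conv_depth": 4,
#       "pretrained_tok2vec": null
#   }
#
#   config = Config.load(Path("ud-config.json"), vectors_dir=None)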
|
||||
|
||||
|
||||
|
@ -364,43 +414,59 @@ class Dataset(object):
|
|||
self.text = None
|
||||
for file_path in self.path.iterdir():
|
||||
name = file_path.parts[-1]
|
||||
if section in name and name.endswith('conllu'):
|
||||
if section in name and name.endswith("conllu"):
|
||||
self.conllu = file_path
|
||||
elif section in name and name.endswith('txt'):
|
||||
elif section in name and name.endswith("txt"):
|
||||
self.text = file_path
|
||||
if self.conllu is None:
|
||||
msg = "Could not find .txt file in {path} for {section}"
|
||||
raise IOError(msg.format(section=section, path=path))
|
||||
if self.text is None:
|
||||
msg = "Could not find .txt file in {path} for {section}"
|
||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
||||
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
||||
|
||||
|
||||
class TreebankPaths(object):
|
||||
def __init__(self, ud_path, treebank, **cfg):
|
||||
self.train = Dataset(ud_path / treebank, 'train')
|
||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
||||
self.train = Dataset(ud_path / treebank, "train")
|
||||
self.dev = Dataset(ud_path / treebank, "dev")
|
||||
self.lang = self.train.lang
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"positional", None, str),
|
||||
corpus=(
|
||||
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
config=("Path to json formatted config file", "option", "C", Path),
|
||||
limit=("Size limit", "option", "n", int),
|
||||
gpu_device=("Use GPU", "option", "g", int),
|
||||
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
||||
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
|
||||
"option", "v", Path),
|
||||
vectors_dir=(
|
||||
"Path to directory with pre-trained vectors, named e.g. en/",
|
||||
"option",
|
||||
"v",
|
||||
Path,
|
||||
),
|
||||
)
|
||||
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None,
|
||||
use_oracle_segments=False):
|
||||
def main(
|
||||
ud_dir,
|
||||
parses_dir,
|
||||
corpus,
|
||||
config=None,
|
||||
limit=0,
|
||||
gpu_device=-1,
|
||||
vectors_dir=None,
|
||||
use_oracle_segments=False,
|
||||
):
|
||||
spacy.util.fix_random_seed()
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
|
||||
|
||||
if config is not None:
|
||||
config = Config.load(config, vectors_dir=vectors_dir)
|
||||
else:
|
||||
|
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
|||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit)
|
||||
docs, golds = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(),
|
||||
paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||
|
||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||
for i in range(config.nr_epoch):
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length, limit=limit,
|
||||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments)
|
||||
docs, golds = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(),
|
||||
paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit,
|
||||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments,
|
||||
)
|
||||
Xs = list(zip(docs, golds))
|
||||
random.shuffle(Xs)
|
||||
if config.batch_by_words:
|
||||
|
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
|||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
nlp.parser.cfg['beam_update_prob'] = next(beam_prob)
|
||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
||||
drop=config.dropout, losses=losses)
|
||||
|
||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
||||
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||
nlp.update(
|
||||
batch_docs,
|
||||
batch_gold,
|
||||
sgd=optimizer,
|
||||
drop=config.dropout,
|
||||
losses=losses,
|
||||
)
|
||||
|
||||
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
if use_oracle_segments:
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
|
||||
paths.dev.conllu, out_path)
|
||||
parsed_docs, scores = evaluate(
|
||||
nlp, paths.dev.conllu, paths.dev.conllu, out_path
|
||||
)
|
||||
else:
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.text,
|
||||
paths.dev.conllu, out_path)
|
||||
parsed_docs, scores = evaluate(
|
||||
nlp, paths.dev.text, paths.dev.conllu, out_path
|
||||
)
|
||||
print_progress(i, losses, scores)
|
||||
|
||||
|
||||
def _render_parses(i, to_render):
|
||||
to_render[0].user_data['title'] = "Batch %d" % i
|
||||
with Path('/tmp/parses.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
||||
to_render[0].user_data["title"] = "Batch %d" % i
|
||||
with Path("/tmp/parses.html").open("w") as file_:
|
||||
html = displacy.render(to_render[:5], style="dep", page=True)
|
||||
file_.write(html)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
|
|||
import pkg_resources
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import ujson
|
||||
import requests
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str, locale_escape
|
||||
from ..util import prints, get_data_path, read_json
|
||||
from ..compat import path2str
|
||||
from ..util import get_data_path, read_json
|
||||
from .. import about
|
||||
|
||||
|
||||
def validate():
|
||||
"""Validate that the currently installed version of spaCy is compatible
|
||||
"""
|
||||
Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
prints(Messages.M021, title=Messages.M003.format(code=r.status_code),
|
||||
exits=1)
|
||||
compat = r.json()['spacy']
|
||||
msg = Printer()
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
|
||||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
current_compat = compat.get(about.__version__)
|
||||
if not current_compat:
|
||||
prints(about.__compatibility__, exits=1,
|
||||
title=Messages.M022.format(version=about.__version__))
|
||||
msg.fail(
|
||||
Messages.M022.format(version=about.__version__),
|
||||
about.__compatibility__,
|
||||
exits=1,
|
||||
)
|
||||
all_models = set()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
|
@ -33,33 +39,38 @@ def validate():
|
|||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
||||
if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
||||
if not d['compat']])
|
||||
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||
incompat_models.update(
|
||||
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||
)
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
spacy_dir = Path(__file__).parent.parent
|
||||
|
||||
msg.divider(Messages.M023.format(version=about.__version__))
|
||||
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||
|
||||
prints(path2str(Path(__file__).parent.parent),
|
||||
title=Messages.M023.format(version=about.__version__))
|
||||
if model_links or model_pkgs:
|
||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
||||
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||
rows = []
|
||||
for name, data in model_pkgs.items():
|
||||
print(get_model_row(current_compat, name, data, 'package'))
|
||||
rows.append(get_model_row(current_compat, name, data, msg))
|
||||
for name, data in model_links.items():
|
||||
print(get_model_row(current_compat, name, data, 'link'))
|
||||
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
prints(Messages.M024, exits=0)
|
||||
msg.text(Messages.M024, exits=0)
|
||||
if update_models:
|
||||
cmd = ' python -m spacy download {}'
|
||||
print("\n " + Messages.M025)
|
||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
||||
msg.divider("Install updates")
|
||||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
prints(Messages.M025.format(version=about.__version__,
|
||||
models=', '.join(na_models)))
|
||||
msg.text(
|
||||
Messages.M025.format(version=about.__version__, models=", ".join(na_models))
|
||||
)
|
||||
if incompat_links:
|
||||
prints(Messages.M027.format(path=path2str(get_data_path())))
|
||||
msg.text(Messages.M027.format(path=path2str(get_data_path())))
|
||||
if incompat_models or incompat_links:
|
||||
sys.exit(1)
|
||||
|
||||
|
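For context, the `validate` command refactored above is normally run as `python -m spacy validate`; a minimal sketch of calling the same helper from Python (assuming spaCy 2.x is installed and an internet connection is available for the compatibility table) looks like this:

# Sketch only: runs the same check as `python -m spacy validate`.
from spacy.cli import validate

validate()  # prints a table of installed/linked models and exits non-zero if any are incompatible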
@ -70,50 +81,48 @@ def get_model_links(compat):
|
|||
if data_path:
|
||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||
for model in models:
|
||||
meta_path = Path(model) / 'meta.json'
|
||||
meta_path = Path(model) / "meta.json"
|
||||
if not meta_path.exists():
|
||||
continue
|
||||
meta = read_json(meta_path)
|
||||
link = model.parts[-1]
|
||||
name = meta['lang'] + '_' + meta['name']
|
||||
links[link] = {'name': name, 'version': meta['version'],
|
||||
'compat': is_compat(compat, name, meta['version'])}
|
||||
name = meta["lang"] + "_" + meta["name"]
|
||||
links[link] = {
|
||||
"name": name,
|
||||
"version": meta["version"],
|
||||
"compat": is_compat(compat, name, meta["version"]),
|
||||
}
|
||||
return links
|
||||
|
||||
|
||||
def get_model_pkgs(compat, all_models):
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
package = pkg_name.replace('-', '_')
|
||||
package = pkg_name.replace("-", "_")
|
||||
if package in all_models:
|
||||
version = pkg_data.version
|
||||
pkgs[pkg_name] = {'name': package, 'version': version,
|
||||
'compat': is_compat(compat, package, version)}
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"compat": is_compat(compat, package, version),
|
||||
}
|
||||
return pkgs
|
||||
|
||||
|
||||
def get_model_row(compat, name, data, type='package'):
|
||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
||||
if data['compat']:
|
||||
comp = tpl_green.format(locale_escape('✔', errors='ignore'))
|
||||
version = tpl_green.format(data['version'])
|
||||
def get_model_row(compat, name, data, msg, model_type="package"):
|
||||
if data["compat"]:
|
||||
comp = msg.text("", color="green", icon="good", no_print=True)
|
||||
version = msg.text(data["version"], color="green", no_print=True)
|
||||
else:
|
||||
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
|
||||
version = tpl_red.format(data['version'])
|
||||
return get_row(type, name, data['name'], version, comp)
|
||||
|
||||
|
||||
def get_row(*args):
|
||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
||||
return tpl_row.format(*args)
|
||||
version = msg.text(data["version"], color="red", no_print=True)
|
||||
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
|
||||
return (model_type, name, data["name"], version, comp)
|
||||
|
||||
|
||||
def is_model_path(model_path):
|
||||
exclude = ['cache', 'pycache', '__pycache__']
|
||||
exclude = ["cache", "pycache", "__pycache__"]
|
||||
name = model_path.parts[-1]
|
||||
return (model_path.is_dir() and name not in exclude
|
||||
and not name.startswith('.'))
|
||||
return model_path.is_dir() and name not in exclude and not name.startswith(".")
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
|
||||
|
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
|
|||
|
||||
def reformat_version(version):
|
||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||
if version.endswith('-alpha'):
|
||||
return version.replace('-alpha', 'a0')
|
||||
return version.replace('-alpha', 'a')
|
||||
if version.endswith("-alpha"):
|
||||
return version.replace("-alpha", "a0")
|
||||
return version.replace("-alpha", "a")
|
||||
|
|
|
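As a quick illustration of the `reformat_version` helper above (inputs are invented examples, derived only from the code shown):

# Behaviour of reformat_version on old-style pre-release tags:
reformat_version("2.0.0-alpha")   # -> "2.0.0a0"
reformat_version("2.0.0")         # -> "2.0.0" (no "-alpha" suffix, returned unchanged)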
@ -1,59 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import json
|
||||
import spacy
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
|
||||
from ..vectors import Vectors
|
||||
from ..util import prints, ensure_path
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("model output directory", "positional", None, Path),
|
||||
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
|
||||
None, Path),
|
||||
vectors_loc=("optional: location of vectors data, as numpy .npz",
|
||||
"positional", None, str),
|
||||
prune_vectors=("optional: number of vectors to prune to.",
|
||||
"option", "V", int)
|
||||
)
|
||||
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
|
||||
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
|
||||
if not lexemes_loc.exists():
|
||||
prints(lexemes_loc, title="Can't find lexical data", exits=1)
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
nlp = spacy.blank(lang)
|
||||
for word in nlp.vocab:
|
||||
word.rank = 0
|
||||
lex_added = 0
|
||||
with lexemes_loc.open() as file_:
|
||||
for line in file_:
|
||||
if line.strip():
|
||||
attrs = json.loads(line)
|
||||
if 'settings' in attrs:
|
||||
nlp.vocab.cfg.update(attrs['settings'])
|
||||
else:
|
||||
lex = nlp.vocab[attrs['orth']]
|
||||
lex.set_attrs(**attrs)
|
||||
assert lex.rank == attrs['id']
|
||||
lex_added += 1
|
||||
if vectors_loc is not None:
|
||||
vector_data = numpy.load(vectors_loc.open('rb'))
|
||||
nlp.vocab.vectors = Vectors(data=vector_data)
|
||||
for word in nlp.vocab:
|
||||
if word.rank:
|
||||
nlp.vocab.vectors.add(word.orth, row=word.rank)
|
||||
|
||||
if prune_vectors >= 1:
|
||||
remap = nlp.vocab.prune_vectors(prune_vectors)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
vec_added = len(nlp.vocab.vectors)
|
||||
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
|
||||
title="Sucessfully compiled vocab and vectors, and saved model")
title="Successfully compiled vocab and vectors, and saved model")
|
||||
return nlp
|
|
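For reference, the deleted `make_vocab` command above read one JSON object per line from the lexemes file, either a `settings` record or a lexeme keyed by `orth` and `id`; the exact attribute names beyond those two are assumptions here, not part of this diff:

# Shape of the JSONL input the removed command consumed (illustrative values):
# {"settings": {"oov_prob": -20.0}}
# {"orth": "dog", "id": 1345}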
@ -1,11 +1,10 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import ujson
|
||||
import itertools
|
||||
import locale
|
||||
import os
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
|
||||
|
@ -30,9 +29,9 @@ except ImportError:
|
|||
cupy = None
|
||||
|
||||
try:
|
||||
from thinc.neural.optimizers import Optimizer
|
||||
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||
except ImportError:
|
||||
from thinc.neural.optimizers import Adam as Optimizer
|
||||
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
|
||||
|
||||
pickle = pickle
|
||||
copy_reg = copy_reg
|
||||
|
@ -136,12 +135,3 @@ def import_file(name, loc):
|
|||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
def locale_escape(string, errors="replace"):
|
||||
"""
|
||||
Mangle non-supported characters, for savages with ascii terminals.
|
||||
"""
|
||||
encoding = locale.getpreferredencoding()
|
||||
string = string.encode(encoding, errors).decode("utf8")
|
||||
return string
|
||||
|
|
|
@ -5,15 +5,22 @@ from .render import DependencyRenderer, EntityRenderer
|
|||
from ..tokens import Doc, Span
|
||||
from ..compat import b_to_str
|
||||
from ..errors import Errors, Warnings, user_warning
|
||||
from ..util import prints, is_in_jupyter
|
||||
from ..util import is_in_jupyter
|
||||
|
||||
|
||||
_html = {}
|
||||
IS_JUPYTER = is_in_jupyter()
|
||||
|
||||
|
||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||
options={}, manual=False):
|
||||
def render(
|
||||
docs,
|
||||
style="dep",
|
||||
page=False,
|
||||
minify=False,
|
||||
jupyter=IS_JUPYTER,
|
||||
options={},
|
||||
manual=False,
|
||||
):
|
||||
"""Render displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
|
@ -25,8 +32,10 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
|||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
"""
|
||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
||||
'ent': (EntityRenderer, parse_ents)}
|
||||
factories = {
|
||||
"dep": (DependencyRenderer, parse_deps),
|
||||
"ent": (EntityRenderer, parse_ents),
|
||||
}
|
||||
if style not in factories:
|
||||
raise ValueError(Errors.E087.format(style=style))
|
||||
if isinstance(docs, (Doc, Span, dict)):
|
||||
|
@ -37,16 +46,18 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
|||
renderer, converter = factories[style]
|
||||
renderer = renderer(options=options)
|
||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||
html = _html['parsed']
|
||||
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||
html = _html["parsed"]
|
||||
if jupyter: # return HTML rendered by IPython display()
|
||||
from IPython.core.display import display, HTML
|
||||
|
||||
return display(HTML(html))
|
||||
return html
|
||||
|
||||
|
||||
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||
port=5000):
|
||||
def serve(
|
||||
docs, style="dep", page=True, minify=False, options={}, manual=False, port=5000
|
||||
):
|
||||
"""Serve displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
|
@ -58,25 +69,24 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
port (int): Port to serve visualisation.
|
||||
"""
|
||||
from wsgiref import simple_server
|
||||
render(docs, style=style, page=page, minify=minify, options=options,
|
||||
manual=manual)
|
||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||
prints("Using the '{}' visualizer".format(style),
|
||||
title="Serving on port {}...".format(port))
|
||||
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server("0.0.0.0", port, app)
|
||||
print("\nUsing the '{}' visualizer".format(style))
|
||||
print("Serving on port {}...\n".format(port))
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
prints("Shutting down server on port {}.".format(port))
|
||||
print("Shutting down server on port {}.".format(port))
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
||||
def app(environ, start_response):
|
||||
# headers and status need to be bytes in Python 2, see #1227
|
||||
headers = [(b_to_str(b'Content-type'),
|
||||
b_to_str(b'text/html; charset=utf-8'))]
|
||||
start_response(b_to_str(b'200 OK'), headers)
|
||||
res = _html['parsed'].encode(encoding='utf-8')
|
||||
# Headers and status need to be bytes in Python 2, see #1227
|
||||
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
|
||||
start_response(b_to_str(b"200 OK"), headers)
|
||||
res = _html["parsed"].encode(encoding="utf-8")
|
||||
return [res]
|
||||
|
||||
|
||||
|
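To put the reformatted `render`/`serve` API in context, a typical caller looks like the sketch below ("en_core_web_sm" is an assumed installed model, not part of this diff):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")           # assumed model
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
html = displacy.render(doc, style="dep", page=True)   # returns the markup as a string
# displacy.serve(doc, style="dep", port=5000)         # or serve it on 0.0.0.0:5000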
@ -89,11 +99,10 @@ def parse_deps(orig_doc, options={}):
|
|||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||
if not doc.is_parsed:
|
||||
user_warning(Warnings.W005)
|
||||
if options.get('collapse_phrases', False):
|
||||
if options.get("collapse_phrases", False):
|
||||
for np in list(doc.noun_chunks):
|
||||
np.merge(tag=np.root.tag_, lemma=np.root.lemma_,
|
||||
ent_type=np.root.ent_type_)
|
||||
if options.get('collapse_punct', True):
|
||||
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
|
||||
if options.get("collapse_punct", True):
|
||||
spans = []
|
||||
for word in doc[:-1]:
|
||||
if word.is_punct or not word.nbor(1).is_punct:
|
||||
|
@ -103,23 +112,31 @@ def parse_deps(orig_doc, options={}):
|
|||
while end < len(doc) and doc[end].is_punct:
|
||||
end += 1
|
||||
span = doc[start:end]
|
||||
spans.append((span.start_char, span.end_char, word.tag_,
|
||||
word.lemma_, word.ent_type_))
|
||||
spans.append(
|
||||
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
|
||||
)
|
||||
for start, end, tag, lemma, ent_type in spans:
|
||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
||||
if options.get('fine_grained'):
|
||||
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
||||
if options.get("fine_grained"):
|
||||
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
||||
else:
|
||||
words = [{'text': w.text, 'tag': w.pos_} for w in doc]
|
||||
words = [{"text": w.text, "tag": w.pos_} for w in doc]
|
||||
arcs = []
|
||||
for word in doc:
|
||||
if word.i < word.head.i:
|
||||
arcs.append({'start': word.i, 'end': word.head.i,
|
||||
'label': word.dep_, 'dir': 'left'})
|
||||
arcs.append(
|
||||
{"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
|
||||
)
|
||||
elif word.i > word.head.i:
|
||||
arcs.append({'start': word.head.i, 'end': word.i,
|
||||
'label': word.dep_, 'dir': 'right'})
|
||||
return {'words': words, 'arcs': arcs}
|
||||
arcs.append(
|
||||
{
|
||||
"start": word.head.i,
|
||||
"end": word.i,
|
||||
"label": word.dep_,
|
||||
"dir": "right",
|
||||
}
|
||||
)
|
||||
return {"words": words, "arcs": arcs}
|
||||
|
||||
|
||||
def parse_ents(doc, options={}):
|
||||
|
@ -128,10 +145,11 @@ def parse_ents(doc, options={}):
|
|||
doc (Doc): Document to parse.
|
||||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||
"""
|
||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
||||
for ent in doc.ents]
|
||||
ents = [
|
||||
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
|
||||
for ent in doc.ents
|
||||
]
|
||||
if not ents:
|
||||
user_warning(Warnings.W006)
|
||||
title = (doc.user_data.get('title', None)
|
||||
if hasattr(doc, 'user_data') else None)
|
||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
||||
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||
return {"text": doc.text, "ents": ents, "title": title}
|
||||
|
|
|
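The dictionaries built by `parse_deps` and `parse_ents` above are exactly what `render` accepts with `manual=True`, so a hand-crafted input (offsets invented for illustration) can skip the parse entirely:

# Hand-built input in the format produced by parse_ents, passed straight to displacy.
ent_input = {
    "text": "But Google is starting from behind.",
    "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    "title": None,
}
displacy.render(ent_input, style="ent", manual=True)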
@ -1,6 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import random
|
||||
|
||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
|
||||
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||
from ..util import minify_html, escape_html
|
||||
|
@ -8,7 +10,8 @@ from ..util import minify_html, escape_html
|
|||
|
||||
class DependencyRenderer(object):
|
||||
"""Render dependency parses as SVGs."""
|
||||
style = 'dep'
|
||||
|
||||
style = "dep"
|
||||
|
||||
def __init__(self, options={}):
|
||||
"""Initialise dependency renderer.
|
||||
|
@ -17,18 +20,16 @@ class DependencyRenderer(object):
|
|||
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
||||
color, bg, font)
|
||||
"""
|
||||
self.compact = options.get('compact', False)
|
||||
self.word_spacing = options.get('word_spacing', 45)
|
||||
self.arrow_spacing = options.get('arrow_spacing',
|
||||
12 if self.compact else 20)
|
||||
self.arrow_width = options.get('arrow_width',
|
||||
6 if self.compact else 10)
|
||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
||||
self.offset_x = options.get('offset_x', 50)
|
||||
self.color = options.get('color', '#000000')
|
||||
self.bg = options.get('bg', '#ffffff')
|
||||
self.font = options.get('font', 'Arial')
|
||||
self.compact = options.get("compact", False)
|
||||
self.word_spacing = options.get("word_spacing", 45)
|
||||
self.arrow_spacing = options.get("arrow_spacing", 12 if self.compact else 20)
|
||||
self.arrow_width = options.get("arrow_width", 6 if self.compact else 10)
|
||||
self.arrow_stroke = options.get("arrow_stroke", 2)
|
||||
self.distance = options.get("distance", 150 if self.compact else 175)
|
||||
self.offset_x = options.get("offset_x", 50)
|
||||
self.color = options.get("color", "#000000")
|
||||
self.bg = options.get("bg", "#ffffff")
|
||||
self.font = options.get("font", "Arial")
|
||||
|
||||
def render(self, parsed, page=False, minify=False):
|
||||
"""Render complete markup.
|
||||
|
@ -38,14 +39,18 @@ class DependencyRenderer(object):
|
|||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
||||
"""
|
||||
rendered = [self.render_svg(i, p['words'], p['arcs'])
|
||||
for i, p in enumerate(parsed)]
|
||||
# Create a random ID prefix to make sure parses don't receive the
|
||||
# same ID, even if they're identical
|
||||
id_prefix = random.randint(0, 999)
|
||||
rendered = [
|
||||
self.render_svg("{}-{}".format(id_prefix, i), p["words"], p["arcs"])
|
||||
for i, p in enumerate(parsed)
|
||||
]
|
||||
if page:
|
||||
content = ''.join([TPL_FIGURE.format(content=svg)
|
||||
for svg in rendered])
|
||||
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
||||
markup = TPL_PAGE.format(content=content)
|
||||
else:
|
||||
markup = ''.join(rendered)
|
||||
markup = "".join(rendered)
|
||||
if minify:
|
||||
return minify_html(markup)
|
||||
return markup
|
||||
|
@ -60,19 +65,25 @@ class DependencyRenderer(object):
|
|||
"""
|
||||
self.levels = self.get_levels(arcs)
|
||||
self.highest_level = len(self.levels)
|
||||
self.offset_y = self.distance/2*self.highest_level+self.arrow_stroke
|
||||
self.width = self.offset_x+len(words)*self.distance
|
||||
self.height = self.offset_y+3*self.word_spacing
|
||||
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
|
||||
self.width = self.offset_x + len(words) * self.distance
|
||||
self.height = self.offset_y + 3 * self.word_spacing
|
||||
self.id = render_id
|
||||
words = [self.render_word(w['text'], w['tag'], i)
|
||||
for i, w in enumerate(words)]
|
||||
arcs = [self.render_arrow(a['label'], a['start'],
|
||||
a['end'], a['dir'], i)
|
||||
for i, a in enumerate(arcs)]
|
||||
content = ''.join(words) + ''.join(arcs)
|
||||
return TPL_DEP_SVG.format(id=self.id, width=self.width,
|
||||
height=self.height, color=self.color,
|
||||
bg=self.bg, font=self.font, content=content)
|
||||
words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
|
||||
arcs = [
|
||||
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||
for i, a in enumerate(arcs)
|
||||
]
|
||||
content = "".join(words) + "".join(arcs)
|
||||
return TPL_DEP_SVG.format(
|
||||
id=self.id,
|
||||
width=self.width,
|
||||
height=self.height,
|
||||
color=self.color,
|
||||
bg=self.bg,
|
||||
font=self.font,
|
||||
content=content,
|
||||
)
|
||||
|
||||
def render_word(self, text, tag, i):
|
||||
"""Render individual word.
|
||||
|
@ -82,12 +93,11 @@ class DependencyRenderer(object):
|
|||
i (int): Unique ID, typically word index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
"""
|
||||
y = self.offset_y+self.word_spacing
|
||||
x = self.offset_x+i*self.distance
|
||||
y = self.offset_y + self.word_spacing
|
||||
x = self.offset_x + i * self.distance
|
||||
html_text = escape_html(text)
|
||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||
|
||||
|
||||
def render_arrow(self, label, start, end, direction, i):
|
||||
"""Render indivicual arrow.
|
||||
|
||||
|
@ -98,20 +108,30 @@ class DependencyRenderer(object):
|
|||
i (int): Unique ID, typically arrow index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
"""
|
||||
level = self.levels.index(end-start)+1
|
||||
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
||||
level = self.levels.index(end - start) + 1
|
||||
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
||||
y = self.offset_y
|
||||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
||||
- self.arrow_spacing*(self.highest_level-level)/4)
|
||||
y_curve = self.offset_y-level*self.distance/2
|
||||
x_end = (
|
||||
self.offset_x
|
||||
+ (end - start) * self.distance
|
||||
+ start * self.distance
|
||||
- self.arrow_spacing * (self.highest_level - level) / 4
|
||||
)
|
||||
y_curve = self.offset_y - level * self.distance / 2
|
||||
if self.compact:
|
||||
y_curve = self.offset_y-level*self.distance/6
|
||||
y_curve = self.offset_y - level * self.distance / 6
|
||||
if y_curve == 0 and len(self.levels) > 5:
|
||||
y_curve = -self.distance
|
||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||
arc = self.get_arc(x_start, y, y_curve, x_end)
|
||||
return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke,
|
||||
head=arrowhead, label=label, arc=arc)
|
||||
return TPL_DEP_ARCS.format(
|
||||
id=self.id,
|
||||
i=i,
|
||||
stroke=self.arrow_stroke,
|
||||
head=arrowhead,
|
||||
label=label,
|
||||
arc=arc,
|
||||
)
|
||||
|
||||
def get_arc(self, x_start, y, y_curve, x_end):
|
||||
"""Render individual arc.
|
||||
|
@ -136,13 +156,22 @@ class DependencyRenderer(object):
|
|||
end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||
"""
|
||||
if direction == 'left':
|
||||
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
||||
if direction == "left":
|
||||
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||
else:
|
||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
|
||||
end-self.arrow_width+2)
|
||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
|
||||
y-self.arrow_width)
|
||||
pos1, pos2, pos3 = (
|
||||
end,
|
||||
end + self.arrow_width - 2,
|
||||
end - self.arrow_width + 2,
|
||||
)
|
||||
arrowhead = (
|
||||
pos1,
|
||||
y + 2,
|
||||
pos2,
|
||||
y - self.arrow_width,
|
||||
pos3,
|
||||
y - self.arrow_width,
|
||||
)
|
||||
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
||||
|
||||
def get_levels(self, arcs):
|
||||
|
@ -152,30 +181,44 @@ class DependencyRenderer(object):
|
|||
args (list): Individual arcs and their start, end, direction and label.
|
||||
RETURNS (list): Arc levels sorted from lowest to highest.
|
||||
"""
|
||||
levels = set(map(lambda arc: arc['end'] - arc['start'], arcs))
|
||||
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
|
||||
return sorted(list(levels))
|
||||
|
||||
|
||||
class EntityRenderer(object):
|
||||
"""Render named entities as HTML."""
|
||||
style = 'ent'
|
||||
|
||||
style = "ent"
|
||||
|
||||
def __init__(self, options={}):
|
||||
"""Initialise dependency renderer.
|
||||
|
||||
options (dict): Visualiser-specific options (colors, ents)
|
||||
"""
|
||||
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
||||
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
|
||||
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
|
||||
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
|
||||
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
||||
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
||||
colors.update(options.get('colors', {}))
|
||||
self.default_color = '#ddd'
|
||||
colors = {
|
||||
"ORG": "#7aecec",
|
||||
"PRODUCT": "#bfeeb7",
|
||||
"GPE": "#feca74",
|
||||
"LOC": "#ff9561",
|
||||
"PERSON": "#aa9cfc",
|
||||
"NORP": "#c887fb",
|
||||
"FACILITY": "#9cc9cc",
|
||||
"EVENT": "#ffeb80",
|
||||
"LAW": "#ff8197",
|
||||
"LANGUAGE": "#ff8197",
|
||||
"WORK_OF_ART": "#f0d0ff",
|
||||
"DATE": "#bfe1d9",
|
||||
"TIME": "#bfe1d9",
|
||||
"MONEY": "#e4e7d2",
|
||||
"QUANTITY": "#e4e7d2",
|
||||
"ORDINAL": "#e4e7d2",
|
||||
"CARDINAL": "#e4e7d2",
|
||||
"PERCENT": "#e4e7d2",
|
||||
}
|
||||
colors.update(options.get("colors", {}))
|
||||
self.default_color = "#ddd"
|
||||
self.colors = colors
|
||||
self.ents = options.get('ents', None)
|
||||
self.ents = options.get("ents", None)
|
||||
|
||||
def render(self, parsed, page=False, minify=False):
|
||||
"""Render complete markup.
|
||||
|
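Because the defaults above are merged with `options.get("colors")` and filtered by `options.get("ents")`, a caller can restyle or restrict the displayed labels per render call, for example (colour value is arbitrary):

# Override one colour and only display ORG and GPE entities.
options = {"colors": {"ORG": "#ffcc00"}, "ents": ["ORG", "GPE"]}
displacy.render(doc, style="ent", options=options)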
@ -185,14 +228,14 @@ class EntityRenderer(object):
|
|||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
"""
|
||||
rendered = [self.render_ents(p['text'], p['ents'],
|
||||
p.get('title', None)) for p in parsed]
|
||||
rendered = [
|
||||
self.render_ents(p["text"], p["ents"], p.get("title", None)) for p in parsed
|
||||
]
|
||||
if page:
|
||||
docs = ''.join([TPL_FIGURE.format(content=doc)
|
||||
for doc in rendered])
|
||||
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||
markup = TPL_PAGE.format(content=docs)
|
||||
else:
|
||||
markup = ''.join(rendered)
|
||||
markup = "".join(rendered)
|
||||
if minify:
|
||||
return minify_html(markup)
|
||||
return markup
|
||||
|
@ -204,18 +247,18 @@ class EntityRenderer(object):
|
|||
spans (list): Individual entity spans and their start, end and label.
|
||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
markup = ''
|
||||
markup = ""
|
||||
offset = 0
|
||||
for span in spans:
|
||||
label = span['label']
|
||||
start = span['start']
|
||||
end = span['end']
|
||||
label = span["label"]
|
||||
start = span["start"]
|
||||
end = span["end"]
|
||||
entity = text[start:end]
|
||||
fragments = text[offset:start].split('\n')
|
||||
fragments = text[offset:start].split("\n")
|
||||
for i, fragment in enumerate(fragments):
|
||||
markup += fragment
|
||||
if len(fragments) > 1 and i != len(fragments)-1:
|
||||
markup += '</br>'
|
||||
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||
markup += "</br>"
|
||||
if self.ents is None or label.upper() in self.ents:
|
||||
color = self.colors.get(label.upper(), self.default_color)
|
||||
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# setting explicit height and max-width: none on the SVG is required for
|
||||
# Setting explicit height and max-width: none on the SVG is required for
|
||||
# Jupyter to render it properly in a cell
|
||||
|
||||
TPL_DEP_SVG = """
|
||||
|
|
|
@ -8,13 +8,17 @@ import inspect
|
|||
|
||||
def add_codes(err_cls):
|
||||
"""Add error codes to string messages via class attribute names."""
|
||||
|
||||
class ErrorsWithCodes(object):
|
||||
def __getattribute__(self, code):
|
||||
msg = getattr(err_cls, code)
|
||||
return '[{code}] {msg}'.format(code=code, msg=msg)
|
||||
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||
|
||||
return ErrorsWithCodes()
|
||||
|
||||
|
||||
# fmt: off
|
||||
|
||||
@add_codes
|
||||
class Warnings(object):
|
||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
||||
|
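The `add_codes` decorator above builds the final message lazily on attribute access, prefixing it with the attribute name; for illustration (message text abridged):

from spacy.errors import Warnings

Warnings.W001  # -> "[W001] As of spaCy v2.0, the keyword argument `path=` is deprecated. ..."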
@ -260,7 +264,7 @@ class Errors(object):
|
|||
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
||||
"error. Are you writing to a default function argument?")
|
||||
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
||||
"Span objects, or dicts if set to manual=True.")
|
||||
"Span objects, or dicts if set to manual=True.")
|
||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||
"phrase pattern (string) but got:\n{pattern}")
|
||||
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
||||
|
@ -270,6 +274,19 @@ class Errors(object):
|
|||
"NBOR_RELOP.")
|
||||
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
||||
"have been declared in previous edges.")
|
||||
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of tokens to merge")
|
||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
||||
" can only be part of one entity, so make sure the entities you're "
|
||||
"setting don't overlap.")
|
||||
E104 = ("Can't find JSON schema for '{name}'.")
|
||||
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
||||
"Doc.json() instead.")
|
||||
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||
"settings: {opts}")
|
||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
||||
"in favor of the pipe name `sentencizer`, which does the same "
|
||||
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -289,55 +306,57 @@ class TempErrors(object):
|
|||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
||||
|
||||
|
||||
# fmt: on
|
||||
|
||||
|
||||
class ModelsWarning(UserWarning):
|
||||
pass
|
||||
|
||||
|
||||
WARNINGS = {
|
||||
'user': UserWarning,
|
||||
'deprecation': DeprecationWarning,
|
||||
'models': ModelsWarning,
|
||||
"user": UserWarning,
|
||||
"deprecation": DeprecationWarning,
|
||||
"models": ModelsWarning,
|
||||
}
|
||||
|
||||
|
||||
def _get_warn_types(arg):
|
||||
if arg == '': # don't show any warnings
|
||||
if arg == "": # don't show any warnings
|
||||
return []
|
||||
if not arg or arg == 'all': # show all available warnings
|
||||
if not arg or arg == "all": # show all available warnings
|
||||
return WARNINGS.keys()
|
||||
return [w_type.strip() for w_type in arg.split(',')
|
||||
if w_type.strip() in WARNINGS]
|
||||
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
||||
|
||||
|
||||
def _get_warn_excl(arg):
|
||||
if not arg:
|
||||
return []
|
||||
return [w_id.strip() for w_id in arg.split(',')]
|
||||
return [w_id.strip() for w_id in arg.split(",")]
|
||||
|
||||
|
||||
SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER')
|
||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
|
||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE'))
|
||||
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
|
||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
|
||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
||||
|
||||
|
||||
def user_warning(message):
|
||||
_warn(message, 'user')
|
||||
_warn(message, "user")
|
||||
|
||||
|
||||
def deprecation_warning(message):
|
||||
_warn(message, 'deprecation')
|
||||
_warn(message, "deprecation")
|
||||
|
||||
|
||||
def models_warning(message):
|
||||
_warn(message, 'models')
|
||||
_warn(message, "models")
|
||||
|
||||
|
||||
def _warn(message, warn_type='user'):
|
||||
def _warn(message, warn_type="user"):
|
||||
"""
|
||||
message (unicode): The message to display.
|
||||
category (Warning): The Warning to show.
|
||||
"""
|
||||
w_id = message.split('[', 1)[1].split(']', 1)[0] # get ID from string
|
||||
w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
|
||||
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
||||
category = WARNINGS[warn_type]
|
||||
stack = inspect.stack()[-1]
|
||||
|
|
|
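The warning filters above are configured entirely through environment variables that are read when this module is imported; a usage sketch (the values are examples):

# Silence deprecation warnings and one specific warning ID before importing spaCy.
import os

os.environ["SPACY_WARNING_TYPES"] = "user,models"  # drop the "deprecation" category
os.environ["SPACY_WARNING_IGNORE"] = "W006"        # ignore the "no entities to visualize" warning
import spacy  # noqa: E402  (import only after setting the variables)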
@ -21,294 +21,272 @@ GLOSSARY = {
|
|||
# POS tags
|
||||
# Universal POS Tags
|
||||
# http://universaldependencies.org/u/pos/
|
||||
|
||||
'ADJ': 'adjective',
|
||||
'ADP': 'adposition',
|
||||
'ADV': 'adverb',
|
||||
'AUX': 'auxiliary',
|
||||
'CONJ': 'conjunction',
|
||||
'CCONJ': 'coordinating conjunction',
|
||||
'DET': 'determiner',
|
||||
'INTJ': 'interjection',
|
||||
'NOUN': 'noun',
|
||||
'NUM': 'numeral',
|
||||
'PART': 'particle',
|
||||
'PRON': 'pronoun',
|
||||
'PROPN': 'proper noun',
|
||||
'PUNCT': 'punctuation',
|
||||
'SCONJ': 'subordinating conjunction',
|
||||
'SYM': 'symbol',
|
||||
'VERB': 'verb',
|
||||
'X': 'other',
|
||||
'EOL': 'end of line',
|
||||
'SPACE': 'space',
|
||||
|
||||
|
||||
"ADJ": "adjective",
|
||||
"ADP": "adposition",
|
||||
"ADV": "adverb",
|
||||
"AUX": "auxiliary",
|
||||
"CONJ": "conjunction",
|
||||
"CCONJ": "coordinating conjunction",
|
||||
"DET": "determiner",
|
||||
"INTJ": "interjection",
|
||||
"NOUN": "noun",
|
||||
"NUM": "numeral",
|
||||
"PART": "particle",
|
||||
"PRON": "pronoun",
|
||||
"PROPN": "proper noun",
|
||||
"PUNCT": "punctuation",
|
||||
"SCONJ": "subordinating conjunction",
|
||||
"SYM": "symbol",
|
||||
"VERB": "verb",
|
||||
"X": "other",
|
||||
"EOL": "end of line",
|
||||
"SPACE": "space",
|
||||
# POS tags (English)
|
||||
# OntoNotes 5 / Penn Treebank
|
||||
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
||||
|
||||
'.': 'punctuation mark, sentence closer',
|
||||
',': 'punctuation mark, comma',
|
||||
'-LRB-': 'left round bracket',
|
||||
'-RRB-': 'right round bracket',
|
||||
'``': 'opening quotation mark',
|
||||
'""': 'closing quotation mark',
|
||||
"''": 'closing quotation mark',
|
||||
':': 'punctuation mark, colon or ellipsis',
|
||||
'$': 'symbol, currency',
|
||||
'#': 'symbol, number sign',
|
||||
'AFX': 'affix',
|
||||
'CC': 'conjunction, coordinating',
|
||||
'CD': 'cardinal number',
|
||||
'DT': 'determiner',
|
||||
'EX': 'existential there',
|
||||
'FW': 'foreign word',
|
||||
'HYPH': 'punctuation mark, hyphen',
|
||||
'IN': 'conjunction, subordinating or preposition',
|
||||
'JJ': 'adjective',
|
||||
'JJR': 'adjective, comparative',
|
||||
'JJS': 'adjective, superlative',
|
||||
'LS': 'list item marker',
|
||||
'MD': 'verb, modal auxiliary',
|
||||
'NIL': 'missing tag',
|
||||
'NN': 'noun, singular or mass',
|
||||
'NNP': 'noun, proper singular',
|
||||
'NNPS': 'noun, proper plural',
|
||||
'NNS': 'noun, plural',
|
||||
'PDT': 'predeterminer',
|
||||
'POS': 'possessive ending',
|
||||
'PRP': 'pronoun, personal',
|
||||
'PRP$': 'pronoun, possessive',
|
||||
'RB': 'adverb',
|
||||
'RBR': 'adverb, comparative',
|
||||
'RBS': 'adverb, superlative',
|
||||
'RP': 'adverb, particle',
|
||||
'TO': 'infinitival to',
|
||||
'UH': 'interjection',
|
||||
'VB': 'verb, base form',
|
||||
'VBD': 'verb, past tense',
|
||||
'VBG': 'verb, gerund or present participle',
|
||||
'VBN': 'verb, past participle',
|
||||
'VBP': 'verb, non-3rd person singular present',
|
||||
'VBZ': 'verb, 3rd person singular present',
|
||||
'WDT': 'wh-determiner',
|
||||
'WP': 'wh-pronoun, personal',
|
||||
'WP$': 'wh-pronoun, possessive',
|
||||
'WRB': 'wh-adverb',
|
||||
'SP': 'space',
|
||||
'ADD': 'email',
|
||||
'NFP': 'superfluous punctuation',
|
||||
'GW': 'additional word in multi-word expression',
|
||||
'XX': 'unknown',
|
||||
'BES': 'auxiliary "be"',
|
||||
'HVS': 'forms of "have"',
|
||||
|
||||
|
||||
".": "punctuation mark, sentence closer",
|
||||
",": "punctuation mark, comma",
|
||||
"-LRB-": "left round bracket",
|
||||
"-RRB-": "right round bracket",
|
||||
"``": "opening quotation mark",
|
||||
'""': "closing quotation mark",
|
||||
"''": "closing quotation mark",
|
||||
":": "punctuation mark, colon or ellipsis",
|
||||
"$": "symbol, currency",
|
||||
"#": "symbol, number sign",
|
||||
"AFX": "affix",
|
||||
"CC": "conjunction, coordinating",
|
||||
"CD": "cardinal number",
|
||||
"DT": "determiner",
|
||||
"EX": "existential there",
|
||||
"FW": "foreign word",
|
||||
"HYPH": "punctuation mark, hyphen",
|
||||
"IN": "conjunction, subordinating or preposition",
|
||||
"JJ": "adjective",
|
||||
"JJR": "adjective, comparative",
|
||||
"JJS": "adjective, superlative",
|
||||
"LS": "list item marker",
|
||||
"MD": "verb, modal auxiliary",
|
||||
"NIL": "missing tag",
|
||||
"NN": "noun, singular or mass",
|
||||
"NNP": "noun, proper singular",
|
||||
"NNPS": "noun, proper plural",
|
||||
"NNS": "noun, plural",
|
||||
"PDT": "predeterminer",
|
||||
"POS": "possessive ending",
|
||||
"PRP": "pronoun, personal",
|
||||
"PRP$": "pronoun, possessive",
|
||||
"RB": "adverb",
|
||||
"RBR": "adverb, comparative",
|
||||
"RBS": "adverb, superlative",
|
||||
"RP": "adverb, particle",
|
||||
"TO": "infinitival to",
|
||||
"UH": "interjection",
|
||||
"VB": "verb, base form",
|
||||
"VBD": "verb, past tense",
|
||||
"VBG": "verb, gerund or present participle",
|
||||
"VBN": "verb, past participle",
|
||||
"VBP": "verb, non-3rd person singular present",
|
||||
"VBZ": "verb, 3rd person singular present",
|
||||
"WDT": "wh-determiner",
|
||||
"WP": "wh-pronoun, personal",
|
||||
"WP$": "wh-pronoun, possessive",
|
||||
"WRB": "wh-adverb",
|
||||
"SP": "space",
|
||||
"ADD": "email",
|
||||
"NFP": "superfluous punctuation",
|
||||
"GW": "additional word in multi-word expression",
|
||||
"XX": "unknown",
|
||||
"BES": 'auxiliary "be"',
|
||||
"HVS": 'forms of "have"',
|
||||
# POS Tags (German)
|
||||
# TIGER Treebank
|
||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||
|
||||
'$(': 'other sentence-internal punctuation mark',
|
||||
'$,': 'comma',
|
||||
'$.': 'sentence-final punctuation mark',
|
||||
'ADJA': 'adjective, attributive',
|
||||
'ADJD': 'adjective, adverbial or predicative',
|
||||
'APPO': 'postposition',
|
||||
'APPR': 'preposition; circumposition left',
|
||||
'APPRART': 'preposition with article',
|
||||
'APZR': 'circumposition right',
|
||||
'ART': 'definite or indefinite article',
|
||||
'CARD': 'cardinal number',
|
||||
'FM': 'foreign language material',
|
||||
'ITJ': 'interjection',
|
||||
'KOKOM': 'comparative conjunction',
|
||||
'KON': 'coordinate conjunction',
|
||||
'KOUI': 'subordinate conjunction with "zu" and infinitive',
|
||||
'KOUS': 'subordinate conjunction with sentence',
|
||||
'NE': 'proper noun',
|
||||
'NNE': 'proper noun',
|
||||
'PAV': 'pronominal adverb',
|
||||
'PROAV': 'pronominal adverb',
|
||||
'PDAT': 'attributive demonstrative pronoun',
|
||||
'PDS': 'substituting demonstrative pronoun',
|
||||
'PIAT': 'attributive indefinite pronoun without determiner',
|
||||
'PIDAT': 'attributive indefinite pronoun with determiner',
|
||||
'PIS': 'substituting indefinite pronoun',
|
||||
'PPER': 'non-reflexive personal pronoun',
|
||||
'PPOSAT': 'attributive possessive pronoun',
|
||||
'PPOSS': 'substituting possessive pronoun',
|
||||
'PRELAT': 'attributive relative pronoun',
|
||||
'PRELS': 'substituting relative pronoun',
|
||||
'PRF': 'reflexive personal pronoun',
|
||||
'PTKA': 'particle with adjective or adverb',
|
||||
'PTKANT': 'answer particle',
|
||||
'PTKNEG': 'negative particle',
|
||||
'PTKVZ': 'separable verbal particle',
|
||||
'PTKZU': '"zu" before infinitive',
|
||||
'PWAT': 'attributive interrogative pronoun',
|
||||
'PWAV': 'adverbial interrogative or relative pronoun',
|
||||
'PWS': 'substituting interrogative pronoun',
|
||||
'TRUNC': 'word remnant',
|
||||
'VAFIN': 'finite verb, auxiliary',
|
||||
'VAIMP': 'imperative, auxiliary',
|
||||
'VAINF': 'infinitive, auxiliary',
|
||||
'VAPP': 'perfect participle, auxiliary',
|
||||
'VMFIN': 'finite verb, modal',
|
||||
'VMINF': 'infinitive, modal',
|
||||
'VMPP': 'perfect participle, modal',
|
||||
'VVFIN': 'finite verb, full',
|
||||
'VVIMP': 'imperative, full',
|
||||
'VVINF': 'infinitive, full',
|
||||
'VVIZU': 'infinitive with "zu", full',
|
||||
'VVPP': 'perfect participle, full',
|
||||
'XY': 'non-word containing non-letter',
|
||||
|
||||
|
||||
"$(": "other sentence-internal punctuation mark",
|
||||
"$,": "comma",
|
||||
"$.": "sentence-final punctuation mark",
|
||||
"ADJA": "adjective, attributive",
|
||||
"ADJD": "adjective, adverbial or predicative",
|
||||
"APPO": "postposition",
|
||||
"APPR": "preposition; circumposition left",
|
||||
"APPRART": "preposition with article",
|
||||
"APZR": "circumposition right",
|
||||
"ART": "definite or indefinite article",
|
||||
"CARD": "cardinal number",
|
||||
"FM": "foreign language material",
|
||||
"ITJ": "interjection",
|
||||
"KOKOM": "comparative conjunction",
|
||||
"KON": "coordinate conjunction",
|
||||
"KOUI": 'subordinate conjunction with "zu" and infinitive',
|
||||
"KOUS": "subordinate conjunction with sentence",
|
||||
"NE": "proper noun",
|
||||
"NNE": "proper noun",
|
||||
"PAV": "pronominal adverb",
|
||||
"PROAV": "pronominal adverb",
|
||||
"PDAT": "attributive demonstrative pronoun",
|
||||
"PDS": "substituting demonstrative pronoun",
|
||||
"PIAT": "attributive indefinite pronoun without determiner",
|
||||
"PIDAT": "attributive indefinite pronoun with determiner",
|
||||
"PIS": "substituting indefinite pronoun",
|
||||
"PPER": "non-reflexive personal pronoun",
|
||||
"PPOSAT": "attributive possessive pronoun",
|
||||
"PPOSS": "substituting possessive pronoun",
|
||||
"PRELAT": "attributive relative pronoun",
|
||||
"PRELS": "substituting relative pronoun",
|
||||
"PRF": "reflexive personal pronoun",
|
||||
"PTKA": "particle with adjective or adverb",
|
||||
"PTKANT": "answer particle",
|
||||
"PTKNEG": "negative particle",
|
||||
"PTKVZ": "separable verbal particle",
|
||||
"PTKZU": '"zu" before infinitive',
|
||||
"PWAT": "attributive interrogative pronoun",
|
||||
"PWAV": "adverbial interrogative or relative pronoun",
|
||||
"PWS": "substituting interrogative pronoun",
|
||||
"TRUNC": "word remnant",
|
||||
"VAFIN": "finite verb, auxiliary",
|
||||
"VAIMP": "imperative, auxiliary",
|
||||
"VAINF": "infinitive, auxiliary",
|
||||
"VAPP": "perfect participle, auxiliary",
|
||||
"VMFIN": "finite verb, modal",
|
||||
"VMINF": "infinitive, modal",
|
||||
"VMPP": "perfect participle, modal",
|
||||
"VVFIN": "finite verb, full",
|
||||
"VVIMP": "imperative, full",
|
||||
"VVINF": "infinitive, full",
|
||||
"VVIZU": 'infinitive with "zu", full',
|
||||
"VVPP": "perfect participle, full",
|
||||
"XY": "non-word containing non-letter",
|
||||
# Noun chunks
|
||||
|
||||
'NP': 'noun phrase',
|
||||
'PP': 'prepositional phrase',
|
||||
'VP': 'verb phrase',
|
||||
'ADVP': 'adverb phrase',
|
||||
'ADJP': 'adjective phrase',
|
||||
'SBAR': 'subordinating conjunction',
|
||||
'PRT': 'particle',
|
||||
'PNP': 'prepositional noun phrase',
|
||||
|
||||
|
||||
"NP": "noun phrase",
|
||||
"PP": "prepositional phrase",
|
||||
"VP": "verb phrase",
|
||||
"ADVP": "adverb phrase",
|
||||
"ADJP": "adjective phrase",
|
||||
"SBAR": "subordinating conjunction",
|
||||
"PRT": "particle",
|
||||
"PNP": "prepositional noun phrase",
|
||||
# Dependency Labels (English)
|
||||
# ClearNLP / Universal Dependencies
|
||||
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
||||
|
||||
'acomp': 'adjectival complement',
|
||||
'advcl': 'adverbial clause modifier',
|
||||
'advmod': 'adverbial modifier',
|
||||
'agent': 'agent',
|
||||
'amod': 'adjectival modifier',
|
||||
'appos': 'appositional modifier',
|
||||
'attr': 'attribute',
|
||||
'aux': 'auxiliary',
|
||||
'auxpass': 'auxiliary (passive)',
|
||||
'cc': 'coordinating conjunction',
|
||||
'ccomp': 'clausal complement',
|
||||
'complm': 'complementizer',
|
||||
'conj': 'conjunct',
|
||||
'cop': 'copula',
|
||||
'csubj': 'clausal subject',
|
||||
'csubjpass': 'clausal subject (passive)',
|
||||
'dep': 'unclassified dependent',
|
||||
'det': 'determiner',
|
||||
'dobj': 'direct object',
|
||||
'expl': 'expletive',
|
||||
'hmod': 'modifier in hyphenation',
|
||||
'hyph': 'hyphen',
|
||||
'infmod': 'infinitival modifier',
|
||||
'intj': 'interjection',
|
||||
'iobj': 'indirect object',
|
||||
'mark': 'marker',
|
||||
'meta': 'meta modifier',
|
||||
'neg': 'negation modifier',
|
||||
'nmod': 'modifier of nominal',
|
||||
'nn': 'noun compound modifier',
|
||||
'npadvmod': 'noun phrase as adverbial modifier',
|
||||
'nsubj': 'nominal subject',
|
||||
'nsubjpass': 'nominal subject (passive)',
|
||||
'num': 'number modifier',
|
||||
'number': 'number compound modifier',
|
||||
'oprd': 'object predicate',
|
||||
'obj': 'object',
|
||||
'obl': 'oblique nominal',
|
||||
'parataxis': 'parataxis',
|
||||
'partmod': 'participial modifier',
|
||||
'pcomp': 'complement of preposition',
|
||||
'pobj': 'object of preposition',
|
||||
'poss': 'possession modifier',
|
||||
'possessive': 'possessive modifier',
|
||||
'preconj': 'pre-correlative conjunction',
|
||||
'prep': 'prepositional modifier',
|
||||
'prt': 'particle',
|
||||
'punct': 'punctuation',
|
||||
'quantmod': 'modifier of quantifier',
|
||||
'rcmod': 'relative clause modifier',
|
||||
'root': 'root',
|
||||
'xcomp': 'open clausal complement',
|
||||
|
||||
|
||||
"acomp": "adjectival complement",
|
||||
"advcl": "adverbial clause modifier",
|
||||
"advmod": "adverbial modifier",
|
||||
"agent": "agent",
|
||||
"amod": "adjectival modifier",
|
||||
"appos": "appositional modifier",
|
||||
"attr": "attribute",
|
||||
"aux": "auxiliary",
|
||||
"auxpass": "auxiliary (passive)",
|
||||
"cc": "coordinating conjunction",
|
||||
"ccomp": "clausal complement",
|
||||
"complm": "complementizer",
|
||||
"conj": "conjunct",
|
||||
"cop": "copula",
|
||||
"csubj": "clausal subject",
|
||||
"csubjpass": "clausal subject (passive)",
|
||||
"dep": "unclassified dependent",
|
||||
"det": "determiner",
|
||||
"dobj": "direct object",
|
||||
"expl": "expletive",
|
||||
"hmod": "modifier in hyphenation",
|
||||
"hyph": "hyphen",
|
||||
"infmod": "infinitival modifier",
|
||||
"intj": "interjection",
|
||||
"iobj": "indirect object",
|
||||
"mark": "marker",
|
||||
"meta": "meta modifier",
|
||||
"neg": "negation modifier",
|
||||
"nmod": "modifier of nominal",
|
||||
"nn": "noun compound modifier",
|
||||
"npadvmod": "noun phrase as adverbial modifier",
|
||||
"nsubj": "nominal subject",
|
||||
"nsubjpass": "nominal subject (passive)",
|
||||
"num": "number modifier",
|
||||
"number": "number compound modifier",
|
||||
"oprd": "object predicate",
|
||||
"obj": "object",
|
||||
"obl": "oblique nominal",
|
||||
"parataxis": "parataxis",
|
||||
"partmod": "participal modifier",
|
||||
"pcomp": "complement of preposition",
|
||||
"pobj": "object of preposition",
|
||||
"poss": "possession modifier",
|
||||
"possessive": "possessive modifier",
|
||||
"preconj": "pre-correlative conjunction",
|
||||
"prep": "prepositional modifier",
|
||||
"prt": "particle",
|
||||
"punct": "punctuation",
|
||||
"quantmod": "modifier of quantifier",
|
||||
"rcmod": "relative clause modifier",
|
||||
"root": "root",
|
||||
"xcomp": "open clausal complement",
|
||||
# Dependency labels (German)
|
||||
# TIGER Treebank
|
||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||
# currently missing: 'cc' (comparative complement) because of conflict
|
||||
# with English labels
|
||||
|
||||
'ac': 'adpositional case marker',
|
||||
'adc': 'adjective component',
|
||||
'ag': 'genitive attribute',
|
||||
'ams': 'measure argument of adjective',
|
||||
'app': 'apposition',
|
||||
'avc': 'adverbial phrase component',
|
||||
'cd': 'coordinating conjunction',
|
||||
'cj': 'conjunct',
|
||||
'cm': 'comparative conjunction',
|
||||
'cp': 'complementizer',
|
||||
'cvc': 'collocational verb construction',
|
||||
'da': 'dative',
|
||||
'dh': 'discourse-level head',
|
||||
'dm': 'discourse marker',
|
||||
'ep': 'expletive es',
|
||||
'hd': 'head',
|
||||
'ju': 'junctor',
|
||||
'mnr': 'postnominal modifier',
|
||||
'mo': 'modifier',
|
||||
'ng': 'negation',
|
||||
'nk': 'noun kernel element',
|
||||
'nmc': 'numerical component',
|
||||
'oa': 'accusative object',
|
||||
'oc': 'clausal object',
|
||||
'og': 'genitive object',
|
||||
'op': 'prepositional object',
|
||||
'par': 'parenthetical element',
|
||||
'pd': 'predicate',
|
||||
'pg': 'phrasal genitive',
|
||||
'ph': 'placeholder',
|
||||
'pm': 'morphological particle',
|
||||
'pnc': 'proper noun component',
|
||||
'rc': 'relative clause',
|
||||
're': 'repeated element',
|
||||
'rs': 'reported speech',
|
||||
'sb': 'subject',
|
||||
|
||||
|
||||
"ac": "adpositional case marker",
|
||||
"adc": "adjective component",
|
||||
"ag": "genitive attribute",
|
||||
"ams": "measure argument of adjective",
|
||||
"app": "apposition",
|
||||
"avc": "adverbial phrase component",
|
||||
"cd": "coordinating conjunction",
|
||||
"cj": "conjunct",
|
||||
"cm": "comparative conjunction",
|
||||
"cp": "complementizer",
|
||||
"cvc": "collocational verb construction",
|
||||
"da": "dative",
|
||||
"dh": "discourse-level head",
|
||||
"dm": "discourse marker",
|
||||
"ep": "expletive es",
|
||||
"hd": "head",
|
||||
"ju": "junctor",
|
||||
"mnr": "postnominal modifier",
|
||||
"mo": "modifier",
|
||||
"ng": "negation",
|
||||
"nk": "noun kernel element",
|
||||
"nmc": "numerical component",
|
||||
"oa": "accusative object",
|
||||
"oc": "clausal object",
|
||||
"og": "genitive object",
|
||||
"op": "prepositional object",
|
||||
"par": "parenthetical element",
|
||||
"pd": "predicate",
|
||||
"pg": "phrasal genitive",
|
||||
"ph": "placeholder",
|
||||
"pm": "morphological particle",
|
||||
"pnc": "proper noun component",
|
||||
"rc": "relative clause",
|
||||
"re": "repeated element",
|
||||
"rs": "reported speech",
|
||||
"sb": "subject",
|
||||
# Named Entity Recognition
|
||||
# OntoNotes 5
|
||||
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
||||
|
||||
'PERSON': 'People, including fictional',
|
||||
'NORP': 'Nationalities or religious or political groups',
|
||||
'FACILITY': 'Buildings, airports, highways, bridges, etc.',
|
||||
'ORG': 'Companies, agencies, institutions, etc.',
|
||||
'GPE': 'Countries, cities, states',
|
||||
'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
|
||||
'PRODUCT': 'Objects, vehicles, foods, etc. (not services)',
|
||||
'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
|
||||
'WORK_OF_ART': 'Titles of books, songs, etc.',
|
||||
'LAW': 'Named documents made into laws.',
|
||||
'LANGUAGE': 'Any named language',
|
||||
'DATE': 'Absolute or relative dates or periods',
|
||||
'TIME': 'Times smaller than a day',
|
||||
'PERCENT': 'Percentage, including "%"',
|
||||
'MONEY': 'Monetary values, including unit',
|
||||
'QUANTITY': 'Measurements, as of weight or distance',
|
||||
'ORDINAL': '"first", "second", etc.',
|
||||
'CARDINAL': 'Numerals that do not fall under another type',
|
||||
|
||||
|
||||
"PERSON": "People, including fictional",
|
||||
"NORP": "Nationalities or religious or political groups",
|
||||
"FACILITY": "Buildings, airports, highways, bridges, etc.",
|
||||
"FAC": "Buildings, airports, highways, bridges, etc.",
|
||||
"ORG": "Companies, agencies, institutions, etc.",
|
||||
"GPE": "Countries, cities, states",
|
||||
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
|
||||
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
|
||||
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
|
||||
"WORK_OF_ART": "Titles of books, songs, etc.",
|
||||
"LAW": "Named documents made into laws.",
|
||||
"LANGUAGE": "Any named language",
|
||||
"DATE": "Absolute or relative dates or periods",
|
||||
"TIME": "Times smaller than a day",
|
||||
"PERCENT": 'Percentage, including "%"',
|
||||
"MONEY": "Monetary values, including unit",
|
||||
"QUANTITY": "Measurements, as of weight or distance",
|
||||
"ORDINAL": '"first", "second", etc.',
|
||||
"CARDINAL": "Numerals that do not fall under another type",
|
||||
# Named Entity Recognition
|
||||
# Wikipedia
|
||||
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
||||
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
||||
|
||||
'PER': 'Named person or family.',
|
||||
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
|
||||
'products or works of art'),
|
||||
"PER": "Named person or family.",
|
||||
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
|
||||
}
|
||||
|
|
113
spacy/gold.pyx
113
spacy/gold.pyx
|
@ -15,7 +15,7 @@ import json
|
|||
|
||||
import ujson
|
||||
|
||||
from . import _align
|
||||
from . import _align
|
||||
from .syntax import nonproj
|
||||
from .tokens import Doc
|
||||
from .errors import Errors
|
||||
|
@ -172,7 +172,7 @@ class GoldCorpus(object):
|
|||
def dev_tuples(self):
|
||||
locs = (self.tmp_dir / 'dev').iterdir()
|
||||
yield from self.read_tuples(locs, limit=self.limit)
|
||||
|
||||
|
||||
@property
|
||||
def train_tuples(self):
|
||||
locs = (self.tmp_dir / 'train').iterdir()
|
||||
|
@ -271,6 +271,53 @@ def _corrupt(c, noise_level):
|
|||
return c.lower()
|
||||
|
||||
|
||||
def read_json_object(json_corpus_section):
|
||||
"""Take a list of JSON-formatted documents (e.g. from an already loaded
|
||||
training data file) and yield tuples in the GoldParse format.
|
||||
|
||||
json_corpus_section (list): The data.
|
||||
YIELDS (tuple): The reformatted data.
|
||||
"""
|
||||
for json_doc in json_corpus_section:
|
||||
tuple_doc = json_to_tuple(json_doc)
|
||||
for tuple_paragraph in tuple_doc:
|
||||
yield tuple_paragraph
|
||||
|
||||
|
||||
def json_to_tuple(doc):
|
||||
"""Convert an item in the JSON-formatted training data to the tuple format
|
||||
used by GoldParse.
|
||||
|
||||
doc (dict): One entry in the training data.
|
||||
YIELDS (tuple): The reformatted data.
|
||||
"""
|
||||
paragraphs = []
|
||||
for paragraph in doc['paragraphs']:
|
||||
sents = []
|
||||
for sent in paragraph['sentences']:
|
||||
words = []
|
||||
ids = []
|
||||
tags = []
|
||||
heads = []
|
||||
labels = []
|
||||
ner = []
|
||||
for i, token in enumerate(sent['tokens']):
|
||||
words.append(token['orth'])
|
||||
ids.append(i)
|
||||
tags.append(token.get('tag', '-'))
|
||||
heads.append(token.get('head', 0) + i)
|
||||
labels.append(token.get('dep', ''))
|
||||
# Ensure ROOT label is case-insensitive
|
||||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
sents.append([
|
||||
[ids, words, tags, heads, labels, ner],
|
||||
sent.get('brackets', [])])
|
||||
if sents:
|
||||
yield [paragraph.get('raw', None), sents]
|
||||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, limit=None):
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.is_dir():
|
||||
|
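The tuples yielded by `json_to_tuple` above pair each paragraph's raw text with per-sentence annotation lists; schematically (field order taken from the code, values invented):

# One yielded paragraph: [raw_text, [[[ids, words, tags, heads, labels, ner], brackets], ...]]
paragraph = [
    "I like London.",
    [
        [
            [
                [0, 1, 2, 3],                        # ids
                ["I", "like", "London", "."],        # words
                ["PRP", "VBP", "NNP", "."],          # tags
                [1, 1, 1, 1],                        # heads (absolute token indices)
                ["nsubj", "ROOT", "dobj", "punct"],  # labels
                ["O", "O", "U-GPE", "O"],            # ner (BILUO)
            ],
            [],                                      # brackets
        ]
    ],
]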
@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
|||
for doc in _json_iterate(loc):
|
||||
if docs_filter is not None and not docs_filter(doc):
|
||||
continue
|
||||
paragraphs = []
|
||||
for paragraph in doc['paragraphs']:
|
||||
sents = []
|
||||
for sent in paragraph['sentences']:
|
||||
words = []
|
||||
ids = []
|
||||
tags = []
|
||||
heads = []
|
||||
labels = []
|
||||
ner = []
|
||||
for i, token in enumerate(sent['tokens']):
|
||||
words.append(token['orth'])
|
||||
ids.append(i)
|
||||
tags.append(token.get('tag', '-'))
|
||||
heads.append(token.get('head', 0) + i)
|
||||
labels.append(token.get('dep', ''))
|
||||
# Ensure ROOT label is case-insensitive
|
||||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
sents.append([
|
||||
[ids, words, tags, heads, labels, ner],
|
||||
sent.get('brackets', [])])
|
||||
if sents:
|
||||
yield [paragraph.get('raw', None), sents]
|
||||
for json_tuple in json_to_tuple(doc):
|
||||
yield json_tuple
|
||||
|
||||
|
||||
def _json_iterate(loc):
|
||||
|
@ -573,32 +597,19 @@ cdef class GoldParse:
                self.c.sent_start[i] = 0


def docs_to_json(id, docs):
    '''Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command. Each Doc in the list will be interpreted as a
    paragraph.
    '''
def docs_to_json(docs, underscore=None):
    """Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command.

    docs (iterable / Doc): The Doc object(s) to convert.
    underscore (list): Optional list of string names of custom doc._.
        attributes. Attribute values need to be JSON-serializable. Values will
        be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
    RETURNS (list): The data in spaCy's JSON format.
    """
    if isinstance(docs, Doc):
        docs = [docs]
    json_doc = {'id': id, 'paragraphs': []}
    for i, doc in enumerate(docs):
        json_para = {'raw': doc.text, 'sentences': []}
        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
        for j, sent in enumerate(doc.sents):
            json_sent = {'tokens': [], 'brackets': []}
            for token in sent:
                json_token = {"id": token.i, "orth": token.text}
                if doc.is_tagged:
                    json_token['tag'] = token.tag_
                if doc.is_parsed:
                    json_token['head'] = token.head.i-token.i
                    json_token['dep'] = token.dep_
                json_token['ner'] = biluo_tags[token.i]
                json_sent['tokens'].append(json_token)
            json_para['sentences'].append(json_sent)
        json_doc['paragraphs'].append(json_para)
    return json_doc
    return [doc.to_json(underscore=underscore) for doc in docs]

def biluo_tags_from_offsets(doc, entities, missing='O'):
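A short usage sketch of the new docs_to_json signature (an editor's example, not part of the diff; it assumes a spaCy build containing this change, where the function lives in spacy.gold and Doc.to_json is available):

import spacy
from spacy.gold import docs_to_json

nlp = spacy.blank("en")
doc = nlp("This is a sentence.")
# One JSON-serializable dict per Doc; custom extension attributes could be
# included by passing their names via underscore=["my_attr"].
json_data = docs_to_json([doc])
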
@ -16,16 +16,18 @@ from ...util import update_exc, add_lookups

class ArabicDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'ar'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    lex_attr_getters[LANG] = lambda text: "ar"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES


class Arabic(Language):
    lang = 'ar'
    lang = "ar"
    Defaults = ArabicDefaults


__all__ = ['Arabic']
__all__ = ["Arabic"]

@ -10,11 +10,11 @@ Example sentences to test spaCy and its language models.

sentences = [
    "نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
    "أين تقع دمشق ؟"
    "أين تقع دمشق ؟",
    "كيف حالك ؟",
    "هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
    "ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
    "هل بالإمكان أن نلتقي غدا؟",
    "هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
    "كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم"
    "كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم",
]

@ -2,7 +2,8 @@
from __future__ import unicode_literals
from ...attrs import LIKE_NUM

_num_words = set("""
_num_words = set(
    """
صفر
واحد
إثنان

@ -52,9 +53,11 @@ _num_words = set("""
مليون
مليار
مليارات
""".split())
""".split()
)

_ordinal_words = set("""
_ordinal_words = set(
    """
اول
أول
حاد

@ -69,20 +72,21 @@ _ordinal_words = set("""
ثامن
تاسع
عاشر
""".split())
""".split()
)


def like_num(text):
    """
    check if text resembles a number
    Check if text resembles a number
    """
    if text.startswith(('+', '-', '±', '~')):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:

@ -92,6 +96,4 @@ def like_num(text):
    return False


LEX_ATTRS = {
    LIKE_NUM: like_num
}
LEX_ATTRS = {LIKE_NUM: like_num}

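A behavioural sketch of the like_num function above (an editor's example, not part of the diff; the expected values follow from the logic shown, assuming the module is importable as spacy.lang.ar.lex_attrs):

from spacy.lang.ar.lex_attrs import like_num

print(like_num("123"))      # True: plain digits
print(like_num("1,000.5"))  # True: separators are stripped before isdigit()
print(like_num("3/4"))      # True: numerator and denominator are both digits
print(like_num("واحد"))     # True: listed in _num_words above
print(like_num("كتاب"))     # False: not a number word
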
@ -1,15 +1,20 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import UNITS, ALPHA_UPPER

_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
             [r'(?<=[0-9])\+',
              # Arabic is written from Right-To-Left
              r'(?<=[0-9])(?:{})'.format(CURRENCY),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)])
_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + [
        r"(?<=[0-9])\+",
        # Arabic is written from Right-To-Left
        r"(?<=[0-9])(?:{})".format(CURRENCY),
        r"(?<=[0-9])(?:{})".format(UNITS),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
)

TOKENIZER_SUFFIXES = _suffixes

@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

STOP_WORDS = set("""
STOP_WORDS = set(
    """
من
نحو
لعل

@ -388,4 +389,5 @@ STOP_WORDS = set("""
وإن
ولو
يا
""".split())
""".split()
)

@ -1,21 +1,23 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
import re
from ...symbols import ORTH, LEMMA


_exc = {}

# time

# Time
for exc_data in [
    {LEMMA: "قبل الميلاد", ORTH: "ق.م"},
    {LEMMA: "بعد الميلاد", ORTH: "ب. م"},
    {LEMMA: "ميلادي", ORTH: ".م"},
    {LEMMA: "هجري", ORTH: ".هـ"},
    {LEMMA: "توفي", ORTH: ".ت"}]:
    {LEMMA: "توفي", ORTH: ".ت"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

# scientific abv.
# Scientific abv.
for exc_data in [
    {LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
    {LEMMA: "الشارح", ORTH: "الشـ"},

@ -28,20 +30,20 @@ for exc_data in [
    {LEMMA: "أنبأنا", ORTH: "أنا"},
    {LEMMA: "أخبرنا", ORTH: "نا"},
    {LEMMA: "مصدر سابق", ORTH: "م. س"},
    {LEMMA: "مصدر نفسه", ORTH: "م. ن"}]:
    {LEMMA: "مصدر نفسه", ORTH: "م. ن"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

# other abv.
# Other abv.
for exc_data in [
    {LEMMA: "دكتور", ORTH: "د."},
    {LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
    {LEMMA: "أستاذ", ORTH: "أ."},
    {LEMMA: "بروفيسور", ORTH: "ب."}]:
    {LEMMA: "بروفيسور", ORTH: "ب."},
]:
    _exc[exc_data[ORTH]] = [exc_data]

for exc_data in [
    {LEMMA: "تلفون", ORTH: "ت."},
    {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
    _exc[exc_data[ORTH]] = [exc_data]

TOKENIZER_EXCEPTIONS = _exc

@ -15,7 +15,7 @@ from ...util import update_exc

class BengaliDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'bn'
    lex_attr_getters[LANG] = lambda text: "bn"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS

@ -26,8 +26,8 @@ class BengaliDefaults(Language.Defaults):


class Bengali(Language):
    lang = 'bn'
    lang = "bn"
    Defaults = BengaliDefaults


__all__ = ['Bengali']
__all__ = ["Bengali"]

@ -13,11 +13,9 @@ LEMMA_RULES = {
        ["গাছা", ""],
        ["গাছি", ""],
        ["ছড়া", ""],

        ["কে", ""],
        ["ে", ""],
        ["তে", ""],

        ["র", ""],
        ["রা", ""],
        ["রে", ""],

@ -28,7 +26,6 @@ LEMMA_RULES = {
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],

        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],

@ -45,7 +42,6 @@ LEMMA_RULES = {
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""], # আবলি

        # Bengali digit representations
        ["০", "0"],
        ["১", "1"],

@ -58,11 +54,5 @@ LEMMA_RULES = {
        ["৮", "8"],
        ["৯", "9"],
    ],

    "punct": [
        ["“", "\""],
        ["”", "\""],
        ["\u2018", "'"],
        ["\u2019", "'"]
    ]
    "punct": [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]],
}

@ -5,64 +5,253 @@ from ...symbols import LEMMA, PRON_LEMMA
|
|||
|
||||
|
||||
MORPH_RULES = {
|
||||
"PRP": {
|
||||
'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
||||
'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
||||
'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'তুই': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
||||
'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'তোকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
||||
'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
|
||||
'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
||||
'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'তোরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'তোদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
||||
'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
||||
'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
||||
'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
||||
'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
|
||||
'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
||||
'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
|
||||
"PRP": {
|
||||
"ঐ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||
"আমাকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"কি": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Gender": "Neut",
|
||||
"PronType": "Int",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"সে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Three",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"কিসে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Gender": "Neut",
|
||||
"PronType": "Int",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"তাকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Three",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"স্বয়ং": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||
"কোনগুলো": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Gender": "Neut",
|
||||
"PronType": "Int",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"তুমি": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তুই": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তাদেরকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Three",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"আমরা": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "One ",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"যিনি": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||
"আমাদেরকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"কোন": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||
"কারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Int", "Case": "Acc"},
|
||||
"তোমাকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"তোকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"খোদ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||
"কে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||
"যারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Rel", "Case": "Nom"},
|
||||
"যে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||
"তোমরা": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তোরা": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তোমাদেরকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"তোদেরকে": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"আপন": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||
"এ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||
"নিজ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||
"কার": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||
"যা": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Gender": "Neut",
|
||||
"PronType": "Rel",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তারা": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Three",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"আমি": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Case": "Nom",
|
||||
},
|
||||
},
|
||||
"PRP$": {
|
||||
|
||||
'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'মোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'মোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'তোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
'তোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
'যাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
||||
}
|
||||
"আমার": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"মোর": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"মোদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তার": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Three",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তোমাদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"আমাদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "One",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তোমার": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তোর": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Sing",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"তাদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Three",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"কাদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"PronType": "Int",
|
||||
"Case": "Acc",
|
||||
},
|
||||
"তোদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"Person": "Two",
|
||||
"PronType": "Prs",
|
||||
"Poss": "Yes",
|
||||
"Case": "Nom",
|
||||
},
|
||||
"যাদের": {
|
||||
LEMMA: PRON_LEMMA,
|
||||
"Number": "Plur",
|
||||
"PronType": "Int",
|
||||
"Case": "Acc",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
|
@ -2,30 +2,45 @@
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, QUOTES, UNITS


_currency = r"\$|¢|£|€|¥|฿|৳"
_quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_quotes = QUOTES.replace("'", "")
_list_punct = LIST_PUNCT + "। ॥".strip().split()


_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_prefixes = [r"\+"] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS

_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_suffixes = (
    _list_punct
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{})".format(_currency),
        r"(?<=[0-9])(?:{})".format(UNITS),
        r"(?<=[{}(?:{})])\.".format(
            "|".join([ALPHA_LOWER, r"%²\-\)\]\+", QUOTES]), _currency
        ),
    ]
)

_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9{zero}-{nine}])[+\-\*^=](?=[0-9{zero}-{nine}-])".format(
            zero="০", nine="৯"
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[{h}](?={ae})".format(a=ALPHA, h=HYPHENS, ae="এ"),
        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
        r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
    ]
)


TOKENIZER_PREFIXES = _prefixes

@ -2,43 +2,45 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
STOP_WORDS = set("""
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
||||
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
|
||||
ইত্যাদি ইহা
|
||||
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
|
||||
ইত্যাদি ইহা
|
||||
উচিত উনি উপর উপরে উত্তর
|
||||
এ এঁদের এঁরা এই এক একই একজন একটা একটি একবার একে এখন এখনও এখানে এখানেই এটা এসো
|
||||
এটাই এটি এত এতটাই এতে এদের এবং এবার এমন এমনি এমনকি এর এরা এলো এস এসে
|
||||
ঐ
|
||||
ও ওঁদের ওঁর ওঁরা ওই ওকে ওখানে ওদের ওর ওরা
|
||||
এটাই এটি এত এতটাই এতে এদের এবং এবার এমন এমনি এমনকি এর এরা এলো এস এসে
|
||||
ঐ
|
||||
ও ওঁদের ওঁর ওঁরা ওই ওকে ওখানে ওদের ওর ওরা
|
||||
কখনও কত কথা কবে কয়েক কয়েকটি করছে করছেন করতে করবে করবেন করলে কয়েক কয়েকটি করিয়ে করিয়া করায়
|
||||
করলেন করা করাই করায় করার করি করিতে করিয়া করিয়ে করে করেই করেছিলেন করেছে করেছেন করেন কাউকে
|
||||
করলেন করা করাই করায় করার করি করিতে করিয়া করিয়ে করে করেই করেছিলেন করেছে করেছেন করেন কাউকে
|
||||
কাছ কাছে কাজ কাজে কারও কারণ কি কিংবা কিছু কিছুই কিন্তু কী কে কেউ কেউই কেন কোন কোনও কোনো কেমনে কোটি
|
||||
ক্ষেত্রে খুব
|
||||
ক্ষেত্রে খুব
|
||||
গিয়ে গিয়েছে গুলি গেছে গেল গেলে গোটা গিয়ে গিয়েছে
|
||||
চলে চান চায় চেয়ে চায় চেয়ে চার চালু চেষ্টা
|
||||
চলে চান চায় চেয়ে চায় চেয়ে চার চালু চেষ্টা
|
||||
ছাড়া ছাড়াও ছিল ছিলেন ছাড়া ছাড়াও
|
||||
জন জনকে জনের জন্য জন্যে জানতে জানা জানানো জানায় জানিয়ে জানিয়েছে জানায় জাানিয়ে জানিয়েছে
|
||||
টি
|
||||
ঠিক
|
||||
তখন তত তথা তবু তবে তা তাঁকে তাঁদের তাঁর তাঁরা তাঁহারা তাই তাও তাকে তাতে তাদের তার তারপর তারা তারই তাহলে তাহা তাহাতে তাহার তিনই
|
||||
টি
|
||||
ঠিক
|
||||
তখন তত তথা তবু তবে তা তাঁকে তাঁদের তাঁর তাঁরা তাঁহারা তাই তাও তাকে তাতে তাদের তার তারপর তারা তারই তাহলে তাহা তাহাতে তাহার তিনই
|
||||
তিনি তিনিও তুমি তুলে তেমন তো তোমার তুই তোরা তোর তোমাদের তোদের
|
||||
থাকবে থাকবেন থাকা থাকায় থাকে থাকেন থেকে থেকেই থেকেও থাকায়
|
||||
দিকে দিতে দিয়ে দিয়েছে দিয়েছেন দিলেন দিয়ে দু দুটি দুটো দেওয়া দেওয়ার দেখতে দেখা দেখে দেন দেয় দেশের
|
||||
দিকে দিতে দিয়ে দিয়েছে দিয়েছেন দিলেন দিয়ে দু দুটি দুটো দেওয়া দেওয়ার দেখতে দেখা দেখে দেন দেয় দেশের
|
||||
দ্বারা দিয়েছে দিয়েছেন দেয় দেওয়া দেওয়ার দিন দুই
|
||||
ধরা ধরে
|
||||
ধরা ধরে
|
||||
নয় না নাই নাকি নাগাদ নানা নিজে নিজেই নিজেদের নিজের নিতে নিয়ে নিয়ে নেই নেওয়া নেওয়ার নয় নতুন
|
||||
পক্ষে পর পরে পরেই পরেও পর্যন্ত পাওয়া পারি পারে পারেন পেয়ে প্রতি প্রভৃতি প্রায় পাওয়া পেয়ে প্রায় পাঁচ প্রথম প্রাথমিক
|
||||
ফলে ফিরে ফের
|
||||
ফলে ফিরে ফের
|
||||
বছর বদলে বরং বলতে বলল বললেন বলা বলে বলেছেন বলেন বসে বহু বা বাদে বার বিনা বিভিন্ন বিশেষ বিষয়টি বেশ ব্যবহার ব্যাপারে বক্তব্য বন বেশি
|
||||
ভাবে ভাবেই
|
||||
মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই মোদের মোর
|
||||
যখন যত যতটা যথেষ্ট যদি যদিও যা যাঁর যাঁরা যাওয়া যাওয়ার যাকে যাচ্ছে যাতে যাদের যান যাবে যায় যার যারা যায় যিনি যে যেখানে যেতে যেন
|
||||
যেমন
|
||||
রকম রয়েছে রাখা রেখে রয়েছে
|
||||
লক্ষ
|
||||
শুধু শুরু
|
||||
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
|
||||
ভাবে ভাবেই
|
||||
মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই মোদের মোর
|
||||
যখন যত যতটা যথেষ্ট যদি যদিও যা যাঁর যাঁরা যাওয়া যাওয়ার যাকে যাচ্ছে যাতে যাদের যান যাবে যায় যার যারা যায় যিনি যে যেখানে যেতে যেন
|
||||
যেমন
|
||||
রকম রয়েছে রাখা রেখে রয়েছে
|
||||
লক্ষ
|
||||
শুধু শুরু
|
||||
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
|
||||
হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার
|
||||
হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায়
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -6,72 +6,77 @@ from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
|
|||
|
||||
|
||||
TAG_MAP = {
|
||||
".": {POS: PUNCT, "PunctType": "peri"},
|
||||
",": {POS: PUNCT, "PunctType": "comm"},
|
||||
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
||||
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
||||
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
||||
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
":": {POS: PUNCT},
|
||||
"৳": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||
"CC": {POS: CONJ, "ConjType": "coor"},
|
||||
"CD": {POS: NUM, "NumType": "card"},
|
||||
"DT": {POS: DET},
|
||||
"EX": {POS: ADV, "AdvType": "ex"},
|
||||
"FW": {POS: X, "Foreign": "yes"},
|
||||
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
||||
"IN": {POS: ADP},
|
||||
"JJ": {POS: ADJ, "Degree": "pos"},
|
||||
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||
"LS": {POS: PUNCT, "NumType": "ord"},
|
||||
"MD": {POS: VERB, "VerbType": "mod"},
|
||||
"NIL": {POS: ""},
|
||||
"NN": {POS: NOUN, "Number": "sing"},
|
||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||
"NNS": {POS: NOUN, "Number": "plur"},
|
||||
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
||||
"POS": {POS: PART, "Poss": "yes"},
|
||||
"PRP": {POS: PRON, "PronType": "prs"},
|
||||
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
||||
"RB": {POS: ADV, "Degree": "pos"},
|
||||
"RBR": {POS: ADV, "Degree": "comp"},
|
||||
"RBS": {POS: ADV, "Degree": "sup"},
|
||||
"RP": {POS: PART},
|
||||
"SYM": {POS: SYM},
|
||||
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
||||
"UH": {POS: INTJ},
|
||||
"VB": {POS: VERB, "VerbForm": "inf"},
|
||||
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
||||
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
||||
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
||||
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
||||
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
|
||||
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
"ADP": {POS: ADP},
|
||||
"PRON": {POS: PRON},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PROPN": {POS: PROPN},
|
||||
"DET": {POS: DET},
|
||||
"SYM": {POS: SYM},
|
||||
"INTJ": {POS: INTJ},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"NUM": {POS: NUM},
|
||||
"AUX": {POS: AUX},
|
||||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
".": {POS: PUNCT, "PunctType": "peri"},
|
||||
",": {POS: PUNCT, "PunctType": "comm"},
|
||||
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
||||
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
||||
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
||||
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
":": {POS: PUNCT},
|
||||
"৳": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||
"CC": {POS: CONJ, "ConjType": "coor"},
|
||||
"CD": {POS: NUM, "NumType": "card"},
|
||||
"DT": {POS: DET},
|
||||
"EX": {POS: ADV, "AdvType": "ex"},
|
||||
"FW": {POS: X, "Foreign": "yes"},
|
||||
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
||||
"IN": {POS: ADP},
|
||||
"JJ": {POS: ADJ, "Degree": "pos"},
|
||||
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||
"LS": {POS: PUNCT, "NumType": "ord"},
|
||||
"MD": {POS: VERB, "VerbType": "mod"},
|
||||
"NIL": {POS: ""},
|
||||
"NN": {POS: NOUN, "Number": "sing"},
|
||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||
"NNS": {POS: NOUN, "Number": "plur"},
|
||||
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
||||
"POS": {POS: PART, "Poss": "yes"},
|
||||
"PRP": {POS: PRON, "PronType": "prs"},
|
||||
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
||||
"RB": {POS: ADV, "Degree": "pos"},
|
||||
"RBR": {POS: ADV, "Degree": "comp"},
|
||||
"RBS": {POS: ADV, "Degree": "sup"},
|
||||
"RP": {POS: PART},
|
||||
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
||||
"UH": {POS: INTJ},
|
||||
"VB": {POS: VERB, "VerbForm": "inf"},
|
||||
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
||||
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
||||
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
||||
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
||||
"VBZ": {
|
||||
POS: VERB,
|
||||
"VerbForm": "fin",
|
||||
"Tense": "pres",
|
||||
"Number": "sing",
|
||||
"Person": 3,
|
||||
},
|
||||
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
"ADP": {POS: ADP},
|
||||
"PRON": {POS: PRON},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PROPN": {POS: PROPN},
|
||||
"DET": {POS: DET},
|
||||
"SYM": {POS: SYM},
|
||||
"INTJ": {POS: INTJ},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"NUM": {POS: NUM},
|
||||
"AUX": {POS: AUX},
|
||||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
}
|
||||
|
|
|
@ -19,7 +19,8 @@ for exc_data in [
    {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
    {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
    {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

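A quick sketch of what these exceptions do at runtime (an editor's example, not part of the diff; assumes a blank Bengali pipeline built from this package):

import spacy

nlp = spacy.blank("bn")
doc = nlp("৫ কি.মি দূরে")
# "কি.মি" should remain a single token (with lemma "কিলোমিটার"), because the
# whitespace-delimited string matches the special case above exactly instead
# of being split at the internal full stop.
print([t.text for t in doc])
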
33
spacy/lang/ca/__init__.py
Normal file

@ -0,0 +1,33 @@
# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


class CatalanDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ca"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Catalan(Language):
    lang = "ca"
    Defaults = CatalanDefaults


__all__ = ["Catalan"]

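With this new package in place, the language can be instantiated like any other blank pipeline (an editor's sketch, not part of the diff; assumes a spaCy build that ships this ca subpackage):

import spacy

nlp = spacy.blank("ca")  # builds a Catalan Language object using CatalanDefaults
doc = nlp("Londres és una gran ciutat del Regne Unit")
print([token.text for token in doc])
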
22
spacy/lang/ca/examples.py
Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ca.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
    "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants",
    "San Francisco analitza prohibir els robots de repartiment",
    "Londres és una gran ciutat del Regne Unit",
    "El gat menja peix",
    "Veig a l'home amb el telescopi",
    "L'Aranya menja mosques",
    "El pingüí incuba en el seu niu",
]
Some files were not shown because too many files have changed in this diff.