Mirror of https://github.com/explosion/spaCy.git, synced 2025-02-03 21:24:11 +03:00

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in commit 4aa1002546.
.flake8 (12 changed lines)

@@ -1,4 +1,14 @@
 [flake8]
-ignore = E203, E266, E501, W503
+ignore = E203, E266, E501, E731, W503
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
+exclude =
+    .env,
+    .git,
+    __pycache__,
+    lemmatizer.py,
+    lookup.py,
+    _tokenizer_exceptions_list.py,
+    spacy/lang/fr/lemmatizer,
+    spacy/lang/nb/lemmatizer
+    spacy/__init__.py
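As a rough illustration (not taken from the spaCy code base), the sketch below shows the pattern that the newly ignored E731 check targets: binding a lambda to a name instead of defining a function with `def`. Under the updated config, flake8 would no longer report the first assignment.

```python
# E731 normally warns about assigning a lambda expression to a name.
# With `ignore = E203, E266, E501, E731, W503`, flake8 skips this check.

# Hypothetical example: a lambda bound to a name (what E731 would flag).
get_lower = lambda text: text.lower()

# The equivalent `def` form that E731 would otherwise ask for.
def get_lower_def(text):
    return text.lower()

if __name__ == "__main__":
    assert get_lower("SpaCy") == get_lower_def("SpaCy") == "spacy"
```

The added `exclude` block works alongside this, keeping generated or data-heavy modules such as `_tokenizer_exceptions_list.py` and the lemmatizer lookup files out of linting entirely.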
.github/ISSUE_TEMPLATE.md (vendored, 2 changed lines)

@@ -1,7 +1,7 @@
 <!--- Please provide a summary in the title and describe your issue here.
 Is this a bug or feature request? If a bug, include all the steps that led to the issue.

-If you're looking for help with your code, consider posting a question on StackOverflow instead:
+If you're looking for help with your code, consider posting a question on Stack Overflow instead:
 http://stackoverflow.com/questions/tagged/spacy -->
.github/ISSUE_TEMPLATE/05_other.md (vendored, 4 changed lines)

@@ -1,11 +1,11 @@
 ---
 name: "\U0001F4AC Anything else?"
 about: For general usage questions or help with your code, please consider
-  posting on StackOverflow instead.
+  posting on Stack Overflow instead.

 ---

-<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on StackOverflow instead: http://stackoverflow.com/questions/tagged/spacy -->
+<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on Stack Overflow instead: http://stackoverflow.com/questions/tagged/spacy -->

 ## Your Environment
 <!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
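The template's suggestion to run `python -m spacy info --markdown` can also be scripted. A minimal sketch, assuming spaCy is installed in the active environment, that captures the Markdown-formatted environment report:

```python
import subprocess
import sys

# Run the command suggested in the issue template and capture its Markdown
# output (assumes the `spacy` package is installed in this interpreter's env).
result = subprocess.run(
    [sys.executable, "-m", "spacy", "info", "--markdown"],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)
```

The output can then be pasted directly into the "Your Environment" section of an issue.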
.github/contributors/ALSchwalm.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made) will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” next to one of the applicable statements below. Please do
NOT mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
| ------------------------------ | --------------------- |
| Name                           | Adam Schwalm          |
| Company name (if applicable)   | Star Lab              |
| Title or role (if applicable)  | Software Engineer     |
| Date                           | 2018-11-28            |
| GitHub username                | ALSchwalm             |
| Website (optional)             | https://alschwalm.com |
.github/contributors/BramVanroy.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
| ------------------------------ | --------------------- |
| Name                           | Bram Vanroy           |
| Company name (if applicable)   |                       |
| Title or role (if applicable)  |                       |
| Date                           | October 19, 2018      |
| GitHub username                | BramVanroy            |
| Website (optional)             | https://bramvanroy.be |
.github/contributors/Cinnamy.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry          |
| ------------------------------ | -------------- |
| Name                           | Marina Lysyuk  |
| Company name (if applicable)   |                |
| Title or role (if applicable)  |                |
| Date                           | 13.10.2018     |
| GitHub username                | Cinnamy        |
| Website (optional)             |                |
.github/contributors/JKhakpour.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry           |
| ------------------------------ | --------------- |
| Name                           | Ja'far Khakpour |
| Company name (if applicable)   |                 |
| Title or role (if applicable)  |                 |
| Date                           | 2018-09-24      |
| GitHub username                | JKhakpour       |
| Website (optional)             |                 |
.github/contributors/aniruddha-adhikary.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Aniruddha Adhikary   |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-09-05           |
| GitHub username                | aniruddha-adhikary   |
| Website (optional)             | https://adhikary.net |
.github/contributors/aongko.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry               |
| ------------------------------ | ------------------- |
| Name                           | Andrew Ongko        |
| Company name (if applicable)   | Kurio               |
| Title or role (if applicable)  | Senior Data Science |
| Date                           | Sep 10, 2018        |
| GitHub username                | aongko              |
| Website (optional)             |                     |
.github/contributors/aryaprabhudesai.md (vendored, new file, 54 lines)

@@ -0,0 +1,54 @@
spaCy contributor agreement

(A plain-text copy of the standard SCA text shown in ALSchwalm.md above, without Markdown formatting or numbering.)

[X] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

Contributor Details

Field                           Entry
Name                            Arya Prabhudesai
Company name (if applicable)    -
Title or role (if applicable)   -
Date                            2018-08-17
GitHub username                 aryaprabhudesai
Website (optional)              -
.github/contributors/charlax.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry             |
| ------------------------------ | ----------------- |
| Name                           | Charles-Axel Dein |
| Company name (if applicable)   | Skrib             |
| Title or role (if applicable)  | CEO               |
| Date                           | 27/09/2018        |
| GitHub username                | charlax           |
| Website (optional)             | www.dein.fr       |
.github/contributors/cicorias.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [X] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                       |
| ------------------------------ | --------------------------- |
| Name                           | Shawn Cicoria               |
| Company name (if applicable)   | Microsoft                   |
| Title or role (if applicable)  | Principal Software Engineer |
| Date                           | November 20, 2018           |
| GitHub username                | cicorias                    |
| Website (optional)             | www.cicoria.com             |
106
.github/contributors/darindf.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Darin DeForest |
| Company name (if applicable) | Ipro Tech |
| Title or role (if applicable) | Senior Software Engineer |
| Date | 2018-09-26 |
| GitHub username | darindf |
| Website (optional) | |
106
.github/contributors/filipecaixeta.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Filipe Caixeta |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09.12.2018 |
| GitHub username | filipecaixeta |
| Website (optional) | filipecaixeta.com.br |
106
.github/contributors/frascuchon.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Francisco Aranda |
| Company name (if applicable) | recognai |
| Title or role (if applicable) | |
| Date | |
| GitHub username | frascuchon |
| Website (optional) | https://recogn.ai |
106
.github/contributors/free-variation.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | John Stewart |
| Company name (if applicable) | Amplify |
| Title or role (if applicable) | SVP Research |
| Date | 14/09/2018 |
| GitHub username | free-variation |
| Website (optional) | |
106
.github/contributors/gavrieltal.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Gavriel Loria |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Nov 29, 2018 |
| GitHub username | gavrieltal |
| Website (optional) | |
106
.github/contributors/grivaz.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | C. Grivaz |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 08.22.2018 |
| GitHub username | grivaz |
| Website (optional) | |
106
.github/contributors/jacopofar.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [X] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Jacopo Farina |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2018-10-12 |
| GitHub username | jacopofar |
| Website (optional) | jacopofarina.eu |
106
.github/contributors/keshan.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Keshan Sodimana |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Sep 21, 2018 |
| GitHub username | keshan |
| Website (optional) | |
106
.github/contributors/mbkupfer.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Maxim Kupfer |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Sep 6, 2018 |
| GitHub username | mbkupfer |
| Website (optional) | |
106 .github/contributors/mikelibg.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Michael Liberman     |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-11-08           |
| GitHub username                | mikelibg             |
| Website (optional)             |                      |
106 .github/contributors/mpuig.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Marc Puig            |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-11-17           |
| GitHub username                | mpuig                |
| Website (optional)             |                      |
106 .github/contributors/phojnacki.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                                 |
| ------------------------------ | ------------------------------------- |
| Name                           | Przemysław Hojnacki                   |
| Company name (if applicable)   |                                       |
| Title or role (if applicable)  |                                       |
| Date                           | 12/09/2018                            |
| GitHub username                | phojnacki                             |
| Website (optional)             | https://about.me/przemyslaw.hojnacki  |
106 .github/contributors/pzelasko.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Piotr Żelasko        |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 04-09-2018           |
| GitHub username                | pzelasko             |
| Website (optional)             |                      |
106 .github/contributors/sainathadapa.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Sainath Adapa        |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-09-06           |
| GitHub username                | sainathadapa         |
| Website (optional)             |                      |
106 .github/contributors/tyburam.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Mateusz Tybura       |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 08.09.2018           |
| GitHub username                | tyburam              |
| Website (optional)             |                      |
102 CONTRIBUTING.md

@@ -26,7 +26,7 @@ also check the [troubleshooting guide](https://spacy.io/usage/#troubleshooting)
to see if your problem is already listed there.

If you're looking for help with your code, consider posting a question on
[StackOverflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
[Stack Overflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
tag it `spacy` and `python`, more people will see it and hopefully be able to
help. Please understand that we won't be able to provide individual support via
email. We also believe that help is much more valuable if it's **shared publicly**,
@@ -186,13 +186,99 @@ sure your test passes and reference the issue in your commit message.

## Code conventions

Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).

Regular line length is **80 characters**, with some tolerance for lines up to 90 characters if the alternative would be worse — for instance, if your list comprehension comes to 82 characters, it's better not to split it over two lines. You can also use a linter like [`flake8`](https://pypi.python.org/pypi/flake8) or [`frosted`](https://pypi.python.org/pypi/frosted) – just keep in mind that it won't work very well for `.pyx` files and will complain about Cython syntax like `<int*>` or `cimport`.

As of `v2.1.0`, spaCy uses [`black`](https://github.com/ambv/black) for code formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed.

**⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
### Code formatting

[`black`](https://github.com/ambv/black) is an opinionated Python code formatter, optimised to produce readable code and small diffs. You can run `black` from the command-line, or via your code editor. For example, if you're using [Visual Studio Code](https://code.visualstudio.com/), you can add the following to your `settings.json` to use `black` for formatting and auto-format your files on save:

```json
{
    "python.formatting.provider": "black",
    "[python]": {
        "editor.formatOnSave": true
    }
}
```

[See here](https://github.com/ambv/black#editor-integration) for the full list of available editor integrations.
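If you'd rather not rely on editor integration, a small helper along these lines works too. This is only a sketch: it assumes `black` is installed and on your PATH, and the script name `check_format.py` is made up, not part of the repo.

```python
# check_format.py - illustrative sketch, not part of the repository
import subprocess
import sys


def run_black(path="spacy", check_only=False):
    """Format `path` with black, or only report what would change."""
    cmd = ["black", path]
    if check_only:
        cmd.append("--check")  # non-zero exit code if any file would be reformatted
    return subprocess.run(cmd).returncode


if __name__ == "__main__":
    sys.exit(run_black(check_only="--check" in sys.argv))
```

Running it with `--check` reports the files that would change without touching them, which can be handy before opening a pull request.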
#### Disabling formatting

There are a few cases where auto-formatting doesn't improve readability – for example, in some of the language data files like the `tag_map.py`, or in the tests that construct `Doc` objects from lists of words and other labels. Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting for that particular code. Here's an example:

```python
# fmt: off
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
        "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
        "poss", "nsubj", "ccomp", "punct"]
# fmt: on
```
### Code linting

[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code style. It scans one or more files and outputs errors and warnings. This feedback can help you stick to general standards and conventions, and can be very useful for spotting potential mistakes and inconsistencies in your code. The most important things to watch out for are syntax errors and undefined names, but you also want to keep an eye on unused declared variables or repeated (i.e. overwritten) dictionary keys. If your code was formatted with `black` (see above), you shouldn't see any formatting-related warnings.
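As a purely hypothetical illustration of the kinds of problems `flake8` reports (none of these names come from the codebase, and `doc` is assumed to be a processed spaCy `Doc`):

```python
import os  # flagged as an unused import (F401), since `os` is never referenced


def count_entities(doc):
    unused_total = 0  # assigned but never used, another typical warning
    counts = {
        "ORG": 0,
        "ORG": 1,  # repeated dictionary key: the first value is silently overwritten
    }
    for ent in doc.ents:
        counts[ent.label_] = counts.get(ent.label_, 0) + 1
    return counts
```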
The [`.flake8`](.flake8) config defines the configuration we use for this codebase. For example, we're not super strict about the line length, and we're excluding very large files like lemmatization and tokenizer exception tables.

Ideally, running the following command from within the repo directory should not return any errors or warnings:

```bash
flake8 spacy
```
#### Disabling linting

Sometimes, you explicitly want to write code that's not compatible with our rules. For example, a module's `__init__.py` might import a function so other modules can import it from there, but `flake8` will complain about an unused import. And although it's generally discouraged, there might be cases where it makes sense to use a bare `except`.

To ignore a given line, you can add a comment like `# noqa: F401`, specifying the code of the error or warning we want to ignore. It's also possible to ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. Here are some examples:

```python
# The imported class isn't used in this file, but imported here, so it can be
# imported *from* here by another module.
from .submodule import SomeClass  # noqa: F401

try:
    do_something()
except:  # noqa: E722
    # This bare except is justified, for some specific reason
    do_something_else()
```
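One more concrete case for the codes mentioned above: E731 fires when a `lambda` is assigned to a name, and can be silenced on a single line. The `get_label` helper here is made up, purely for illustration:

```python
# Silences only the lambda-assignment warning on this line; token.ent_type_ is a spaCy attribute.
get_label = lambda token: token.ent_type_  # noqa: E731
```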
### Python conventions

@@ -1,83 +0,0 @@
# 👥 Contributors

This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work!

* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer)
* Alexey Kim, [@yuukos](https://github.com/yuukos)
* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman)
* Ali Zarezade, [@azarezade](https://github.com/azarezade)
* Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv)
* Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th)
* Aniruddha Adhikary, [@aniruddha-adhikary](https://github.com/aniruddha-adhikary)
* Anto Binish Kaspar, [@binishkaspar](https://github.com/binishkaspar)
* Avadh Patel, [@avadhpatel](https://github.com/avadhpatel)
* Ben Eyal, [@beneyal](https://github.com/beneyal)
* Bhargav Srinivasa, [@bhargavvader](https://github.com/bhargavvader)
* Bruno P. Kinoshita, [@kinow](https://github.com/kinow)
* Canbey Bilgili, [@cbilgili](https://github.com/cbilgili)
* Chris DuBois, [@chrisdubois](https://github.com/chrisdubois)
* Christoph Schwienheer, [@chssch](https://github.com/chssch)
* Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
* Daniel Rapp, [@rappdw](https://github.com/rappdw)
* Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo)
* Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
* Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
* Francisco Aranda, [@frascuchon](https://github.com/frascuchon)
* Greg Baker, [@solresol](https://github.com/solresol)
* Greg Dubbin, [@GregDubbin](https://github.com/GregDubbin)
* Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard)
* György Orosz, [@oroszgy](https://github.com/oroszgy)
* Henning Peters, [@henningpeters](https://github.com/henningpeters)
* Iddo Berger, [@iddoberger](https://github.com/iddoberger)
* Ines Montani, [@ines](https://github.com/ines)
* J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading)
* Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan)
* Jim Geovedi, [@geovedi](https://github.com/geovedi)
* Jim Regan, [@jimregan](https://github.com/jimregan)
* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG)
* Jordan Suchow, [@suchow](https://github.com/suchow)
* Josh Reeter, [@jreeter](https://github.com/jreeter)
* Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks)
* Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
* Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
* Leif Uwe Vogelsang, [@luvogels](https://github.com/luvogels)
* Liling Tan, [@alvations](https://github.com/alvations)
* Magnus Burton, [@magnusburton](https://github.com/magnusburton)
* Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage)
* Matthew Honnibal, [@honnibal](https://github.com/honnibal)
* Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
* Michael Wallin, [@wallinm1](https://github.com/wallinm1)
* Miguel Almeida, [@mamoit](https://github.com/mamoit)
* Motoki Wu, [@tokestermw](https://github.com/tokestermw)
* Ole Henrik Skogstrøm, [@ohenrik](https://github.com/ohenrik)
* Oleg Zd, [@olegzd](https://github.com/olegzd)
* Orhan Bilgin, [@melanuria](https://github.com/melanuria)
* Orion Montoya, [@mdcclv](https://github.com/mdcclv)
* Paul O'Leary McCann, [@polm](https://github.com/polm)
* Pokey Rule, [@pokey](https://github.com/pokey)
* Ramanan Balakrishnan, [@ramananbalakrishnan](https://github.com/ramananbalakrishnan)
* Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202)
* Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort)
* Roman Domrachev, [@ligser](https://github.com/ligser)
* Roman Inflianskas, [@rominf](https://github.com/rominf)
* Sam Bozek, [@sambozek](https://github.com/sambozek)
* Sasho Savkov, [@savkov](https://github.com/savkov)
* Shuvanon Razik, [@shuvanon](https://github.com/shuvanon)
* Søren Lind Kristiansen, [@sorenlind](https://github.com/sorenlind)
* Swier, [@swierh](https://github.com/swierh)
* Thomas Tanon, [@Tpt](https://github.com/Tpt)
* Thomas Opsomer, [@thomasopsomer](https://github.com/thomasopsomer)
* Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues)
* Vadim Mazaev, [@GreenRiverRUS](https://github.com/GreenRiverRUS)
* Vimos Tan, [@Vimos](https://github.com/Vimos)
* Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov)
* Wah Loon Keng, [@kengz](https://github.com/kengz)
* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom)
* Willem van Hage, [@wrvhage](https://github.com/wrvhage)
* Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
* Yam, [@hscspring](https://github.com/hscspring)
* Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh)
* Yubing Dong, [@tomtung](https://github.com/tomtung)
* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter)
@@ -35,41 +35,49 @@ import subprocess
import argparse


HASH_FILE = 'cythonize.json'
HASH_FILE = "cythonize.json"


def process_pyx(fromfile, tofile, language_level='-2'):
def process_pyx(fromfile, tofile, language_level="-2"):
    print('Processing %s' % fromfile)
    print("Processing %s" % fromfile)
    try:
        from Cython.Compiler.Version import version as cython_version
        from distutils.version import LooseVersion
        if LooseVersion(cython_version) < LooseVersion('0.19'):
            raise Exception('Require Cython >= 0.19')
        if LooseVersion(cython_version) < LooseVersion("0.19"):
            raise Exception("Require Cython >= 0.19")

    except ImportError:
        pass

    flags = ['--fast-fail', language_level]
    flags = ["--fast-fail", language_level]
    if tofile.endswith('.cpp'):
    if tofile.endswith(".cpp"):
        flags += ['--cplus']
        flags += ["--cplus"]

    try:
        try:
            r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile],
                                env=os.environ)  # See Issue #791
            r = subprocess.call(
                ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
            )  # See Issue #791
            if r != 0:
                raise Exception('Cython failed')
                raise Exception("Cython failed")
        except OSError:
            # There are ways of installing Cython that don't result in a cython
            # executable on the path, see gh-2397.
            r = subprocess.call([sys.executable, '-c',
                'import sys; from Cython.Compiler.Main import '
                'setuptools_main as main; sys.exit(main())'] + flags +
                ['-o', tofile, fromfile])
            r = subprocess.call(
                [
                    sys.executable,
                    "-c",
                    "import sys; from Cython.Compiler.Main import "
                    "setuptools_main as main; sys.exit(main())",
                ]
                + flags
                + ["-o", tofile, fromfile]
            )
            if r != 0:
                raise Exception('Cython failed')
                raise Exception("Cython failed")
        except OSError:
            raise OSError('Cython needs to be installed')
            raise OSError("Cython needs to be installed")


def preserve_cwd(path, func, *args):

@@ -89,12 +97,12 @@ def load_hashes(filename):


def save_hashes(hash_db, filename):
    with open(filename, 'w') as f:
    with open(filename, "w") as f:
        f.write(json.dumps(hash_db))


def get_hash(path):
    return hashlib.md5(open(path, 'rb').read()).hexdigest()
    return hashlib.md5(open(path, "rb").read()).hexdigest()


def hash_changed(base, path, db):

@@ -109,25 +117,27 @@ def hash_add(base, path, db):


def process(base, filename, db):
    root, ext = os.path.splitext(filename)
    if ext in ['.pyx', '.cpp']:
    if ext in [".pyx", ".cpp"]:
        if hash_changed(base, filename, db) or not os.path.isfile(os.path.join(base, root + '.cpp')):
            preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
            hash_add(base, root + '.cpp', db)
            hash_add(base, root + '.pyx', db)
        if hash_changed(base, filename, db) or not os.path.isfile(
            os.path.join(base, root + ".cpp")
        ):
            preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
            hash_add(base, root + ".cpp", db)
            hash_add(base, root + ".pyx", db)


def check_changes(root, db):
    res = False
    new_db = {}

    setup_filename = 'setup.py'
    setup_filename = "setup.py"
    hash_add('.', setup_filename, new_db)
    hash_add(".", setup_filename, new_db)
    if hash_changed('.', setup_filename, db):
    if hash_changed(".", setup_filename, db):
        res = True

    for base, _, files in os.walk(root):
        for filename in files:
            if filename.endswith('.pxd'):
            if filename.endswith(".pxd"):
                hash_add(base, filename, new_db)
                if hash_changed(base, filename, db):
                    res = True

@@ -150,8 +160,10 @@ def run(root):
    save_hashes(db, HASH_FILE)


if __name__ == '__main__':
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed')
    parser.add_argument('root', help='root directory')
    parser = argparse.ArgumentParser(
        description="Cythonize pyx files into C++ files as needed"
    )
    parser.add_argument("root", help="root directory")
    args = parser.parse_args()
    run(args.root)
@ -15,12 +15,13 @@ _unset = object()
|
||||||
|
|
||||||
class Reddit(object):
|
class Reddit(object):
|
||||||
"""Stream cleaned comments from Reddit."""
|
"""Stream cleaned comments from Reddit."""
|
||||||
pre_format_re = re.compile(r'^[\`\*\~]')
|
|
||||||
post_format_re = re.compile(r'[\`\*\~]$')
|
|
||||||
url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
|
|
||||||
link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
|
|
||||||
|
|
||||||
def __init__(self, file_path, meta_keys={'subreddit': 'section'}):
|
pre_format_re = re.compile(r"^[\`\*\~]")
|
||||||
|
post_format_re = re.compile(r"[\`\*\~]$")
|
||||||
|
url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
|
||||||
|
link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")
|
||||||
|
|
||||||
|
def __init__(self, file_path, meta_keys={"subreddit": "section"}):
|
||||||
"""
|
"""
|
||||||
file_path (unicode / Path): Path to archive or directory of archives.
|
file_path (unicode / Path): Path to archive or directory of archives.
|
||||||
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
||||||
|
@ -45,28 +46,30 @@ class Reddit(object):
|
||||||
continue
|
continue
|
||||||
comment = ujson.loads(line)
|
comment = ujson.loads(line)
|
||||||
if self.is_valid(comment):
|
if self.is_valid(comment):
|
||||||
text = self.strip_tags(comment['body'])
|
text = self.strip_tags(comment["body"])
|
||||||
yield {'text': text}
|
yield {"text": text}
|
||||||
|
|
||||||
def get_meta(self, item):
|
def get_meta(self, item):
|
||||||
return {name: item.get(key, 'n/a') for key, name in self.meta.items()}
|
return {name: item.get(key, "n/a") for key, name in self.meta.items()}
|
||||||
|
|
||||||
def iter_files(self):
|
def iter_files(self):
|
||||||
for file_path in self.files:
|
for file_path in self.files:
|
||||||
yield file_path
|
yield file_path
|
||||||
|
|
||||||
def strip_tags(self, text):
|
def strip_tags(self, text):
|
||||||
text = self.link_re.sub(r'\1', text)
|
text = self.link_re.sub(r"\1", text)
|
||||||
text = text.replace('>', '>').replace('<', '<')
|
text = text.replace(">", ">").replace("<", "<")
|
||||||
text = self.pre_format_re.sub('', text)
|
text = self.pre_format_re.sub("", text)
|
||||||
text = self.post_format_re.sub('', text)
|
text = self.post_format_re.sub("", text)
|
||||||
text = re.sub(r'\s+', ' ', text)
|
text = re.sub(r"\s+", " ", text)
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
def is_valid(self, comment):
|
def is_valid(self, comment):
|
||||||
return comment['body'] is not None \
|
return (
|
||||||
and comment['body'] != '[deleted]' \
|
comment["body"] is not None
|
||||||
and comment['body'] != '[removed]'
|
and comment["body"] != "[deleted]"
|
||||||
|
and comment["body"] != "[removed]"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main(path):
|
def main(path):
|
||||||
|
@ -75,16 +78,18 @@ def main(path):
|
||||||
print(ujson.dumps(comment))
|
print(ujson.dumps(comment))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
import socket
|
import socket
|
||||||
|
|
||||||
try:
|
try:
|
||||||
BrokenPipeError
|
BrokenPipeError
|
||||||
except NameError:
|
except NameError:
|
||||||
BrokenPipeError = socket.error
|
BrokenPipeError = socket.error
|
||||||
try:
|
try:
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
except BrokenPipeError:
|
except BrokenPipeError:
|
||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
# Python flushes standard streams on exit; redirect remaining output
|
# Python flushes standard streams on exit; redirect remaining output
|
||||||
# to devnull to avoid another BrokenPipeError at shutdown
|
# to devnull to avoid another BrokenPipeError at shutdown
|
||||||
devnull = os.open(os.devnull, os.O_WRONLY)
|
devnull = os.open(os.devnull, os.O_WRONLY)
|
||||||
|
|
|
@ -7,6 +7,7 @@ git diff-index --quiet HEAD
|
||||||
|
|
||||||
git checkout $1
|
git checkout $1
|
||||||
git pull origin $1
|
git pull origin $1
|
||||||
|
|
||||||
version=$(grep "__version__ = " spacy/about.py)
|
version=$(grep "__version__ = " spacy/about.py)
|
||||||
version=${version/__version__ = }
|
version=${version/__version__ = }
|
||||||
version=${version/\'/}
|
version=${version/\'/}
|
||||||
|
|
|
@ -92,11 +92,13 @@ def get_features(docs, max_length):
|
||||||
def train(train_texts, train_labels, dev_texts, dev_labels,
|
def train(train_texts, train_labels, dev_texts, dev_labels,
|
||||||
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
|
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
|
||||||
nb_epoch=5, by_sentence=True):
|
nb_epoch=5, by_sentence=True):
|
||||||
|
|
||||||
print("Loading spaCy")
|
print("Loading spaCy")
|
||||||
nlp = spacy.load('en_vectors_web_lg')
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||||
embeddings = get_embeddings(nlp.vocab)
|
embeddings = get_embeddings(nlp.vocab)
|
||||||
model = compile_lstm(embeddings, lstm_shape, lstm_settings)
|
model = compile_lstm(embeddings, lstm_shape, lstm_settings)
|
||||||
|
|
||||||
print("Parsing texts...")
|
print("Parsing texts...")
|
||||||
train_docs = list(nlp.pipe(train_texts))
|
train_docs = list(nlp.pipe(train_texts))
|
||||||
dev_docs = list(nlp.pipe(dev_texts))
|
dev_docs = list(nlp.pipe(dev_texts))
|
||||||
|
@ -107,7 +109,7 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
|
||||||
train_X = get_features(train_docs, lstm_shape['max_length'])
|
train_X = get_features(train_docs, lstm_shape['max_length'])
|
||||||
dev_X = get_features(dev_docs, lstm_shape['max_length'])
|
dev_X = get_features(dev_docs, lstm_shape['max_length'])
|
||||||
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
|
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
|
||||||
nb_epoch=nb_epoch, batch_size=batch_size)
|
epochs=nb_epoch, batch_size=batch_size)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,15 +140,9 @@ def get_embeddings(vocab):
|
||||||
|
|
||||||
|
|
||||||
def evaluate(model_dir, texts, labels, max_length=100):
|
def evaluate(model_dir, texts, labels, max_length=100):
|
||||||
def create_pipeline(nlp):
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
'''
|
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||||
This could be a lambda, but named functions are easier to read in Python.
|
nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))
|
||||||
'''
|
|
||||||
return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
|
|
||||||
max_length=max_length)]
|
|
||||||
|
|
||||||
nlp = spacy.load('en')
|
|
||||||
nlp.pipeline = create_pipeline(nlp)
|
|
||||||
|
|
||||||
correct = 0
|
correct = 0
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -186,7 +182,7 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
|
||||||
is_runtime=False,
|
is_runtime=False,
|
||||||
nr_hidden=64, max_length=100, # Shape
|
nr_hidden=64, max_length=100, # Shape
|
||||||
dropout=0.5, learn_rate=0.001, # General NN config
|
dropout=0.5, learn_rate=0.001, # General NN config
|
||||||
nb_epoch=5, batch_size=100, nr_examples=-1): # Training params
|
nb_epoch=5, batch_size=256, nr_examples=-1): # Training params
|
||||||
if model_dir is not None:
|
if model_dir is not None:
|
||||||
model_dir = pathlib.Path(model_dir)
|
model_dir = pathlib.Path(model_dir)
|
||||||
if train_dir is None or dev_dir is None:
|
if train_dir is None or dev_dir is None:
|
||||||
|
@ -219,7 +215,7 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
|
||||||
if model_dir is not None:
|
if model_dir is not None:
|
||||||
with (model_dir / 'model').open('wb') as file_:
|
with (model_dir / 'model').open('wb') as file_:
|
||||||
pickle.dump(weights[1:], file_)
|
pickle.dump(weights[1:], file_)
|
||||||
with (model_dir / 'config.json').open('wb') as file_:
|
with (model_dir / 'config.json').open('w') as file_:
|
||||||
file_.write(lstm.to_json())
|
file_.write(lstm.to_json())
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,11 +2,7 @@
|
||||||
|
|
||||||
# A decomposable attention model for Natural Language Inference
|
# A decomposable attention model for Natural Language Inference
|
||||||
**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)**
|
**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)**
|
||||||
|
**Updated for spaCy 2.0+ and Keras 2.2.2+ by John Stewart, [@free-variation](https://github.com/free-variation)**
|
||||||
> ⚠️ **IMPORTANT NOTE:** This example is currently only compatible with spaCy
|
|
||||||
> v1.x. We're working on porting the example over to Keras v2.x and spaCy v2.x.
|
|
||||||
> See [#1445](https://github.com/explosion/spaCy/issues/1445) for details –
|
|
||||||
> contributions welcome!
|
|
||||||
|
|
||||||
This directory contains an implementation of the entailment prediction model described
|
This directory contains an implementation of the entailment prediction model described
|
||||||
by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable
|
by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable
|
||||||
|
@ -21,19 +17,25 @@ hook is installed to customise the `.similarity()` method of spaCy's `Doc`
|
||||||
and `Span` objects:
|
and `Span` objects:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def demo(model_dir):
|
def demo(shape):
|
||||||
nlp = spacy.load('en', path=model_dir,
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
create_pipeline=create_similarity_pipeline)
|
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||||
doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
|
|
||||||
doc2 = nlp(u'The milkshakes are good. The fries are bad.')
|
doc1 = nlp(u'The king of France is bald.')
|
||||||
print(doc1.similarity(doc2))
|
doc2 = nlp(u'France has no king.')
|
||||||
sent1a, sent1b = doc1.sents
|
|
||||||
print(sent1a.similarity(sent1b))
|
print("Sentence 1:", doc1)
|
||||||
print(sent1a.similarity(doc2))
|
print("Sentence 2:", doc2)
|
||||||
print(sent1b.similarity(doc2))
|
|
||||||
|
entailment_type, confidence = doc1.similarity(doc2)
|
||||||
|
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Which gives the output `Entailment type: contradiction (Confidence: 0.60604566)`, showing that
|
||||||
|
the system has definite opinions about Bertrand Russell's [famous conundrum](https://users.drew.edu/jlenz/br-on-denoting.html)!
|
||||||
|
|
||||||
I'm working on a blog post to explain Parikh et al.'s model in more detail.
|
I'm working on a blog post to explain Parikh et al.'s model in more detail.
|
||||||
|
A [notebook](https://github.com/free-variation/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb) is available that briefly explains this implementation.
|
||||||
I think it is a very interesting example of the attention mechanism, which
|
I think it is a very interesting example of the attention mechanism, which
|
||||||
I didn't understand very well before working through this paper. There are
|
I didn't understand very well before working through this paper. There are
|
||||||
lots of ways to extend the model.
|
lots of ways to extend the model.
|
||||||
|
@ -43,7 +45,7 @@ lots of ways to extend the model.
|
||||||
| File | Description |
|
| File | Description |
|
||||||
| --- | --- |
|
| --- | --- |
|
||||||
| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. |
|
| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. |
|
||||||
| `spacy_hook.py` | Provides a class `SimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. |
|
| `spacy_hook.py` | Provides a class `KerasSimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`; see the sketch after this table. |
|
||||||
| `keras_decomposable_attention.py` | Defines the neural network model. |
|
| `keras_decomposable_attention.py` | Defines the neural network model. |
|
||||||
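The hook mechanism that `spacy_hook.py` relies on is spaCy's `user_hooks` dict on the `Doc`. A minimal sketch of that mechanism, assuming a blank English model and a hypothetical `always_one` function standing in for `your_model(doc1, doc2)`:

```python
import spacy

def always_one(doc1, doc2):
    # hypothetical stand-in for your_model(doc1, doc2)
    return 1.0

nlp = spacy.blank('en')
doc1 = nlp(u'The king of France is bald.')
doc2 = nlp(u'France has no king.')

# registering a hook replaces the default average-of-vectors similarity
doc1.user_hooks['similarity'] = always_one
print(doc1.similarity(doc2))  # 1.0
```

`KerasSimilarityShim` does the same thing, registering its `predict` method as the hook.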
|
|
||||||
## Setting up
|
## Setting up
|
||||||
|
@ -52,17 +54,13 @@ First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spa
|
||||||
English models (about 1GB of data):
|
English models (about 1GB of data):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install https://github.com/fchollet/keras/archive/1.2.2.zip
|
pip install keras
|
||||||
pip install spacy
|
pip install spacy
|
||||||
python -m spacy.en.download
|
python -m spacy download en_vectors_web_lg
|
||||||
```
|
```
|
||||||
|
|
||||||
⚠️ **Important:** In order for the example to run, you'll need to install Keras from
|
You'll also want to get Keras working on your GPU, and you will need a backend, such as TensorFlow or Theano.
|
||||||
the 1.2.2 release (and not via `pip install keras`). For more info on this, see
|
This will depend on your setup, so you're mostly on your own for this step. If you're using AWS, try the
|
||||||
[#727](https://github.com/explosion/spaCy/issues/727).
|
|
||||||
|
|
||||||
You'll also want to get Keras working on your GPU. This will depend on your
|
|
||||||
set up, so you're mostly on your own for this step. If you're using AWS, try the
|
|
||||||
[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy.
|
[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy.
|
||||||
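For example, a minimal way to get a backend installed (assuming TensorFlow; the exact package and version depend on your GPU and CUDA setup):

```bash
pip install tensorflow        # CPU-only
pip install tensorflow-gpu    # with CUDA support
```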
|
|
||||||
Once you've installed the dependencies, you can run a small preliminary test of
|
Once you've installed the dependencies, you can run a small preliminary test of
|
||||||
|
@ -80,22 +78,35 @@ Finally, download the [Stanford Natural Language Inference corpus](http://nlp.st
|
||||||
## Running the example
|
## Running the example
|
||||||
|
|
||||||
You can run the `keras_parikh_entailment/` directory as a script, which executes the file
|
You can run the `keras_parikh_entailment/` directory as a script, which executes the file
|
||||||
[`keras_parikh_entailment/__main__.py`](__main__.py). The first thing you'll want to do is train the model:
|
[`keras_parikh_entailment/__main__.py`](__main__.py). If you run the script without arguments,
|
||||||
|
the usage is shown. Running it with `-h` explains the command line arguments.
|
||||||
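For example, to print the full usage message:

```bash
python keras_parikh_entailment/ -h
```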
|
|
||||||
|
The first thing you'll want to do is train the model:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python keras_parikh_entailment/ train <train_directory> <dev_directory>
|
python keras_parikh_entailment/ train -t <path to SNLI train JSON> -s <path to SNLI dev JSON>
|
||||||
```
|
```
|
||||||
|
|
||||||
Training takes about 300 epochs for full accuracy, and I haven't rerun the full
|
Training takes about 300 epochs for full accuracy, and I haven't rerun the full
|
||||||
experiment since refactoring things to publish this example — please let me
|
experiment since refactoring things to publish this example — please let me
|
||||||
know if I've broken something. You should get to at least 85% on the development data.
|
know if I've broken something. You should get to at least 85% on the development data even after 10-15 epochs.
|
||||||
|
|
||||||
The other two modes demonstrate run-time usage. I never like relying on the accuracy printed
|
The other two modes demonstrate run-time usage. I never like relying on the accuracy printed
|
||||||
by `.fit()` methods. I never really feel confident until I've run a new process that loads
|
by `.fit()` methods. I never really feel confident until I've run a new process that loads
|
||||||
the model and starts making predictions, without access to the gold labels. I've therefore
|
the model and starts making predictions, without access to the gold labels. I've therefore
|
||||||
included an `evaluate` mode. Finally, there's also a little demo, which mostly exists to show
|
included an `evaluate` mode.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python keras_parikh_entailment/ evaluate -s <path to SNLI dev or test JSON>
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally, there's also a little demo, which mostly exists to show
|
||||||
you how run-time usage will eventually look.
|
you how run-time usage will eventually look.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python keras_parikh_entailment/ demo
|
||||||
|
```
|
||||||
|
|
||||||
## Getting updates
|
## Getting updates
|
||||||
|
|
||||||
We should have the blog post explaining the model ready before the end of the week. To get
|
We should have the blog post explaining the model ready before the end of the week. To get
|
||||||
|
|
|
@ -1,82 +1,104 @@
|
||||||
from __future__ import division, unicode_literals, print_function
|
import numpy as np
|
||||||
import spacy
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import ujson as json
|
import ujson as json
|
||||||
import numpy
|
from keras.utils import to_categorical
|
||||||
from keras.utils.np_utils import to_categorical
|
import plac
|
||||||
|
import sys
|
||||||
from spacy_hook import get_embeddings, get_word_ids
|
|
||||||
from spacy_hook import create_similarity_pipeline
|
|
||||||
|
|
||||||
from keras_decomposable_attention import build_model
|
from keras_decomposable_attention import build_model
|
||||||
|
from spacy_hook import get_embeddings, KerasSimilarityShim
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
# workaround for keras/tensorflow bug
|
||||||
|
# see https://github.com/tensorflow/tensorflow/issues/3388
|
||||||
|
import os
|
||||||
|
import importlib
|
||||||
|
from keras import backend as K
|
||||||
|
|
||||||
|
def set_keras_backend(backend):
|
||||||
|
if K.backend() != backend:
|
||||||
|
os.environ['KERAS_BACKEND'] = backend
|
||||||
|
importlib.reload(K)
|
||||||
|
assert K.backend() == backend
|
||||||
|
if backend == "tensorflow":
|
||||||
|
K.get_session().close()
|
||||||
|
cfg = K.tf.ConfigProto()
|
||||||
|
cfg.gpu_options.allow_growth = True
|
||||||
|
K.set_session(K.tf.Session(config=cfg))
|
||||||
|
K.clear_session()
|
||||||
|
|
||||||
|
set_keras_backend("tensorflow")
|
||||||
|
|
||||||
|
|
||||||
def train(train_loc, dev_loc, shape, settings):
|
def train(train_loc, dev_loc, shape, settings):
|
||||||
train_texts1, train_texts2, train_labels = read_snli(train_loc)
|
train_texts1, train_texts2, train_labels = read_snli(train_loc)
|
||||||
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
||||||
|
|
||||||
print("Loading spaCy")
|
print("Loading spaCy")
|
||||||
nlp = spacy.load('en')
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
assert nlp.path is not None
|
assert nlp.path is not None
|
||||||
|
|
||||||
|
print("Processing texts...")
|
||||||
|
train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
|
||||||
|
dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])
|
||||||
|
|
||||||
print("Compiling network")
|
print("Compiling network")
|
||||||
model = build_model(get_embeddings(nlp.vocab), shape, settings)
|
model = build_model(get_embeddings(nlp.vocab), shape, settings)
|
||||||
print("Processing texts...")
|
|
||||||
Xs = []
|
|
||||||
for texts in (train_texts1, train_texts2, dev_texts1, dev_texts2):
|
|
||||||
Xs.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
|
|
||||||
max_length=shape[0],
|
|
||||||
rnn_encode=settings['gru_encode'],
|
|
||||||
tree_truncate=settings['tree_truncate']))
|
|
||||||
train_X1, train_X2, dev_X1, dev_X2 = Xs
|
|
||||||
print(settings)
|
print(settings)
|
||||||
model.fit(
|
model.fit(
|
||||||
[train_X1, train_X2],
|
train_X,
|
||||||
train_labels,
|
train_labels,
|
||||||
validation_data=([dev_X1, dev_X2], dev_labels),
|
validation_data = (dev_X, dev_labels),
|
||||||
nb_epoch=settings['nr_epoch'],
|
epochs = settings['nr_epoch'],
|
||||||
batch_size=settings['batch_size'])
|
batch_size = settings['batch_size'])
|
||||||
|
|
||||||
if not (nlp.path / 'similarity').exists():
|
if not (nlp.path / 'similarity').exists():
|
||||||
(nlp.path / 'similarity').mkdir()
|
(nlp.path / 'similarity').mkdir()
|
||||||
print("Saving to", nlp.path / 'similarity')
|
print("Saving to", nlp.path / 'similarity')
|
||||||
weights = model.get_weights()
|
weights = model.get_weights()
|
||||||
|
# remove the embedding matrix. We can reconstruct it.
|
||||||
|
del weights[1]
|
||||||
with (nlp.path / 'similarity' / 'model').open('wb') as file_:
|
with (nlp.path / 'similarity' / 'model').open('wb') as file_:
|
||||||
pickle.dump(weights[1:], file_)
|
pickle.dump(weights, file_)
|
||||||
with (nlp.path / 'similarity' / 'config.json').open('wb') as file_:
|
with (nlp.path / 'similarity' / 'config.json').open('w') as file_:
|
||||||
file_.write(model.to_json())
|
file_.write(model.to_json())
|
||||||
|
|
||||||
|
|
||||||
def evaluate(dev_loc):
|
def evaluate(dev_loc, shape):
|
||||||
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
||||||
nlp = spacy.load('en',
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
create_pipeline=create_similarity_pipeline)
|
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||||
|
|
||||||
total = 0.
|
total = 0.
|
||||||
correct = 0.
|
correct = 0.
|
||||||
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
|
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
|
||||||
doc1 = nlp(text1)
|
doc1 = nlp(text1)
|
||||||
doc2 = nlp(text2)
|
doc2 = nlp(text2)
|
||||||
sim = doc1.similarity(doc2)
|
sim, _ = doc1.similarity(doc2)
|
||||||
if sim.argmax() == label.argmax():
|
if sim == KerasSimilarityShim.entailment_types[label.argmax()]:
|
||||||
correct += 1
|
correct += 1
|
||||||
total += 1
|
total += 1
|
||||||
return correct, total
|
return correct, total
|
||||||
|
|
||||||
|
|
||||||
def demo():
|
def demo(shape):
|
||||||
nlp = spacy.load('en',
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
create_pipeline=create_similarity_pipeline)
|
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||||
doc1 = nlp(u'What were the best crime fiction books in 2016?')
|
|
||||||
doc2 = nlp(
|
doc1 = nlp(u'The king of France is bald.')
|
||||||
u'What should I read that was published last year? I like crime stories.')
|
doc2 = nlp(u'France has no king.')
|
||||||
print(doc1)
|
|
||||||
print(doc2)
|
print("Sentence 1:", doc1)
|
||||||
print("Similarity", doc1.similarity(doc2))
|
print("Sentence 2:", doc2)
|
||||||
|
|
||||||
|
entailment_type, confidence = doc1.similarity(doc2)
|
||||||
|
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
||||||
|
|
||||||
|
|
||||||
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
|
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
|
||||||
|
@ -84,56 +106,92 @@ def read_snli(path):
|
||||||
texts1 = []
|
texts1 = []
|
||||||
texts2 = []
|
texts2 = []
|
||||||
labels = []
|
labels = []
|
||||||
with path.open() as file_:
|
with open(path, 'r') as file_:
|
||||||
for line in file_:
|
for line in file_:
|
||||||
eg = json.loads(line)
|
eg = json.loads(line)
|
||||||
label = eg['gold_label']
|
label = eg['gold_label']
|
||||||
if label == '-':
|
if label == '-': # per Parikh, ignore - SNLI entries
|
||||||
continue
|
continue
|
||||||
texts1.append(eg['sentence1'])
|
texts1.append(eg['sentence1'])
|
||||||
texts2.append(eg['sentence2'])
|
texts2.append(eg['sentence2'])
|
||||||
labels.append(LABELS[label])
|
labels.append(LABELS[label])
|
||||||
return texts1, texts2, to_categorical(numpy.asarray(labels, dtype='int32'))
|
return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))
|
||||||
|
|
||||||
|
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
|
||||||
|
sents = texts + hypotheses
|
||||||
|
|
||||||
|
sents_as_ids = []
|
||||||
|
for sent in sents:
|
||||||
|
doc = nlp(sent)
|
||||||
|
word_ids = []
|
||||||
|
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
# skip odd spaces from tokenizer
|
||||||
|
if token.has_vector and token.vector_norm == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if i > max_length:
|
||||||
|
break
|
||||||
|
|
||||||
|
if token.has_vector:
|
||||||
|
word_ids.append(token.rank + num_unk + 1)
|
||||||
|
else:
|
||||||
|
# if we don't have a vector, pick an OOV entry
|
||||||
|
word_ids.append(token.rank % num_unk + 1)
|
||||||
|
|
||||||
|
# there must be a simpler way of generating padded arrays from lists...
|
||||||
|
word_id_vec = np.zeros((max_length), dtype='int')
|
||||||
|
clipped_len = min(max_length, len(word_ids))
|
||||||
|
word_id_vec[:clipped_len] = word_ids[:clipped_len]
|
||||||
|
sents_as_ids.append(word_id_vec)
|
||||||
|
|
||||||
|
|
||||||
|
return [np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])]
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
|
mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
|
||||||
train_loc=("Path to training data", "positional", None, Path),
|
train_loc=("Path to training data", "option", "t", str),
|
||||||
dev_loc=("Path to development data", "positional", None, Path),
|
dev_loc=("Path to development or test data", "option", "s", str),
|
||||||
max_length=("Length to truncate sentences", "option", "L", int),
|
max_length=("Length to truncate sentences", "option", "L", int),
|
||||||
nr_hidden=("Number of hidden units", "option", "H", int),
|
nr_hidden=("Number of hidden units", "option", "H", int),
|
||||||
dropout=("Dropout level", "option", "d", float),
|
dropout=("Dropout level", "option", "d", float),
|
||||||
learn_rate=("Learning rate", "option", "e", float),
|
learn_rate=("Learning rate", "option", "r", float),
|
||||||
batch_size=("Batch size for neural network training", "option", "b", int),
|
batch_size=("Batch size for neural network training", "option", "b", int),
|
||||||
nr_epoch=("Number of training epochs", "option", "i", int),
|
nr_epoch=("Number of training epochs", "option", "e", int),
|
||||||
tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
|
entail_dir=("Direction of entailment", "option", "D", str, ["both", "left", "right"])
|
||||||
gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
|
|
||||||
)
|
)
|
||||||
def main(mode, train_loc, dev_loc,
|
def main(mode, train_loc, dev_loc,
|
||||||
tree_truncate=False,
|
max_length = 50,
|
||||||
gru_encode=False,
|
nr_hidden = 200,
|
||||||
max_length=100,
|
dropout = 0.2,
|
||||||
nr_hidden=100,
|
learn_rate = 0.001,
|
||||||
dropout=0.2,
|
batch_size = 1024,
|
||||||
learn_rate=0.001,
|
nr_epoch = 10,
|
||||||
batch_size=100,
|
entail_dir="both"):
|
||||||
nr_epoch=5):
|
|
||||||
shape = (max_length, nr_hidden, 3)
|
shape = (max_length, nr_hidden, 3)
|
||||||
settings = {
|
settings = {
|
||||||
'lr': learn_rate,
|
'lr': learn_rate,
|
||||||
'dropout': dropout,
|
'dropout': dropout,
|
||||||
'batch_size': batch_size,
|
'batch_size': batch_size,
|
||||||
'nr_epoch': nr_epoch,
|
'nr_epoch': nr_epoch,
|
||||||
'tree_truncate': tree_truncate,
|
'entail_dir': entail_dir
|
||||||
'gru_encode': gru_encode
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if mode == 'train':
|
if mode == 'train':
|
||||||
|
if train_loc == None or dev_loc == None:
|
||||||
|
print("Train mode requires paths to training and development data sets.")
|
||||||
|
sys.exit(1)
|
||||||
train(train_loc, dev_loc, shape, settings)
|
train(train_loc, dev_loc, shape, settings)
|
||||||
elif mode == 'evaluate':
|
elif mode == 'evaluate':
|
||||||
correct, total = evaluate(dev_loc)
|
if dev_loc == None:
|
||||||
|
print("Evaluate mode requires paths to test data set.")
|
||||||
|
sys.exit(1)
|
||||||
|
correct, total = evaluate(dev_loc, shape)
|
||||||
print(correct, '/', total, correct / total)
|
print(correct, '/', total, correct / total)
|
||||||
else:
|
else:
|
||||||
demo()
|
demo(shape)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
|
@ -1,259 +1,137 @@
|
||||||
# Semantic similarity with decomposable attention (using spaCy and Keras)
|
# Semantic entailment/similarity with decomposable attention (using spaCy and Keras)
|
||||||
# Practical state-of-the-art text similarity with spaCy and Keras
|
# Practical state-of-the-art textual entailment with spaCy and Keras
|
||||||
import numpy
|
|
||||||
|
|
||||||
from keras.layers import InputSpec, Layer, Input, Dense, merge
|
|
||||||
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
|
|
||||||
from keras.layers import Bidirectional, GRU, LSTM
|
|
||||||
from keras.layers.noise import GaussianNoise
|
|
||||||
from keras.layers.advanced_activations import ELU
|
|
||||||
import keras.backend as K
|
|
||||||
from keras.models import Sequential, Model, model_from_json
|
|
||||||
from keras.regularizers import l2
|
|
||||||
from keras.optimizers import Adam
|
|
||||||
from keras.layers.normalization import BatchNormalization
|
|
||||||
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
|
|
||||||
from keras.layers import Merge
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from keras import layers, Model, models, optimizers
|
||||||
|
from keras import backend as K
|
||||||
|
|
||||||
def build_model(vectors, shape, settings):
|
def build_model(vectors, shape, settings):
|
||||||
'''Compile the model.'''
|
|
||||||
max_length, nr_hidden, nr_class = shape
|
max_length, nr_hidden, nr_class = shape
|
||||||
# Declare inputs.
|
|
||||||
ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
|
|
||||||
ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
|
|
||||||
|
|
||||||
# Construct operations, which we'll chain together.
|
input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
|
||||||
embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
|
input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')
|
||||||
if settings['gru_encode']:
|
|
||||||
encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
|
# embeddings (projected)
|
||||||
attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
|
embed = create_embedding(vectors, max_length, nr_hidden)
|
||||||
align = _SoftAlignment(max_length, nr_hidden)
|
|
||||||
compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
|
a = embed(input1)
|
||||||
entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])
|
b = embed(input2)
|
||||||
|
|
||||||
|
# step 1: attend
|
||||||
|
F = create_feedforward(nr_hidden)
|
||||||
|
att_weights = layers.dot([F(a), F(b)], axes=-1)
|
||||||
|
|
||||||
|
G = create_feedforward(nr_hidden)
|
||||||
|
|
||||||
|
if settings['entail_dir'] == 'both':
|
||||||
|
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
|
||||||
|
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
|
||||||
|
alpha = layers.dot([norm_weights_a, a], axes=1)
|
||||||
|
beta = layers.dot([norm_weights_b, b], axes=1)
|
||||||
|
|
||||||
# Declare the model as a computational graph.
|
# step 2: compare
|
||||||
sent1 = embed(ids1) # Shape: (i, n)
|
comp1 = layers.concatenate([a, beta])
|
||||||
sent2 = embed(ids2) # Shape: (j, n)
|
comp2 = layers.concatenate([b, alpha])
|
||||||
|
v1 = layers.TimeDistributed(G)(comp1)
|
||||||
|
v2 = layers.TimeDistributed(G)(comp2)
|
||||||
|
|
||||||
if settings['gru_encode']:
|
# step 3: aggregate
|
||||||
sent1 = encode(sent1)
|
v1_sum = layers.Lambda(sum_word)(v1)
|
||||||
sent2 = encode(sent2)
|
v2_sum = layers.Lambda(sum_word)(v2)
|
||||||
|
concat = layers.concatenate([v1_sum, v2_sum])
|
||||||
|
|
||||||
attention = attend(sent1, sent2) # Shape: (i, j)
|
elif settings['entail_dir'] == 'left':
|
||||||
|
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
|
||||||
|
alpha = layers.dot([norm_weights_a, a], axes=1)
|
||||||
|
comp2 = layers.concatenate([b, alpha])
|
||||||
|
v2 = layers.TimeDistributed(G)(comp2)
|
||||||
|
v2_sum = layers.Lambda(sum_word)(v2)
|
||||||
|
concat = v2_sum
|
||||||
|
|
||||||
align1 = align(sent2, attention)
|
else:
|
||||||
align2 = align(sent1, attention, transpose=True)
|
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
|
||||||
|
beta = layers.dot([norm_weights_b, b], axes=1)
|
||||||
feats1 = compare(sent1, align1)
|
comp1 = layers.concatenate([a, beta])
|
||||||
feats2 = compare(sent2, align2)
|
v1 = layers.TimeDistributed(G)(comp1)
|
||||||
|
v1_sum = layers.Lambda(sum_word)(v1)
|
||||||
scores = entail(feats1, feats2)
|
concat = v1_sum
|
||||||
|
|
||||||
# Now that we have the input/output, we can construct the Model object...
|
H = create_feedforward(nr_hidden)
|
||||||
model = Model(input=[ids1, ids2], output=[scores])
|
out = H(concat)
|
||||||
|
out = layers.Dense(nr_class, activation='softmax')(out)
|
||||||
# ...Compile it...
|
|
||||||
|
model = Model([input1, input2], out)
|
||||||
|
|
||||||
model.compile(
|
model.compile(
|
||||||
optimizer=Adam(lr=settings['lr']),
|
optimizer=optimizers.Adam(lr=settings['lr']),
|
||||||
loss='categorical_crossentropy',
|
loss='categorical_crossentropy',
|
||||||
metrics=['accuracy'])
|
metrics=['accuracy'])
|
||||||
# ...And return it for training.
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
class _StaticEmbedding(object):
|
def create_embedding(vectors, max_length, projected_dim):
|
||||||
def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
|
return models.Sequential([
|
||||||
self.nr_out = nr_out
|
layers.Embedding(
|
||||||
self.max_length = max_length
|
vectors.shape[0],
|
||||||
self.embed = Embedding(
|
vectors.shape[1],
|
||||||
vectors.shape[0],
|
input_length=max_length,
|
||||||
vectors.shape[1],
|
weights=[vectors],
|
||||||
input_length=max_length,
|
trainable=False),
|
||||||
weights=[vectors],
|
|
||||||
name='embed',
|
layers.TimeDistributed(
|
||||||
trainable=False)
|
layers.Dense(projected_dim,
|
||||||
self.tune = Embedding(
|
activation=None,
|
||||||
nr_tune,
|
use_bias=False))
|
||||||
nr_out,
|
])
|
||||||
input_length=max_length,
|
|
||||||
weights=None,
|
|
||||||
name='tune',
|
|
||||||
trainable=True,
|
|
||||||
dropout=dropout)
|
|
||||||
self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
|
|
||||||
output_shape=(self.max_length,))
|
|
||||||
|
|
||||||
self.project = TimeDistributed(
|
def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):
|
||||||
Dense(
|
return models.Sequential([
|
||||||
nr_out,
|
layers.Dense(num_units, activation=activation),
|
||||||
activation=None,
|
layers.Dropout(dropout_rate),
|
||||||
bias=False,
|
layers.Dense(num_units, activation=activation),
|
||||||
name='project'))
|
layers.Dropout(dropout_rate)
|
||||||
|
])
|
||||||
def __call__(self, sentence):
|
|
||||||
def get_output_shape(shapes):
|
|
||||||
print(shapes)
|
|
||||||
return shapes[0]
|
|
||||||
mod_sent = self.mod_ids(sentence)
|
|
||||||
tuning = self.tune(mod_sent)
|
|
||||||
#tuning = merge([tuning, mod_sent],
|
|
||||||
# mode=lambda AB: AB[0] * (K.clip(K.cast(AB[1], 'float32'), 0, 1)),
|
|
||||||
# output_shape=(self.max_length, self.nr_out))
|
|
||||||
pretrained = self.project(self.embed(sentence))
|
|
||||||
vectors = merge([pretrained, tuning], mode='sum')
|
|
||||||
return vectors
|
|
||||||
|
|
||||||
|
|
||||||
class _BiRNNEncoding(object):
|
def normalizer(axis):
|
||||||
def __init__(self, max_length, nr_out, dropout=0.0):
|
def _normalize(att_weights):
|
||||||
self.model = Sequential()
|
exp_weights = K.exp(att_weights)
|
||||||
self.model.add(Bidirectional(LSTM(nr_out, return_sequences=True,
|
sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)
|
||||||
dropout_W=dropout, dropout_U=dropout),
|
return exp_weights/sum_weights
|
||||||
input_shape=(max_length, nr_out)))
|
return _normalize
|
||||||
self.model.add(TimeDistributed(Dense(nr_out, activation='relu', init='he_normal')))
|
|
||||||
self.model.add(TimeDistributed(Dropout(0.2)))
|
|
||||||
|
|
||||||
def __call__(self, sentence):
|
def sum_word(x):
|
||||||
return self.model(sentence)
|
return K.sum(x, axis=1)
|
||||||
|
|
||||||
|
|
||||||
class _Attention(object):
|
|
||||||
def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
|
|
||||||
self.max_length = max_length
|
|
||||||
self.model = Sequential()
|
|
||||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
|
|
||||||
self.model.add(
|
|
||||||
Dense(nr_hidden, name='attend1',
|
|
||||||
init='he_normal', W_regularizer=l2(L2),
|
|
||||||
input_shape=(nr_hidden,), activation='relu'))
|
|
||||||
self.model.add(Dropout(dropout))
|
|
||||||
self.model.add(Dense(nr_hidden, name='attend2',
|
|
||||||
init='he_normal', W_regularizer=l2(L2), activation='relu'))
|
|
||||||
self.model = TimeDistributed(self.model)
|
|
||||||
|
|
||||||
def __call__(self, sent1, sent2):
|
|
||||||
def _outer(AB):
|
|
||||||
att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
|
|
||||||
return K.permute_dimensions(att_ji,(0, 2, 1))
|
|
||||||
return merge(
|
|
||||||
[self.model(sent1), self.model(sent2)],
|
|
||||||
mode=_outer,
|
|
||||||
output_shape=(self.max_length, self.max_length))
|
|
||||||
|
|
||||||
|
|
||||||
class _SoftAlignment(object):
|
|
||||||
def __init__(self, max_length, nr_hidden):
|
|
||||||
self.max_length = max_length
|
|
||||||
self.nr_hidden = nr_hidden
|
|
||||||
|
|
||||||
def __call__(self, sentence, attention, transpose=False):
|
|
||||||
def _normalize_attention(attmat):
|
|
||||||
att = attmat[0]
|
|
||||||
mat = attmat[1]
|
|
||||||
if transpose:
|
|
||||||
att = K.permute_dimensions(att,(0, 2, 1))
|
|
||||||
# 3d softmax
|
|
||||||
e = K.exp(att - K.max(att, axis=-1, keepdims=True))
|
|
||||||
s = K.sum(e, axis=-1, keepdims=True)
|
|
||||||
sm_att = e / s
|
|
||||||
return K.batch_dot(sm_att, mat)
|
|
||||||
return merge([attention, sentence], mode=_normalize_attention,
|
|
||||||
output_shape=(self.max_length, self.nr_hidden)) # Shape: (i, n)
|
|
||||||
|
|
||||||
|
|
||||||
class _Comparison(object):
|
|
||||||
def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
|
|
||||||
self.words = words
|
|
||||||
self.model = Sequential()
|
|
||||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
|
|
||||||
self.model.add(Dense(nr_hidden, name='compare1',
|
|
||||||
init='he_normal', W_regularizer=l2(L2)))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model.add(Dropout(dropout))
|
|
||||||
self.model.add(Dense(nr_hidden, name='compare2',
|
|
||||||
W_regularizer=l2(L2), init='he_normal'))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model = TimeDistributed(self.model)
|
|
||||||
|
|
||||||
def __call__(self, sent, align, **kwargs):
|
|
||||||
result = self.model(merge([sent, align], mode='concat')) # Shape: (i, n)
|
|
||||||
avged = GlobalAveragePooling1D()(result, mask=self.words)
|
|
||||||
maxed = GlobalMaxPooling1D()(result, mask=self.words)
|
|
||||||
merged = merge([avged, maxed])
|
|
||||||
result = BatchNormalization()(merged)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
class _Entailment(object):
|
|
||||||
def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
|
|
||||||
self.model = Sequential()
|
|
||||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
|
|
||||||
self.model.add(Dense(nr_hidden, name='entail1',
|
|
||||||
init='he_normal', W_regularizer=l2(L2)))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model.add(Dropout(dropout))
|
|
||||||
self.model.add(Dense(nr_hidden, name='entail2',
|
|
||||||
init='he_normal', W_regularizer=l2(L2)))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
|
|
||||||
W_regularizer=l2(L2), init='zero'))
|
|
||||||
|
|
||||||
def __call__(self, feats1, feats2):
|
|
||||||
features = merge([feats1, feats2], mode='concat')
|
|
||||||
return self.model(features)
|
|
||||||
|
|
||||||
|
|
||||||
class _GlobalSumPooling1D(Layer):
|
|
||||||
'''Global sum pooling operation for temporal data.
|
|
||||||
|
|
||||||
# Input shape
|
|
||||||
3D tensor with shape: `(samples, steps, features)`.
|
|
||||||
|
|
||||||
# Output shape
|
|
||||||
2D tensor with shape: `(samples, features)`.
|
|
||||||
'''
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
super(_GlobalSumPooling1D, self).__init__(**kwargs)
|
|
||||||
self.input_spec = [InputSpec(ndim=3)]
|
|
||||||
|
|
||||||
def get_output_shape_for(self, input_shape):
|
|
||||||
return (input_shape[0], input_shape[2])
|
|
||||||
|
|
||||||
def call(self, x, mask=None):
|
|
||||||
if mask is not None:
|
|
||||||
return K.sum(x * K.clip(mask, 0, 1), axis=1)
|
|
||||||
else:
|
|
||||||
return K.sum(x, axis=1)
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_model():
|
def test_build_model():
|
||||||
vectors = numpy.ndarray((100, 8), dtype='float32')
|
vectors = np.ndarray((100, 8), dtype='float32')
|
||||||
shape = (10, 16, 3)
|
shape = (10, 16, 3)
|
||||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
|
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True, 'entail_dir':'both'}
|
||||||
model = build_model(vectors, shape, settings)
|
model = build_model(vectors, shape, settings)
|
||||||
|
|
||||||
|
|
||||||
def test_fit_model():
|
def test_fit_model():
|
||||||
|
|
||||||
def _generate_X(nr_example, length, nr_vector):
|
def _generate_X(nr_example, length, nr_vector):
|
||||||
X1 = numpy.ndarray((nr_example, length), dtype='int32')
|
X1 = np.ndarray((nr_example, length), dtype='int32')
|
||||||
X1 *= X1 < nr_vector
|
X1 *= X1 < nr_vector
|
||||||
X1 *= 0 <= X1
|
X1 *= 0 <= X1
|
||||||
X2 = numpy.ndarray((nr_example, length), dtype='int32')
|
X2 = np.ndarray((nr_example, length), dtype='int32')
|
||||||
X2 *= X2 < nr_vector
|
X2 *= X2 < nr_vector
|
||||||
X2 *= 0 <= X2
|
X2 *= 0 <= X2
|
||||||
return [X1, X2]
|
return [X1, X2]
|
||||||
|
|
||||||
def _generate_Y(nr_example, nr_class):
|
def _generate_Y(nr_example, nr_class):
|
||||||
ys = numpy.zeros((nr_example, nr_class), dtype='int32')
|
ys = np.zeros((nr_example, nr_class), dtype='int32')
|
||||||
for i in range(nr_example):
|
for i in range(nr_example):
|
||||||
ys[i, i % nr_class] = 1
|
ys[i, i % nr_class] = 1
|
||||||
return ys
|
return ys
|
||||||
|
|
||||||
vectors = numpy.ndarray((100, 8), dtype='float32')
|
vectors = np.ndarray((100, 8), dtype='float32')
|
||||||
shape = (10, 16, 3)
|
shape = (10, 16, 3)
|
||||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
|
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True, 'entail_dir':'both'}
|
||||||
model = build_model(vectors, shape, settings)
|
model = build_model(vectors, shape, settings)
|
||||||
|
|
||||||
train_X = _generate_X(20, shape[0], vectors.shape[0])
|
train_X = _generate_X(20, shape[0], vectors.shape[0])
|
||||||
|
@ -261,8 +139,7 @@ def test_fit_model():
|
||||||
dev_X = _generate_X(15, shape[0], vectors.shape[0])
|
dev_X = _generate_X(15, shape[0], vectors.shape[0])
|
||||||
dev_Y = _generate_Y(15, shape[2])
|
dev_Y = _generate_Y(15, shape[2])
|
||||||
|
|
||||||
model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
|
model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4)
|
||||||
batch_size=4)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [build_model]
|
__all__ = [build_model]
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
|
import numpy as np
|
||||||
from keras.models import model_from_json
|
from keras.models import model_from_json
|
||||||
import numpy
|
|
||||||
import numpy.random
|
|
||||||
import json
|
|
||||||
from spacy.tokens.span import Span
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
|
@ -11,16 +8,23 @@ except ImportError:
|
||||||
|
|
||||||
|
|
||||||
class KerasSimilarityShim(object):
|
class KerasSimilarityShim(object):
|
||||||
|
entailment_types = ["entailment", "contradiction", "neutral"]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, nlp, get_features=None, max_length=100):
|
def load(cls, path, nlp, max_length=100, get_features=None):
|
||||||
|
|
||||||
if get_features is None:
|
if get_features is None:
|
||||||
get_features = get_word_ids
|
get_features = get_word_ids
|
||||||
|
|
||||||
with (path / 'config.json').open() as file_:
|
with (path / 'config.json').open() as file_:
|
||||||
model = model_from_json(file_.read())
|
model = model_from_json(file_.read())
|
||||||
with (path / 'model').open('rb') as file_:
|
with (path / 'model').open('rb') as file_:
|
||||||
weights = pickle.load(file_)
|
weights = pickle.load(file_)
|
||||||
|
|
||||||
embeddings = get_embeddings(nlp.vocab)
|
embeddings = get_embeddings(nlp.vocab)
|
||||||
model.set_weights([embeddings] + weights)
|
weights.insert(1, embeddings)
|
||||||
|
model.set_weights(weights)
|
||||||
|
|
||||||
return cls(model, get_features=get_features, max_length=max_length)
|
return cls(model, get_features=get_features, max_length=max_length)
|
||||||
|
|
||||||
def __init__(self, model, get_features=None, max_length=100):
|
def __init__(self, model, get_features=None, max_length=100):
|
||||||
|
@ -32,58 +36,42 @@ class KerasSimilarityShim(object):
|
||||||
doc.user_hooks['similarity'] = self.predict
|
doc.user_hooks['similarity'] = self.predict
|
||||||
doc.user_span_hooks['similarity'] = self.predict
|
doc.user_span_hooks['similarity'] = self.predict
|
||||||
|
|
||||||
|
return doc
|
||||||
|
|
||||||
def predict(self, doc1, doc2):
|
def predict(self, doc1, doc2):
|
||||||
x1 = self.get_features([doc1], max_length=self.max_length, tree_truncate=True)
|
x1 = self.get_features([doc1], max_length=self.max_length)
|
||||||
x2 = self.get_features([doc2], max_length=self.max_length, tree_truncate=True)
|
x2 = self.get_features([doc2], max_length=self.max_length)
|
||||||
scores = self.model.predict([x1, x2])
|
scores = self.model.predict([x1, x2])
|
||||||
return scores[0]
|
|
||||||
|
return self.entailment_types[scores.argmax()], scores.max()
|
||||||
|
|
||||||
|
|
||||||
def get_embeddings(vocab, nr_unk=100):
|
def get_embeddings(vocab, nr_unk=100):
|
||||||
nr_vector = max(lex.rank for lex in vocab) + 1
|
# the extra +1 is for a zero vector representing sentence-final padding
|
||||||
vectors = numpy.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')
|
num_vectors = max(lex.rank for lex in vocab) + 2
|
||||||
|
|
||||||
|
# create random vectors for OOV tokens
|
||||||
|
oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
|
||||||
|
oov = oov / oov.sum(axis=1, keepdims=True)
|
||||||
|
|
||||||
|
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32')
|
||||||
|
vectors[1:(nr_unk + 1), ] = oov
|
||||||
for lex in vocab:
|
for lex in vocab:
|
||||||
if lex.has_vector:
|
if lex.has_vector and lex.vector_norm > 0:
|
||||||
vectors[lex.rank+1] = lex.vector / lex.vector_norm
|
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
|
||||||
|
|
||||||
return vectors
|
return vectors
|
||||||
|
|
||||||
|
|
||||||
def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
|
def get_word_ids(docs, max_length=100, nr_unk=100):
|
||||||
Xs = numpy.zeros((len(docs), max_length), dtype='int32')
|
Xs = np.zeros((len(docs), max_length), dtype='int32')
|
||||||
|
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
if tree_truncate:
|
for j, token in enumerate(doc):
|
||||||
if isinstance(doc, Span):
|
if j == max_length:
|
||||||
queue = [doc.root]
|
|
||||||
else:
|
|
||||||
queue = [sent.root for sent in doc.sents]
|
|
||||||
else:
|
|
||||||
queue = list(doc)
|
|
||||||
words = []
|
|
||||||
while len(words) <= max_length and queue:
|
|
||||||
word = queue.pop(0)
|
|
||||||
if rnn_encode or (not word.is_punct and not word.is_space):
|
|
||||||
words.append(word)
|
|
||||||
if tree_truncate:
|
|
||||||
queue.extend(list(word.lefts))
|
|
||||||
queue.extend(list(word.rights))
|
|
||||||
words.sort()
|
|
||||||
for j, token in enumerate(words):
|
|
||||||
if token.has_vector:
|
|
||||||
Xs[i, j] = token.rank+1
|
|
||||||
else:
|
|
||||||
Xs[i, j] = (token.shape % (nr_unk-1))+2
|
|
||||||
j += 1
|
|
||||||
if j >= max_length:
|
|
||||||
break
|
break
|
||||||
else:
|
if token.has_vector:
|
||||||
Xs[i, len(words)] = 1
|
Xs[i, j] = token.rank + nr_unk + 1
|
||||||
|
else:
|
||||||
|
Xs[i, j] = token.rank % nr_unk + 1
|
||||||
return Xs
|
return Xs
|
||||||
|
|
||||||
|
|
||||||
def create_similarity_pipeline(nlp, max_length=100):
|
|
||||||
return [
|
|
||||||
nlp.tagger,
|
|
||||||
nlp.entity,
|
|
||||||
nlp.parser,
|
|
||||||
KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length)
|
|
||||||
]
|
|
||||||
|
|
955
examples/notebooks/Decompositional Attention.ipynb
Normal file
955
examples/notebooks/Decompositional Attention.ipynb
Normal file
|
@ -0,0 +1,955 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Natural language inference using spaCy and Keras"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Introduction"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This notebook details an implementation of the natural language inference model presented in [(Parikh et al, 2016)](https://arxiv.org/abs/1606.01933). The model is notable for the small number of paramaters *and hyperparameters* it specifices, while still yielding good performance."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Constructing the dataset"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import spacy\n",
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We only need the GloVe vectors from spaCy, not a full NLP pipeline."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"nlp = spacy.load('en_vectors_web_lg')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Function to load the SNLI dataset. The categories are converted to one-shot representation. The function comes from an example in spaCy."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/jds/tensorflow-gpu/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
|
||||||
|
" from ._conv import register_converters as _register_converters\n",
|
||||||
|
"Using TensorFlow backend.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import ujson as json\n",
|
||||||
|
"from keras.utils import to_categorical\n",
|
||||||
|
"\n",
|
||||||
|
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
|
||||||
|
"def read_snli(path):\n",
|
||||||
|
" texts1 = []\n",
|
||||||
|
" texts2 = []\n",
|
||||||
|
" labels = []\n",
|
||||||
|
" with open(path, 'r') as file_:\n",
|
||||||
|
" for line in file_:\n",
|
||||||
|
" eg = json.loads(line)\n",
|
||||||
|
" label = eg['gold_label']\n",
|
||||||
|
" if label == '-': # per Parikh, ignore - SNLI entries\n",
|
||||||
|
" continue\n",
|
||||||
|
" texts1.append(eg['sentence1'])\n",
|
||||||
|
" texts2.append(eg['sentence2'])\n",
|
||||||
|
" labels.append(LABELS[label])\n",
|
||||||
|
" return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Because Keras can do the train/test split for us, we'll load *all* SNLI triples from one file."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"texts,hypotheses,labels = read_snli('snli/snli_1.0_train.jsonl')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):\n",
|
||||||
|
" sents = texts + hypotheses\n",
|
||||||
|
" \n",
|
||||||
|
" # the extra +1 is for a zero vector represting NULL for padding\n",
|
||||||
|
" num_vectors = max(lex.rank for lex in nlp.vocab) + 2 \n",
|
||||||
|
" \n",
|
||||||
|
" # create random vectors for OOV tokens\n",
|
||||||
|
" oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))\n",
|
||||||
|
" oov = oov / oov.sum(axis=1, keepdims=True)\n",
|
||||||
|
" \n",
|
||||||
|
" vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')\n",
|
||||||
|
" vectors[num_vectors:, ] = oov\n",
|
||||||
|
" for lex in nlp.vocab:\n",
|
||||||
|
" if lex.has_vector and lex.vector_norm > 0:\n",
|
||||||
|
" vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector\n",
|
||||||
|
" \n",
|
||||||
|
" sents_as_ids = []\n",
|
||||||
|
" for sent in sents:\n",
|
||||||
|
" doc = nlp(sent)\n",
|
||||||
|
" word_ids = []\n",
|
||||||
|
" \n",
|
||||||
|
" for i, token in enumerate(doc):\n",
|
||||||
|
" # skip odd spaces from tokenizer\n",
|
||||||
|
" if token.has_vector and token.vector_norm == 0:\n",
|
||||||
|
" continue\n",
|
||||||
|
" \n",
|
||||||
|
" if i > max_length:\n",
|
||||||
|
" break\n",
|
||||||
|
" \n",
|
||||||
|
" if token.has_vector:\n",
|
||||||
|
" word_ids.append(token.rank + 1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" # if we don't have a vector, pick an OOV entry\n",
|
||||||
|
" word_ids.append(token.rank % num_oov + num_vectors) \n",
|
||||||
|
" \n",
|
||||||
|
" # there must be a simpler way of generating padded arrays from lists...\n",
|
||||||
|
" word_id_vec = np.zeros((max_length), dtype='int')\n",
|
||||||
|
" clipped_len = min(max_length, len(word_ids))\n",
|
||||||
|
" word_id_vec[:clipped_len] = word_ids[:clipped_len]\n",
|
||||||
|
" sents_as_ids.append(word_id_vec)\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" return vectors, np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sem_vectors, text_vectors, hypothesis_vectors = create_dataset(nlp, texts, hypotheses, 100, 50, True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"texts_test,hypotheses_test,labels_test = read_snli('snli/snli_1.0_test.jsonl')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"_, text_vectors_test, hypothesis_vectors_test = create_dataset(nlp, texts_test, hypotheses_test, 100, 50, True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We use spaCy to tokenize the sentences and return, when available, a semantic vector for each token. \n",
|
||||||
|
"\n",
|
||||||
|
"OOV terms (tokens for which no semantic vector is available) are assigned to one of a set of randomly-generated OOV vectors, per (Parikh et al, 2016).\n"
|
||||||
|
]
|
||||||
|
},
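As a reading aid, here is a minimal sketch (ours, not part of the notebook; the helper name `token_index` is made up) of the index scheme that `create_dataset` above uses when mapping tokens to rows of the vector table:

```python
def token_index(rank, has_vector, num_vectors, num_oov):
    # Row 0 is reserved for the zero NULL/padding vector, so lexemes with a
    # vector map to rank + 1; tokens without a vector are hashed by rank into
    # the block of `num_oov` random vectors appended after the real ones.
    if has_vector:
        return rank + 1
    return rank % num_oov + num_vectors
```

A padded sentence is then just a length-`max_length` array of such indices, with trailing zeros pointing at the NULL vector.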
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note that we will clip sentences to 50 words maximum."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from keras import layers, Model, models\n",
|
||||||
|
"from keras import backend as K"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Building the model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The embedding layer copies the 300-dimensional GloVe vectors into GPU memory. Per (Parikh et al, 2016), the vectors, which are not adapted during training, are projected down to lower-dimensional vectors using a trained projection matrix."
|
||||||
|
]
|
||||||
|
},
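In symbols (our notation, not the paper's), with $d = \texttt{projected\_dim}$ (200 in this notebook) and $e(w_i)$ the frozen 300-dimensional GloVe vector of word $w_i$, the cell below computes

$$\bar{a}_i = W_p^{\top} e(w_i), \qquad W_p \in \mathbb{R}^{300 \times d},$$

so only the $300 \times d$ projection is trained, while the embedding table itself stays fixed.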
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_embedding(vectors, max_length, projected_dim):\n",
|
||||||
|
" return models.Sequential([\n",
|
||||||
|
" layers.Embedding(\n",
|
||||||
|
" vectors.shape[0],\n",
|
||||||
|
" vectors.shape[1],\n",
|
||||||
|
" input_length=max_length,\n",
|
||||||
|
" weights=[vectors],\n",
|
||||||
|
" trainable=False),\n",
|
||||||
|
" \n",
|
||||||
|
" layers.TimeDistributed(\n",
|
||||||
|
" layers.Dense(projected_dim,\n",
|
||||||
|
" activation=None,\n",
|
||||||
|
" use_bias=False))\n",
|
||||||
|
" ])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The Parikh model makes use of three feedforward blocks that construct nonlinear combinations of their input. Each block contains two ReLU layers and two dropout layers."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):\n",
|
||||||
|
" return models.Sequential([\n",
|
||||||
|
" layers.Dense(num_units, activation=activation),\n",
|
||||||
|
" layers.Dropout(dropout_rate),\n",
|
||||||
|
" layers.Dense(num_units, activation=activation),\n",
|
||||||
|
" layers.Dropout(dropout_rate)\n",
|
||||||
|
" ])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The basic idea of the (Parikh et al, 2016) model is to:\n",
|
||||||
|
"\n",
|
||||||
|
"1. *Align*: Construct an alignment of subphrases in the text and hypothesis using an attention-like mechanism, called \"decompositional\" because the layer is applied to each of the two sentences individually rather than to their product. The dot product of the nonlinear transformations of the inputs is then normalized vertically and horizontally to yield a pair of \"soft\" alignment structures, from text->hypothesis and hypothesis->text. Concretely, for each word in one sentence, a multinomial distribution is computed over the words of the other sentence, by learning a multinomial logistic with softmax target.\n",
|
||||||
|
"2. *Compare*: Each word is now compared to its aligned phrase using a function modeled as a two-layer feedforward ReLU network. The output is a high-dimensional representation of the strength of association between word and aligned phrase.\n",
|
||||||
|
"3. *Aggregate*: The comparison vectors are summed, separately, for the text and the hypothesis. The result is two vectors: one that describes the degree of association of the text to the hypothesis, and the second, of the hypothesis to the text.\n",
|
||||||
|
"4. Finally, these two vectors are processed by a dense layer followed by a softmax classifier, as usual.\n",
|
||||||
|
"\n",
|
||||||
|
"Note that because in entailment the truth conditions of the consequent must be a subset of those of the antecedent, it is not obvious that we need both vectors in step (3). Entailment is not symmetric. It may be enough to just use the hypothesis->text vector. We will explore this possibility later."
|
||||||
|
]
|
||||||
|
},
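For reference, the steps above can be written compactly (our notation, following Parikh et al's equations; $\bar a_i$ and $\bar b_j$ are the projected embeddings of the text and hypothesis, and $[\cdot;\cdot]$ denotes concatenation):

$$
e_{ij} = F(\bar a_i)^{\top} F(\bar b_j), \qquad
\beta_i = \sum_j \frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}\,\bar b_j, \qquad
\alpha_j = \sum_i \frac{\exp(e_{ij})}{\sum_k \exp(e_{kj})}\,\bar a_i
$$

$$
v_1 = \sum_i G([\bar a_i;\, \beta_i]), \qquad
v_2 = \sum_j G([\bar b_j;\, \alpha_j]), \qquad
\hat y = \operatorname{softmax}\!\big(\mathrm{Dense}(H([v_1;\, v_2]))\big).
$$

In the code below, the two `normalizer` Lambda layers implement the row-wise and column-wise softmaxes, and `sum_word` implements the aggregation sums.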
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We need a couple of little functions for Lambda layers to normalize and aggregate weights:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def normalizer(axis):\n",
|
||||||
|
" def _normalize(att_weights):\n",
|
||||||
|
" exp_weights = K.exp(att_weights)\n",
|
||||||
|
" sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)\n",
|
||||||
|
" return exp_weights/sum_weights\n",
|
||||||
|
" return _normalize\n",
|
||||||
|
"\n",
|
||||||
|
"def sum_word(x):\n",
|
||||||
|
" return K.sum(x, axis=1)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def build_model(vectors, max_length, num_hidden, num_classes, projected_dim, entail_dir='both'):\n",
|
||||||
|
" input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')\n",
|
||||||
|
" input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')\n",
|
||||||
|
" \n",
|
||||||
|
" # embeddings (projected)\n",
|
||||||
|
" embed = create_embedding(vectors, max_length, projected_dim)\n",
|
||||||
|
" \n",
|
||||||
|
" a = embed(input1)\n",
|
||||||
|
" b = embed(input2)\n",
|
||||||
|
" \n",
|
||||||
|
" # step 1: attend\n",
|
||||||
|
" F = create_feedforward(num_hidden)\n",
|
||||||
|
" att_weights = layers.dot([F(a), F(b)], axes=-1)\n",
|
||||||
|
" \n",
|
||||||
|
" G = create_feedforward(num_hidden)\n",
|
||||||
|
" \n",
|
||||||
|
" if entail_dir == 'both':\n",
|
||||||
|
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
||||||
|
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
||||||
|
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
||||||
|
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
||||||
|
"\n",
|
||||||
|
" # step 2: compare\n",
|
||||||
|
" comp1 = layers.concatenate([a, beta])\n",
|
||||||
|
" comp2 = layers.concatenate([b, alpha])\n",
|
||||||
|
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
||||||
|
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
||||||
|
"\n",
|
||||||
|
" # step 3: aggregate\n",
|
||||||
|
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
||||||
|
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
||||||
|
" concat = layers.concatenate([v1_sum, v2_sum])\n",
|
||||||
|
" elif entail_dir == 'left':\n",
|
||||||
|
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
||||||
|
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
||||||
|
" comp2 = layers.concatenate([b, alpha])\n",
|
||||||
|
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
||||||
|
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
||||||
|
" concat = v2_sum\n",
|
||||||
|
" else:\n",
|
||||||
|
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
||||||
|
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
||||||
|
" comp1 = layers.concatenate([a, beta])\n",
|
||||||
|
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
||||||
|
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
||||||
|
" concat = v1_sum\n",
|
||||||
|
" \n",
|
||||||
|
" H = create_feedforward(num_hidden)\n",
|
||||||
|
" out = H(concat)\n",
|
||||||
|
" out = layers.Dense(num_classes, activation='softmax')(out)\n",
|
||||||
|
" \n",
|
||||||
|
" model = Model([input1, input2], out)\n",
|
||||||
|
" \n",
|
||||||
|
" model.compile(optimizer='adam',\n",
|
||||||
|
" loss='categorical_crossentropy',\n",
|
||||||
|
" metrics=['accuracy'])\n",
|
||||||
|
" return model\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"Layer (type) Output Shape Param # Connected to \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"words1 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"words2 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_1 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||||
|
" words2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_2 (Sequential) (None, 50, 200) 80400 sequential_1[1][0] \n",
|
||||||
|
" sequential_1[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_1 (Dot) (None, 50, 50) 0 sequential_2[1][0] \n",
|
||||||
|
" sequential_2[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_2 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_1 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_3 (Dot) (None, 50, 200) 0 lambda_2[0][0] \n",
|
||||||
|
" sequential_1[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_2 (Dot) (None, 50, 200) 0 lambda_1[0][0] \n",
|
||||||
|
" sequential_1[1][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_1 (Concatenate) (None, 50, 400) 0 sequential_1[1][0] \n",
|
||||||
|
" dot_3[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_2 (Concatenate) (None, 50, 400) 0 sequential_1[2][0] \n",
|
||||||
|
" dot_2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_2 (TimeDistrib (None, 50, 200) 120400 concatenate_1[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_3 (TimeDistrib (None, 50, 200) 120400 concatenate_2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_3 (Lambda) (None, 200) 0 time_distributed_2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_4 (Lambda) (None, 200) 0 time_distributed_3[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_3 (Concatenate) (None, 400) 0 lambda_3[0][0] \n",
|
||||||
|
" lambda_4[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_4 (Sequential) (None, 200) 120400 concatenate_3[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dense_8 (Dense) (None, 3) 603 sequential_4[1][0] \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"Total params: 321,703,403\n",
|
||||||
|
"Trainable params: 381,803\n",
|
||||||
|
"Non-trainable params: 321,321,600\n",
|
||||||
|
"__________________________________________________________________________________________________\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"K.clear_session()\n",
|
||||||
|
"m = build_model(sem_vectors, 50, 200, 3, 200)\n",
|
||||||
|
"m.summary()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The number of trainable parameters, ~381k, is the number given by Parikh et al, so we're on the right track."
|
||||||
|
]
|
||||||
|
},
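The trainable count can be checked by hand from the summary above. The two `TimeDistributed` entries wrap the same `G` block, so its weights are only counted once in the trainable total (our arithmetic):

$$
\underbrace{300 \cdot 200}_{\text{projection}}
+ \underbrace{2\,(200 \cdot 200 + 200)}_{F}
+ \underbrace{(400 \cdot 200 + 200) + (200 \cdot 200 + 200)}_{G\ (\text{shared})}
+ \underbrace{(400 \cdot 200 + 200) + (200 \cdot 200 + 200)}_{H}
+ \underbrace{200 \cdot 3 + 3}_{\text{classifier}}
$$

$$
= 60{,}000 + 80{,}400 + 120{,}400 + 120{,}400 + 603 = 381{,}803.
$$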
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Training the model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Parikh et al use tiny batches of 4, training for 50MM batches, which amounts to around 500 epochs. Here we'll use large batches to better use the GPU, and train for fewer epochs -- for purposes of this experiment."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Train on 549367 samples, validate on 9824 samples\n",
|
||||||
|
"Epoch 1/50\n",
|
||||||
|
"549367/549367 [==============================] - 34s 62us/step - loss: 0.7599 - acc: 0.6617 - val_loss: 0.5396 - val_acc: 0.7861\n",
|
||||||
|
"Epoch 2/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5611 - acc: 0.7763 - val_loss: 0.4892 - val_acc: 0.8085\n",
|
||||||
|
"Epoch 3/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5212 - acc: 0.7948 - val_loss: 0.4574 - val_acc: 0.8261\n",
|
||||||
|
"Epoch 4/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4986 - acc: 0.8045 - val_loss: 0.4410 - val_acc: 0.8274\n",
|
||||||
|
"Epoch 5/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4819 - acc: 0.8114 - val_loss: 0.4224 - val_acc: 0.8383\n",
|
||||||
|
"Epoch 6/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4714 - acc: 0.8166 - val_loss: 0.4200 - val_acc: 0.8379\n",
|
||||||
|
"Epoch 7/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4633 - acc: 0.8203 - val_loss: 0.4098 - val_acc: 0.8457\n",
|
||||||
|
"Epoch 8/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4558 - acc: 0.8232 - val_loss: 0.4114 - val_acc: 0.8415\n",
|
||||||
|
"Epoch 9/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4508 - acc: 0.8250 - val_loss: 0.4062 - val_acc: 0.8477\n",
|
||||||
|
"Epoch 10/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4433 - acc: 0.8286 - val_loss: 0.3982 - val_acc: 0.8486\n",
|
||||||
|
"Epoch 11/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4388 - acc: 0.8307 - val_loss: 0.3953 - val_acc: 0.8497\n",
|
||||||
|
"Epoch 12/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4351 - acc: 0.8321 - val_loss: 0.3973 - val_acc: 0.8522\n",
|
||||||
|
"Epoch 13/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4309 - acc: 0.8342 - val_loss: 0.3939 - val_acc: 0.8539\n",
|
||||||
|
"Epoch 14/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4269 - acc: 0.8355 - val_loss: 0.3932 - val_acc: 0.8517\n",
|
||||||
|
"Epoch 15/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4247 - acc: 0.8369 - val_loss: 0.3938 - val_acc: 0.8515\n",
|
||||||
|
"Epoch 16/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4208 - acc: 0.8379 - val_loss: 0.3936 - val_acc: 0.8504\n",
|
||||||
|
"Epoch 17/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4194 - acc: 0.8390 - val_loss: 0.3885 - val_acc: 0.8560\n",
|
||||||
|
"Epoch 18/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4162 - acc: 0.8402 - val_loss: 0.3874 - val_acc: 0.8561\n",
|
||||||
|
"Epoch 19/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4140 - acc: 0.8409 - val_loss: 0.3889 - val_acc: 0.8545\n",
|
||||||
|
"Epoch 20/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4114 - acc: 0.8426 - val_loss: 0.3864 - val_acc: 0.8583\n",
|
||||||
|
"Epoch 21/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4092 - acc: 0.8430 - val_loss: 0.3870 - val_acc: 0.8561\n",
|
||||||
|
"Epoch 22/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4062 - acc: 0.8442 - val_loss: 0.3852 - val_acc: 0.8577\n",
|
||||||
|
"Epoch 23/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4050 - acc: 0.8450 - val_loss: 0.3850 - val_acc: 0.8578\n",
|
||||||
|
"Epoch 24/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4035 - acc: 0.8455 - val_loss: 0.3825 - val_acc: 0.8555\n",
|
||||||
|
"Epoch 25/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4018 - acc: 0.8460 - val_loss: 0.3837 - val_acc: 0.8573\n",
|
||||||
|
"Epoch 26/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3989 - acc: 0.8476 - val_loss: 0.3843 - val_acc: 0.8599\n",
|
||||||
|
"Epoch 27/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3979 - acc: 0.8481 - val_loss: 0.3841 - val_acc: 0.8589\n",
|
||||||
|
"Epoch 28/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3967 - acc: 0.8484 - val_loss: 0.3811 - val_acc: 0.8575\n",
|
||||||
|
"Epoch 29/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3956 - acc: 0.8492 - val_loss: 0.3829 - val_acc: 0.8589\n",
|
||||||
|
"Epoch 30/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3938 - acc: 0.8499 - val_loss: 0.3859 - val_acc: 0.8562\n",
|
||||||
|
"Epoch 31/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3925 - acc: 0.8500 - val_loss: 0.3798 - val_acc: 0.8587\n",
|
||||||
|
"Epoch 32/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3906 - acc: 0.8509 - val_loss: 0.3834 - val_acc: 0.8569\n",
|
||||||
|
"Epoch 33/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3893 - acc: 0.8511 - val_loss: 0.3806 - val_acc: 0.8588\n",
|
||||||
|
"Epoch 34/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3885 - acc: 0.8515 - val_loss: 0.3828 - val_acc: 0.8603\n",
|
||||||
|
"Epoch 35/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3879 - acc: 0.8520 - val_loss: 0.3800 - val_acc: 0.8594\n",
|
||||||
|
"Epoch 36/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3860 - acc: 0.8530 - val_loss: 0.3796 - val_acc: 0.8577\n",
|
||||||
|
"Epoch 37/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3856 - acc: 0.8532 - val_loss: 0.3857 - val_acc: 0.8591\n",
|
||||||
|
"Epoch 38/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3838 - acc: 0.8535 - val_loss: 0.3835 - val_acc: 0.8603\n",
|
||||||
|
"Epoch 39/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3830 - acc: 0.8543 - val_loss: 0.3830 - val_acc: 0.8599\n",
|
||||||
|
"Epoch 40/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3818 - acc: 0.8548 - val_loss: 0.3832 - val_acc: 0.8559\n",
|
||||||
|
"Epoch 41/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3806 - acc: 0.8551 - val_loss: 0.3845 - val_acc: 0.8553\n",
|
||||||
|
"Epoch 42/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3803 - acc: 0.8550 - val_loss: 0.3789 - val_acc: 0.8617\n",
|
||||||
|
"Epoch 43/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3791 - acc: 0.8556 - val_loss: 0.3835 - val_acc: 0.8580\n",
|
||||||
|
"Epoch 44/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3778 - acc: 0.8565 - val_loss: 0.3799 - val_acc: 0.8580\n",
|
||||||
|
"Epoch 45/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3766 - acc: 0.8571 - val_loss: 0.3790 - val_acc: 0.8625\n",
|
||||||
|
"Epoch 46/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3770 - acc: 0.8569 - val_loss: 0.3820 - val_acc: 0.8590\n",
|
||||||
|
"Epoch 47/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3761 - acc: 0.8573 - val_loss: 0.3831 - val_acc: 0.8581\n",
|
||||||
|
"Epoch 48/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3739 - acc: 0.8579 - val_loss: 0.3828 - val_acc: 0.8599\n",
|
||||||
|
"Epoch 49/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3738 - acc: 0.8577 - val_loss: 0.3785 - val_acc: 0.8590\n",
|
||||||
|
"Epoch 50/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3726 - acc: 0.8580 - val_loss: 0.3820 - val_acc: 0.8585\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<keras.callbacks.History at 0x7f5c9f49c438>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The result is broadly in the region reported by Parikh et al: ~86 vs 86.3%. The small difference might be accounted by differences in `max_length` (here set at 50), in the training regime, and that here we use Keras' built-in validation splitting rather than the SNLI test set."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Experiment: the asymmetric model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"It was suggested earlier that, based on the semantics of entailment, the vector representing the strength of association between the hypothesis to the text is all that is needed for classifying the entailment.\n",
|
||||||
|
"\n",
|
||||||
|
"The following model removes consideration of the complementary vector (text to hypothesis) from the computation. This will decrease the paramater count slightly, because the final dense layers will be smaller, and speed up the forward pass when predicting, because fewer calculations will be needed."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"Layer (type) Output Shape Param # Connected to \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"words2 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"words1 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_5 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||||
|
" words2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_6 (Sequential) (None, 50, 200) 80400 sequential_5[1][0] \n",
|
||||||
|
" sequential_5[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_4 (Dot) (None, 50, 50) 0 sequential_6[1][0] \n",
|
||||||
|
" sequential_6[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_5 (Lambda) (None, 50, 50) 0 dot_4[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_5 (Dot) (None, 50, 200) 0 lambda_5[0][0] \n",
|
||||||
|
" sequential_5[1][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_4 (Concatenate) (None, 50, 400) 0 sequential_5[2][0] \n",
|
||||||
|
" dot_5[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_5 (TimeDistrib (None, 50, 200) 120400 concatenate_4[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_6 (Lambda) (None, 200) 0 time_distributed_5[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_8 (Sequential) (None, 200) 80400 lambda_6[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dense_16 (Dense) (None, 3) 603 sequential_8[1][0] \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"Total params: 321,663,403\n",
|
||||||
|
"Trainable params: 341,803\n",
|
||||||
|
"Non-trainable params: 321,321,600\n",
|
||||||
|
"__________________________________________________________________________________________________\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m1 = build_model(sem_vectors, 50, 200, 3, 200, 'left')\n",
|
||||||
|
"m1.summary()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The parameter count has indeed decreased by 40,000, corresponding to the 200x200 smaller H function."
|
||||||
|
]
|
||||||
|
},
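Concretely (our arithmetic, from the two summaries): with a single aggregate vector, the first dense layer of H sees a 200-dimensional input instead of the 400-dimensional concatenation, so it needs $200 \cdot 200$ fewer weights:

$$
(400 - 200) \cdot 200 = 40{,}000 = 381{,}803 - 341{,}803.
$$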
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Train on 549367 samples, validate on 9824 samples\n",
|
||||||
|
"Epoch 1/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 46us/step - loss: 0.7331 - acc: 0.6770 - val_loss: 0.5257 - val_acc: 0.7936\n",
|
||||||
|
"Epoch 2/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5518 - acc: 0.7799 - val_loss: 0.4717 - val_acc: 0.8159\n",
|
||||||
|
"Epoch 3/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5147 - acc: 0.7967 - val_loss: 0.4449 - val_acc: 0.8278\n",
|
||||||
|
"Epoch 4/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4948 - acc: 0.8060 - val_loss: 0.4326 - val_acc: 0.8344\n",
|
||||||
|
"Epoch 5/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4814 - acc: 0.8122 - val_loss: 0.4247 - val_acc: 0.8359\n",
|
||||||
|
"Epoch 6/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4712 - acc: 0.8162 - val_loss: 0.4143 - val_acc: 0.8430\n",
|
||||||
|
"Epoch 7/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4635 - acc: 0.8205 - val_loss: 0.4172 - val_acc: 0.8401\n",
|
||||||
|
"Epoch 8/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4570 - acc: 0.8223 - val_loss: 0.4106 - val_acc: 0.8422\n",
|
||||||
|
"Epoch 9/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4505 - acc: 0.8259 - val_loss: 0.4043 - val_acc: 0.8451\n",
|
||||||
|
"Epoch 10/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4459 - acc: 0.8280 - val_loss: 0.4050 - val_acc: 0.8467\n",
|
||||||
|
"Epoch 11/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4405 - acc: 0.8300 - val_loss: 0.3975 - val_acc: 0.8481\n",
|
||||||
|
"Epoch 12/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4360 - acc: 0.8324 - val_loss: 0.4026 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 13/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4327 - acc: 0.8334 - val_loss: 0.4024 - val_acc: 0.8471\n",
|
||||||
|
"Epoch 14/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4293 - acc: 0.8350 - val_loss: 0.3955 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 15/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4263 - acc: 0.8369 - val_loss: 0.3980 - val_acc: 0.8490\n",
|
||||||
|
"Epoch 16/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4236 - acc: 0.8377 - val_loss: 0.3958 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 17/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4213 - acc: 0.8384 - val_loss: 0.3954 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 18/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4187 - acc: 0.8394 - val_loss: 0.3929 - val_acc: 0.8514\n",
|
||||||
|
"Epoch 19/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4157 - acc: 0.8409 - val_loss: 0.3939 - val_acc: 0.8507\n",
|
||||||
|
"Epoch 20/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4135 - acc: 0.8417 - val_loss: 0.3953 - val_acc: 0.8522\n",
|
||||||
|
"Epoch 21/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4122 - acc: 0.8424 - val_loss: 0.3974 - val_acc: 0.8506\n",
|
||||||
|
"Epoch 22/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4099 - acc: 0.8435 - val_loss: 0.3918 - val_acc: 0.8522\n",
|
||||||
|
"Epoch 23/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4075 - acc: 0.8443 - val_loss: 0.3901 - val_acc: 0.8513\n",
|
||||||
|
"Epoch 24/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 44us/step - loss: 0.4067 - acc: 0.8447 - val_loss: 0.3885 - val_acc: 0.8543\n",
|
||||||
|
"Epoch 25/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4047 - acc: 0.8454 - val_loss: 0.3846 - val_acc: 0.8531\n",
|
||||||
|
"Epoch 26/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4031 - acc: 0.8461 - val_loss: 0.3864 - val_acc: 0.8562\n",
|
||||||
|
"Epoch 27/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4020 - acc: 0.8467 - val_loss: 0.3874 - val_acc: 0.8546\n",
|
||||||
|
"Epoch 28/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4001 - acc: 0.8473 - val_loss: 0.3848 - val_acc: 0.8534\n",
|
||||||
|
"Epoch 29/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3991 - acc: 0.8479 - val_loss: 0.3865 - val_acc: 0.8562\n",
|
||||||
|
"Epoch 30/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3976 - acc: 0.8484 - val_loss: 0.3833 - val_acc: 0.8574\n",
|
||||||
|
"Epoch 31/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3961 - acc: 0.8487 - val_loss: 0.3846 - val_acc: 0.8585\n",
|
||||||
|
"Epoch 32/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3942 - acc: 0.8498 - val_loss: 0.3805 - val_acc: 0.8573\n",
|
||||||
|
"Epoch 33/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 44us/step - loss: 0.3935 - acc: 0.8503 - val_loss: 0.3856 - val_acc: 0.8579\n",
|
||||||
|
"Epoch 34/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3923 - acc: 0.8507 - val_loss: 0.3829 - val_acc: 0.8560\n",
|
||||||
|
"Epoch 35/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3920 - acc: 0.8508 - val_loss: 0.3864 - val_acc: 0.8575\n",
|
||||||
|
"Epoch 36/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3907 - acc: 0.8516 - val_loss: 0.3873 - val_acc: 0.8563\n",
|
||||||
|
"Epoch 37/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3891 - acc: 0.8519 - val_loss: 0.3850 - val_acc: 0.8570\n",
|
||||||
|
"Epoch 38/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3872 - acc: 0.8522 - val_loss: 0.3815 - val_acc: 0.8591\n",
|
||||||
|
"Epoch 39/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3887 - acc: 0.8520 - val_loss: 0.3829 - val_acc: 0.8590\n",
|
||||||
|
"Epoch 40/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3868 - acc: 0.8531 - val_loss: 0.3807 - val_acc: 0.8600\n",
|
||||||
|
"Epoch 41/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3859 - acc: 0.8537 - val_loss: 0.3832 - val_acc: 0.8574\n",
|
||||||
|
"Epoch 42/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3849 - acc: 0.8537 - val_loss: 0.3850 - val_acc: 0.8576\n",
|
||||||
|
"Epoch 43/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3834 - acc: 0.8541 - val_loss: 0.3825 - val_acc: 0.8563\n",
|
||||||
|
"Epoch 44/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3829 - acc: 0.8548 - val_loss: 0.3844 - val_acc: 0.8540\n",
|
||||||
|
"Epoch 45/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8552 - val_loss: 0.3841 - val_acc: 0.8559\n",
|
||||||
|
"Epoch 46/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8549 - val_loss: 0.3880 - val_acc: 0.8567\n",
|
||||||
|
"Epoch 47/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.3799 - acc: 0.8559 - val_loss: 0.3767 - val_acc: 0.8635\n",
|
||||||
|
"Epoch 48/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3800 - acc: 0.8560 - val_loss: 0.3786 - val_acc: 0.8563\n",
|
||||||
|
"Epoch 49/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3781 - acc: 0.8563 - val_loss: 0.3812 - val_acc: 0.8596\n",
|
||||||
|
"Epoch 50/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3788 - acc: 0.8560 - val_loss: 0.3782 - val_acc: 0.8601\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<keras.callbacks.History at 0x7f5ca1bf3e48>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m1.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This model performs the same as the slightly more complex model that evaluates alignments in both directions. Note also that processing time is improved, from 64 down to 48 microseconds per step. \n",
|
||||||
|
"\n",
|
||||||
|
"Let's now look at an asymmetric model that evaluates text to hypothesis comparisons. The prediction is that such a model will correctly classify a decent proportion of the exemplars, but not as accurately as the previous two.\n",
|
||||||
|
"\n",
|
||||||
|
"We'll just use 10 epochs for expediency."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 96,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"Layer (type) Output Shape Param # Connected to \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"words1 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"words2 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_13 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||||
|
" words2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_14 (Sequential) (None, 50, 200) 80400 sequential_13[1][0] \n",
|
||||||
|
" sequential_13[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_8 (Dot) (None, 50, 50) 0 sequential_14[1][0] \n",
|
||||||
|
" sequential_14[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_9 (Lambda) (None, 50, 50) 0 dot_8[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_9 (Dot) (None, 50, 200) 0 lambda_9[0][0] \n",
|
||||||
|
" sequential_13[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_6 (Concatenate) (None, 50, 400) 0 sequential_13[1][0] \n",
|
||||||
|
" dot_9[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_9 (TimeDistrib (None, 50, 200) 120400 concatenate_6[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_10 (Lambda) (None, 200) 0 time_distributed_9[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_16 (Sequential) (None, 200) 80400 lambda_10[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dense_32 (Dense) (None, 3) 603 sequential_16[1][0] \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"Total params: 321,663,403\n",
|
||||||
|
"Trainable params: 341,803\n",
|
||||||
|
"Non-trainable params: 321,321,600\n",
|
||||||
|
"__________________________________________________________________________________________________\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m2 = build_model(sem_vectors, 50, 200, 3, 200, 'right')\n",
|
||||||
|
"m2.summary()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 97,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Train on 455226 samples, validate on 113807 samples\n",
|
||||||
|
"Epoch 1/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 49us/step - loss: 0.8920 - acc: 0.5771 - val_loss: 0.8001 - val_acc: 0.6435\n",
|
||||||
|
"Epoch 2/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7808 - acc: 0.6553 - val_loss: 0.7267 - val_acc: 0.6855\n",
|
||||||
|
"Epoch 3/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7329 - acc: 0.6825 - val_loss: 0.6966 - val_acc: 0.7006\n",
|
||||||
|
"Epoch 4/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7055 - acc: 0.6978 - val_loss: 0.6713 - val_acc: 0.7150\n",
|
||||||
|
"Epoch 5/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6862 - acc: 0.7081 - val_loss: 0.6533 - val_acc: 0.7253\n",
|
||||||
|
"Epoch 6/10\n",
|
||||||
|
"455226/455226 [==============================] - 21s 47us/step - loss: 0.6694 - acc: 0.7179 - val_loss: 0.6472 - val_acc: 0.7277\n",
|
||||||
|
"Epoch 7/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6555 - acc: 0.7252 - val_loss: 0.6338 - val_acc: 0.7347\n",
|
||||||
|
"Epoch 8/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 48us/step - loss: 0.6434 - acc: 0.7310 - val_loss: 0.6246 - val_acc: 0.7385\n",
|
||||||
|
"Epoch 9/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6325 - acc: 0.7367 - val_loss: 0.6164 - val_acc: 0.7424\n",
|
||||||
|
"Epoch 10/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6216 - acc: 0.7426 - val_loss: 0.6082 - val_acc: 0.7478\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<keras.callbacks.History at 0x7fa6850cf080>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 97,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m2.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=10,validation_split=.2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Comparing this fit to the validation accuracy of the previous two models after 10 epochs, we observe that its accuracy is roughly 10% lower.\n",
|
||||||
|
"\n",
|
||||||
|
"It is reassuring that the neural modeling here reproduces what we know from the semantics of natural language!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.5.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
27
examples/pipeline/fix_space_entities.py
Normal file
27
examples/pipeline/fix_space_entities.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
'''Demonstrate adding a rule-based component that forces some tokens to not
|
||||||
|
be entities, before the NER tagger is applied. This is used to hotfix the issue
|
||||||
|
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.
|
||||||
|
'''
|
||||||
|
import spacy
|
||||||
|
from spacy.attrs import ENT_IOB
|
||||||
|
|
||||||
|
def fix_space_tags(doc):
|
||||||
|
ent_iobs = doc.to_array([ENT_IOB])
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
if token.is_space:
|
||||||
|
# Set the 'O' tag on the ENT_IOB attribute (0 = unset, 1 = 'I', 2 = 'O', 3 = 'B')
|
||||||
|
ent_iobs[i] = 2
|
||||||
|
doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def main():
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
text = u'''This is some crazy test where I dont need an Apple Watch to make things bug'''
|
||||||
|
doc = nlp(text)
|
||||||
|
print('Before', doc.ents)
|
||||||
|
nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
|
||||||
|
doc = nlp(text)
|
||||||
|
print('After', doc.ents)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import spacy
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data: texts, heads and dependency labels
|
# training data: texts, heads and dependency labels
|
||||||
|
@ -63,7 +64,7 @@ TRAIN_DATA = [
|
||||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int))
|
||||||
def main(model=None, output_dir=None, n_iter=5):
|
def main(model=None, output_dir=None, n_iter=15):
|
||||||
"""Load the model, set up the pipeline and train the parser."""
|
"""Load the model, set up the pipeline and train the parser."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_model(nlp)
|
test_model(nlp)
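Since several example scripts now switch to this batching pattern, here is a self-contained sketch of how it behaves. This is illustrative only: the toy `TRAIN_DATA` below is made up, and it relies on `spacy.util.minibatch` accepting a generator of batch sizes, as in the diffs above.

```python
from spacy.util import minibatch, compounding

# compounding(4., 32., 1.001) yields an endless stream of batch sizes that
# starts at 4 and grows by a factor of 1.001 per draw, capped at 32 -- so
# early updates use small, noisy batches and later ones larger, more stable
# batches.
TRAIN_DATA = [("sentence %d" % i, {"entities": []}) for i in range(256)]

for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
    texts, annotations = zip(*batch)
    # In the real scripts this is where the update happens:
    # nlp.update(texts, annotations, sgd=optimizer, losses=losses)
    print(len(batch), texts[0])
```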
|
||||||
|
@ -135,7 +139,8 @@ if __name__ == '__main__':
|
||||||
# [
|
# [
|
||||||
# ('find', 'ROOT', 'find'),
|
# ('find', 'ROOT', 'find'),
|
||||||
# ('cheapest', 'QUALITY', 'gym'),
|
# ('cheapest', 'QUALITY', 'gym'),
|
||||||
# ('gym', 'PLACE', 'find')
|
# ('gym', 'PLACE', 'find'),
|
||||||
|
# ('near', 'ATTRIBUTE', 'gym'),
|
||||||
# ('work', 'LOCATION', 'near')
|
# ('work', 'LOCATION', 'near')
|
||||||
# ]
|
# ]
|
||||||
# show me the best hotel in berlin
|
# show me the best hotel in berlin
|
||||||
|
|
|
@ -15,6 +15,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
|
@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(
|
nlp.update(
|
||||||
[text], # batch of texts
|
texts, # batch of texts
|
||||||
[annotations], # batch of annotations
|
annotations, # batch of annotations
|
||||||
drop=0.5, # dropout - make it harder to memorise data
|
drop=0.5, # dropout - make it harder to memorise data
|
||||||
sgd=optimizer, # callable to update weights
|
sgd=optimizer, # callable to update weights
|
||||||
losses=losses)
|
losses=losses)
|
||||||
print(losses)
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
for text, _ in TRAIN_DATA:
|
for text, _ in TRAIN_DATA:
|
||||||
|
|
|
@ -31,6 +31,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# new entity label
|
# new entity label
|
||||||
|
@ -73,7 +74,7 @@ TRAIN_DATA = [
|
||||||
new_model_name=("New model name for model meta.", "option", "nm", str),
|
new_model_name=("New model name for model meta.", "option", "nm", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int))
|
||||||
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
|
||||||
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
|
||||||
losses=losses)
|
losses=losses)
|
||||||
print(losses)
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = 'Do you like horses?'
|
test_text = 'Do you like horses?'
|
||||||
|
|
|
@ -13,6 +13,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
|
@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like securities."
|
test_text = "I like securities."
|
||||||
|
|
|
@ -16,6 +16,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# You need to define a mapping from your data's part-of-speech tag names to the
|
# You need to define a mapping from your data's part-of-speech tag names to the
|
||||||
|
@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like blue eggs"
|
test_text = "I like blue eggs"
|
||||||
|
|
|
@@ -2,7 +2,7 @@ cython>=0.25
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc==7.0.0.dev1
+thinc==7.0.0.dev2
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0

@@ -11,7 +11,11 @@ ujson>=1.35
 dill>=0.2,<0.3
 regex==2018.01.10
 requests>=2.13.0,<3.0.0
+jsonschema>=2.6.0,<3.0.0
+wasabi>=0.0.8,<1.1.0
+pathlib==1.0.1; python_version < "3.4"
+# Development dependencies
 pytest>=4.0.0,<5.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-pathlib==1.0.1; python_version < "3.4"
+flake8>=3.5.0,<3.6.0
setup.py | 4

@@ -200,13 +200,15 @@ def setup_package():
             "murmurhash>=0.28.0,<1.1.0",
             "cymem>=2.0.2,<2.1.0",
             "preshed>=2.0.1,<2.1.0",
-            "thinc==7.0.0.dev1",
+            "thinc==7.0.0.dev2",
             "blis>=0.2.2,<0.3.0",
             "plac<1.0.0,>=0.9.6",
             "ujson>=1.35",
             "regex==2018.01.10",
             "dill>=0.2,<0.3",
             "requests>=2.13.0,<3.0.0",
+            "jsonschema>=2.6.0,<3.0.0",
+            "wasabi>=0.0.8,<1.1.0",
            'pathlib==1.0.1; python_version < "3.4"',
         ],
         setup_requires=["wheel"],
@@ -1,9 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 import warnings
+
 warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

+# These are imported as part of the API
+from thinc.neural.util import prefer_gpu, require_gpu
+
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__

@@ -12,7 +16,7 @@ from . import util


 def load(name, **overrides):
-    depr_path = overrides.get('path')
+    depr_path = overrides.get("path")
     if depr_path not in (True, False, None):
         deprecation_warning(Warnings.W001.format(path=depr_path))
     return util.load_model(name, **overrides)
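The new top-level imports above mean `prefer_gpu` and `require_gpu` become available directly on the `spacy` package. A minimal usage sketch (the model name is only an illustration):

    import spacy

    # prefer_gpu() returns True if a GPU was activated, False if spaCy falls
    # back to CPU; require_gpu() would raise an error instead of falling back.
    is_using_gpu = spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_sm")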
@@ -1,40 +1,41 @@
 # coding: utf8
 from __future__ import print_function

 # NB! This breaks in plac on Python 2!!
 # from __future__ import unicode_literals

-if __name__ == '__main__':
+if __name__ == "__main__":
     import plac
     import sys
+    from wasabi import Printer
     from spacy.cli import download, link, info, package, train, pretrain, convert
-    from spacy.cli import vocab, init_model, profile, evaluate, validate
-    from spacy.cli import ud_train, ud_evaluate
-    from spacy.util import prints
+    from spacy.cli import init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate, debug_data
+
+    msg = Printer()

     commands = {
-        'download': download,
-        'link': link,
-        'info': info,
-        'train': train,
-        'pretrain': pretrain,
-        'ud-train': ud_train,
-        'evaluate': evaluate,
-        'ud-evaluate': ud_evaluate,
-        'convert': convert,
-        'package': package,
-        'vocab': vocab,
-        'init-model': init_model,
-        'profile': profile,
-        'validate': validate
+        "download": download,
+        "link": link,
+        "info": info,
+        "train": train,
+        "pretrain": pretrain,
+        "debug-data": debug_data,
+        "ud-train": ud_train,
+        "evaluate": evaluate,
+        "ud-evaluate": ud_evaluate,
+        "convert": convert,
+        "package": package,
+        "init-model": init_model,
+        "profile": profile,
+        "validate": validate,
     }
     if len(sys.argv) == 1:
-        prints(', '.join(commands), title="Available commands", exits=1)
+        msg.info("Available commands", ", ".join(commands), exits=1)
     command = sys.argv.pop(1)
-    sys.argv[0] = 'spacy %s' % command
+    sys.argv[0] = "spacy %s" % command
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        prints(
-            "Available: %s" % ', '.join(commands),
-            title="Unknown command: %s" % command,
-            exits=1)
+        available = "Available: {}".format(", ".join(commands))
+        msg.fail("Unknown command: {}".format(command), available, exits=1)
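The CLI output above now goes through wasabi's Printer instead of the old `util.prints` helper. A short sketch of the Printer calls used here, as suggested by the diff (`exits=1` makes the printer exit the process after printing):

    from wasabi import Printer

    msg = Printer()
    msg.info("Available commands", "download, link, info, ...")
    msg.good("Everything looks fine")
    # msg.fail("Unknown command: foo", "Available: download, link, ...", exits=1)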

spacy/_ml.py | 303
@ -14,8 +14,7 @@ from thinc.api import uniqued, wrap, noop
|
||||||
from thinc.api import with_square_sequences
|
from thinc.api import with_square_sequences
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module, copy_array
|
from thinc.neural.util import get_array_module
|
||||||
from thinc.neural._lsuv import svd_orthonormal
|
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
from thinc import describe
|
from thinc import describe
|
||||||
|
@ -30,39 +29,39 @@ from . import util
|
||||||
try:
|
try:
|
||||||
import torch.nn
|
import torch.nn
|
||||||
from thinc.extra.wrappers import PyTorchWrapperRNN
|
from thinc.extra.wrappers import PyTorchWrapperRNN
|
||||||
except:
|
except ImportError:
|
||||||
torch = None
|
torch = None
|
||||||
|
|
||||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
VECTORS_KEY = "spacy_pretrained_vectors"
|
||||||
|
|
||||||
|
|
||||||
def cosine(vec1, vec2):
|
def cosine(vec1, vec2):
|
||||||
xp = get_array_module(vec1)
|
xp = get_array_module(vec1)
|
||||||
norm1 = xp.linalg.norm(vec1)
|
norm1 = xp.linalg.norm(vec1)
|
||||||
norm2 = xp.linalg.norm(vec2)
|
norm2 = xp.linalg.norm(vec2)
|
||||||
if norm1 == 0. or norm2 == 0.:
|
if norm1 == 0.0 or norm2 == 0.0:
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return vec1.dot(vec2) / (norm1 * norm2)
|
return vec1.dot(vec2) / (norm1 * norm2)
|
||||||
|
|
||||||
|
|
||||||
def create_default_optimizer(ops, **cfg):
|
def create_default_optimizer(ops, **cfg):
|
||||||
learn_rate = util.env_opt('learn_rate', 0.001)
|
learn_rate = util.env_opt("learn_rate", 0.001)
|
||||||
beta1 = util.env_opt('optimizer_B1', 0.8)
|
beta1 = util.env_opt("optimizer_B1", 0.8)
|
||||||
beta2 = util.env_opt('optimizer_B2', 0.8)
|
beta2 = util.env_opt("optimizer_B2", 0.8)
|
||||||
eps = util.env_opt('optimizer_eps', 0.00001)
|
eps = util.env_opt("optimizer_eps", 0.00001)
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt("L2_penalty", 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 5.)
|
max_grad_norm = util.env_opt("grad_norm_clip", 5.0)
|
||||||
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
|
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
|
||||||
beta2=beta2, eps=eps)
|
|
||||||
optimizer.max_grad_norm = max_grad_norm
|
optimizer.max_grad_norm = max_grad_norm
|
||||||
optimizer.device = ops.device
|
optimizer.device = ops.device
|
||||||
return optimizer
|
return optimizer
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=pad)
|
return ops.unflatten(d_X, lengths, pad=pad)
|
||||||
|
@ -74,14 +73,15 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||||
def _zero_init(model):
|
def _zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
if model.W is not None:
|
if model.W is not None:
|
||||||
model.W.fill(0.)
|
model.W.fill(0.0)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc(docs, drop=0.):
|
def _preprocess_doc(docs, drop=0.0):
|
||||||
keys = [doc.to_array(LOWER) for doc in docs]
|
keys = [doc.to_array(LOWER) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
# The dtype here matches what thinc is expecting -- which differs per
|
# The dtype here matches what thinc is expecting -- which differs per
|
||||||
|
@ -89,11 +89,12 @@ def _preprocess_doc(docs, drop=0.):
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||||
keys = ops.xp.concatenate(keys)
|
keys = ops.xp.concatenate(keys)
|
||||||
vals = ops.allocate(keys.shape) + 1.
|
vals = ops.allocate(keys.shape) + 1.0
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc_bigrams(docs, drop=0.):
|
def _preprocess_doc_bigrams(docs, drop=0.0):
|
||||||
unigrams = [doc.to_array(LOWER) for doc in docs]
|
unigrams = [doc.to_array(LOWER) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
||||||
|
@ -104,27 +105,29 @@ def _preprocess_doc_bigrams(docs, drop=0.):
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||||
keys = ops.xp.concatenate(keys)
|
keys = ops.xp.concatenate(keys)
|
||||||
vals = ops.asarray(ops.xp.concatenate(vals), dtype='f')
|
vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
@describe.on_data(_set_dimensions_if_needed,
|
@describe.on_data(
|
||||||
lambda model, X, y: model.init_weights(model))
|
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
|
||||||
|
)
|
||||||
@describe.attributes(
|
@describe.attributes(
|
||||||
nI=Dimension("Input size"),
|
nI=Dimension("Input size"),
|
||||||
nF=Dimension("Number of features"),
|
nF=Dimension("Number of features"),
|
||||||
nO=Dimension("Output size"),
|
nO=Dimension("Output size"),
|
||||||
nP=Dimension("Maxout pieces"),
|
nP=Dimension("Maxout pieces"),
|
||||||
W=Synapses("Weights matrix",
|
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
||||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
|
||||||
b=Biases("Bias vector",
|
pad=Synapses(
|
||||||
lambda obj: (obj.nO, obj.nP)),
|
"Pad",
|
||||||
pad=Synapses("Pad",
|
|
||||||
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
||||||
lambda M, ops: ops.normal_init(M, 1.)),
|
lambda M, ops: ops.normal_init(M, 1.0),
|
||||||
|
),
|
||||||
d_W=Gradient("W"),
|
d_W=Gradient("W"),
|
||||||
d_pad=Gradient("pad"),
|
d_pad=Gradient("pad"),
|
||||||
d_b=Gradient("b"))
|
d_b=Gradient("b"),
|
||||||
|
)
|
||||||
class PrecomputableAffine(Model):
|
class PrecomputableAffine(Model):
|
||||||
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
||||||
Model.__init__(self, **kwargs)
|
Model.__init__(self, **kwargs)
|
||||||
|
@ -133,9 +136,10 @@ class PrecomputableAffine(Model):
|
||||||
self.nI = nI
|
self.nI = nI
|
||||||
self.nF = nF
|
self.nF = nF
|
||||||
|
|
||||||
def begin_update(self, X, drop=0.):
|
def begin_update(self, X, drop=0.0):
|
||||||
Yf = self.ops.gemm(X,
|
Yf = self.ops.gemm(
|
||||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
|
X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
|
||||||
|
)
|
||||||
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
||||||
Yf = self._add_padding(Yf)
|
Yf = self._add_padding(Yf)
|
||||||
|
|
||||||
|
@ -146,15 +150,16 @@ class PrecomputableAffine(Model):
|
||||||
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
||||||
|
|
||||||
self.d_b += dY.sum(axis=0)
|
self.d_b += dY.sum(axis=0)
|
||||||
dY = dY.reshape((dY.shape[0], self.nO*self.nP))
|
dY = dY.reshape((dY.shape[0], self.nO * self.nP))
|
||||||
|
|
||||||
Wopfi = self.W.transpose((1, 2, 0, 3))
|
Wopfi = self.W.transpose((1, 2, 0, 3))
|
||||||
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
||||||
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
|
||||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
|
||||||
|
|
||||||
# Reuse the buffer
|
# Reuse the buffer
|
||||||
dWopfi = Wopfi; dWopfi.fill(0.)
|
dWopfi = Wopfi
|
||||||
|
dWopfi.fill(0.0)
|
||||||
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
||||||
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
# (o, p, f, i) --> (f, o, p, i)
|
||||||
|
@ -163,6 +168,7 @@ class PrecomputableAffine(Model):
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||||
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
||||||
|
|
||||||
return Yf, backward
|
return Yf, backward
|
||||||
|
|
||||||
def _add_padding(self, Yf):
|
def _add_padding(self, Yf):
|
||||||
|
@ -171,7 +177,7 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
def _backprop_padding(self, dY, ids):
|
def _backprop_padding(self, dY, ids):
|
||||||
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
||||||
mask = ids < 0.
|
mask = ids < 0.0
|
||||||
mask = mask.sum(axis=1)
|
mask = mask.sum(axis=1)
|
||||||
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
||||||
self.d_pad += d_pad.sum(axis=0)
|
self.d_pad += d_pad.sum(axis=0)
|
||||||
|
@ -179,33 +185,36 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def init_weights(model):
|
def init_weights(model):
|
||||||
'''This is like the 'layer sequential unit variance', but instead
|
"""This is like the 'layer sequential unit variance', but instead
|
||||||
of taking the actual inputs, we randomly generate whitened data.
|
of taking the actual inputs, we randomly generate whitened data.
|
||||||
|
|
||||||
Why's this all so complicated? We have a huge number of inputs,
|
Why's this all so complicated? We have a huge number of inputs,
|
||||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||||
we set the maxout weights to values that empirically result in
|
we set the maxout weights to values that empirically result in
|
||||||
whitened outputs given whitened inputs.
|
whitened outputs given whitened inputs.
|
||||||
'''
|
"""
|
||||||
if (model.W**2).sum() != 0.:
|
if (model.W ** 2).sum() != 0.0:
|
||||||
return
|
return
|
||||||
ops = model.ops
|
ops = model.ops
|
||||||
xp = ops.xp
|
xp = ops.xp
|
||||||
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
||||||
|
|
||||||
ids = ops.allocate((5000, model.nF), dtype='f')
|
ids = ops.allocate((5000, model.nF), dtype="f")
|
||||||
ids += xp.random.uniform(0, 1000, ids.shape)
|
ids += xp.random.uniform(0, 1000, ids.shape)
|
||||||
ids = ops.asarray(ids, dtype='i')
|
ids = ops.asarray(ids, dtype="i")
|
||||||
tokvecs = ops.allocate((5000, model.nI), dtype='f')
|
tokvecs = ops.allocate((5000, model.nI), dtype="f")
|
||||||
tokvecs += xp.random.normal(loc=0., scale=1.,
|
tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||||
size=tokvecs.size).reshape(tokvecs.shape)
|
tokvecs.shape
|
||||||
|
)
|
||||||
|
|
||||||
def predict(ids, tokvecs):
|
def predict(ids, tokvecs):
|
||||||
# nS ids. nW tokvecs. Exclude the padding array.
|
# nS ids. nW tokvecs. Exclude the padding array.
|
||||||
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
||||||
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype='f')
|
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
|
||||||
# need nS vectors
|
# need nS vectors
|
||||||
hiddens = hiddens.reshape((hiddens.shape[0] * model.nF, model.nO * model.nP))
|
hiddens = hiddens.reshape(
|
||||||
|
(hiddens.shape[0] * model.nF, model.nO * model.nP)
|
||||||
|
)
|
||||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||||
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
||||||
vectors += model.b
|
vectors += model.b
|
||||||
|
@ -238,7 +247,8 @@ def link_vectors_to_models(vocab):
|
||||||
if vectors.data.size != 0:
|
if vectors.data.size != 0:
|
||||||
print(
|
print(
|
||||||
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
||||||
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
|
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape
|
||||||
|
)
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
for word in vocab:
|
for word in vocab:
|
||||||
if word.orth in vectors.key2row:
|
if word.orth in vectors.key2row:
|
||||||
|
@ -254,28 +264,31 @@ def link_vectors_to_models(vocab):
|
||||||
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
||||||
if depth == 0:
|
if depth == 0:
|
||||||
return layerize(noop())
|
return layerize(noop())
|
||||||
model = torch.nn.LSTM(nI, nO//2, depth, bidirectional=True, dropout=dropout)
|
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
|
||||||
return with_square_sequences(PyTorchWrapperRNN(model))
|
return with_square_sequences(PyTorchWrapperRNN(model))
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, **kwargs):
|
def Tok2Vec(width, embed_size, **kwargs):
|
||||||
pretrained_vectors = kwargs.get('pretrained_vectors', None)
|
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
||||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 2)
|
||||||
subword_features = kwargs.get('subword_features', True)
|
subword_features = kwargs.get("subword_features", True)
|
||||||
conv_depth = kwargs.get('conv_depth', 4)
|
conv_depth = kwargs.get("conv_depth", 4)
|
||||||
bilstm_depth = kwargs.get('bilstm_depth', 0)
|
bilstm_depth = kwargs.get("bilstm_depth", 0)
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
|
with Model.define_operators(
|
||||||
'+': add, '*': reapply}):
|
{">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
|
||||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
|
):
|
||||||
name='embed_norm')
|
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
|
prefix = HashEmbed(
|
||||||
name='embed_prefix')
|
width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
|
||||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
|
)
|
||||||
name='embed_suffix')
|
suffix = HashEmbed(
|
||||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
|
width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
|
||||||
name='embed_shape')
|
)
|
||||||
|
shape = HashEmbed(
|
||||||
|
width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
prefix, suffix, shape = (None, None, None)
|
prefix, suffix, shape = (None, None, None)
|
||||||
if pretrained_vectors is not None:
|
if pretrained_vectors is not None:
|
||||||
|
@ -284,28 +297,29 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
if subword_features:
|
if subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm | prefix | suffix | shape)
|
(glove | norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
|
>> LN(Maxout(width, width * 5, pieces=3)),
|
||||||
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm)
|
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
|
||||||
>> LN(Maxout(width, width*2, pieces=3)), column=cols.index(ORTH))
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
elif subword_features:
|
elif subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(norm | prefix | suffix | shape)
|
(norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))
|
>> LN(Maxout(width, width * 4, pieces=3)),
|
||||||
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = norm
|
embed = norm
|
||||||
|
|
||||||
convolution = Residual(
|
convolution = Residual(
|
||||||
ExtractWindow(nW=1)
|
ExtractWindow(nW=1)
|
||||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
|
||||||
)
|
)
|
||||||
tok2vec = (
|
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
||||||
FeatureExtracter(cols)
|
embed >> convolution ** conv_depth, pad=conv_depth
|
||||||
>> with_flatten(
|
|
||||||
embed
|
|
||||||
>> convolution ** conv_depth, pad=conv_depth
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if bilstm_depth >= 1:
|
if bilstm_depth >= 1:
|
||||||
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
||||||
|
@ -316,7 +330,7 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
def reapply(layer, n_times):
|
def reapply(layer, n_times):
|
||||||
def reapply_fwd(X, drop=0.):
|
def reapply_fwd(X, drop=0.0):
|
||||||
backprops = []
|
backprops = []
|
||||||
for i in range(n_times):
|
for i in range(n_times):
|
||||||
Y, backprop = layer.begin_update(X, drop=drop)
|
Y, backprop = layer.begin_update(X, drop=drop)
|
||||||
|
@ -334,12 +348,14 @@ def reapply(layer, n_times):
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, reapply_bwd
|
return Y, reapply_bwd
|
||||||
|
|
||||||
return wrap(reapply_fwd, layer)
|
return wrap(reapply_fwd, layer)
|
||||||
|
|
||||||
|
|
||||||
def asarray(ops, dtype):
|
def asarray(ops, dtype):
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
return ops.asarray(X, dtype=dtype), None
|
return ops.asarray(X, dtype=dtype), None
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
|
@ -347,7 +363,7 @@ def _divide_array(X, size):
|
||||||
parts = []
|
parts = []
|
||||||
index = 0
|
index = 0
|
||||||
while index < len(X):
|
while index < len(X):
|
||||||
parts.append(X[index:index + size])
|
parts.append(X[index : index + size])
|
||||||
index += size
|
index += size
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
|
@ -356,7 +372,7 @@ def get_col(idx):
|
||||||
if idx < 0:
|
if idx < 0:
|
||||||
raise IndexError(Errors.E066.format(value=idx))
|
raise IndexError(Errors.E066.format(value=idx))
|
||||||
|
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
if isinstance(X, numpy.ndarray):
|
if isinstance(X, numpy.ndarray):
|
||||||
ops = NumpyOps()
|
ops = NumpyOps()
|
||||||
else:
|
else:
|
||||||
|
@ -377,7 +393,7 @@ def doc2feats(cols=None):
|
||||||
if cols is None:
|
if cols is None:
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
|
|
||||||
def forward(docs, drop=0.):
|
def forward(docs, drop=0.0):
|
||||||
feats = []
|
feats = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
feats.append(doc.to_array(cols))
|
feats.append(doc.to_array(cols))
|
||||||
|
@ -389,13 +405,14 @@ def doc2feats(cols=None):
|
||||||
|
|
||||||
|
|
||||||
def print_shape(prefix):
|
def print_shape(prefix):
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
return X, lambda dX, **kwargs: dX
|
return X, lambda dX, **kwargs: dX
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
|
||||||
tokens, attrs, vectors = tokens_attrs_vectors
|
tokens, attrs, vectors = tokens_attrs_vectors
|
||||||
|
|
||||||
def backward(d_output, sgd=None):
|
def backward(d_output, sgd=None):
|
||||||
|
@ -405,17 +422,17 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def logistic(X, drop=0.):
|
def logistic(X, drop=0.0):
|
||||||
xp = get_array_module(X)
|
xp = get_array_module(X)
|
||||||
if not isinstance(X, xp.ndarray):
|
if not isinstance(X, xp.ndarray):
|
||||||
X = xp.asarray(X)
|
X = xp.asarray(X)
|
||||||
# Clip to range (-10, 10)
|
# Clip to range (-10, 10)
|
||||||
X = xp.minimum(X, 10., X)
|
X = xp.minimum(X, 10.0, X)
|
||||||
X = xp.maximum(X, -10., X)
|
X = xp.maximum(X, -10.0, X)
|
||||||
Y = 1. / (1. + xp.exp(-X))
|
Y = 1.0 / (1.0 + xp.exp(-X))
|
||||||
|
|
||||||
def logistic_bwd(dY, sgd=None):
|
def logistic_bwd(dY, sgd=None):
|
||||||
dX = dY * (Y * (1-Y))
|
dX = dY * (Y * (1 - Y))
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, logistic_bwd
|
return Y, logistic_bwd
|
||||||
|
@ -424,12 +441,13 @@ def logistic(X, drop=0.):
|
||||||
def zero_init(model):
|
def zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def preprocess_doc(docs, drop=0.):
|
def preprocess_doc(docs, drop=0.0):
|
||||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
||||||
|
@ -439,31 +457,32 @@ def preprocess_doc(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def getitem(i):
|
def getitem(i):
|
||||||
def getitem_fwd(X, drop=0.):
|
def getitem_fwd(X, drop=0.0):
|
||||||
return X[i], None
|
return X[i], None
|
||||||
|
|
||||||
return layerize(getitem_fwd)
|
return layerize(getitem_fwd)
|
||||||
|
|
||||||
|
|
||||||
def build_tagger_model(nr_class, **cfg):
|
def build_tagger_model(nr_class, **cfg):
|
||||||
embed_size = util.env_opt('embed_size', 2000)
|
embed_size = util.env_opt("embed_size", 2000)
|
||||||
if 'token_vector_width' in cfg:
|
if "token_vector_width" in cfg:
|
||||||
token_vector_width = cfg['token_vector_width']
|
token_vector_width = cfg["token_vector_width"]
|
||||||
else:
|
else:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 96)
|
token_vector_width = util.env_opt("token_vector_width", 96)
|
||||||
pretrained_vectors = cfg.get('pretrained_vectors')
|
pretrained_vectors = cfg.get("pretrained_vectors")
|
||||||
subword_features = cfg.get('subword_features', True)
|
subword_features = cfg.get("subword_features", True)
|
||||||
with Model.define_operators({'>>': chain, '+': add}):
|
with Model.define_operators({">>": chain, "+": add}):
|
||||||
if 'tok2vec' in cfg:
|
if "tok2vec" in cfg:
|
||||||
tok2vec = cfg['tok2vec']
|
tok2vec = cfg["tok2vec"]
|
||||||
else:
|
else:
|
||||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
tok2vec = Tok2Vec(
|
||||||
subword_features=subword_features,
|
token_vector_width,
|
||||||
pretrained_vectors=pretrained_vectors)
|
embed_size,
|
||||||
|
subword_features=subword_features,
|
||||||
|
pretrained_vectors=pretrained_vectors,
|
||||||
|
)
|
||||||
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
||||||
model = (
|
model = tok2vec >> softmax
|
||||||
tok2vec
|
|
||||||
>> softmax
|
|
||||||
)
|
|
||||||
model.nI = None
|
model.nI = None
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
model.softmax = softmax
|
model.softmax = softmax
|
||||||
|
@ -471,10 +490,10 @@ def build_tagger_model(nr_class, **cfg):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def SpacyVectors(docs, drop=0.):
|
def SpacyVectors(docs, drop=0.0):
|
||||||
batch = []
|
batch = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
indices = numpy.zeros((len(doc),), dtype='i')
|
indices = numpy.zeros((len(doc),), dtype="i")
|
||||||
for i, word in enumerate(doc):
|
for i, word in enumerate(doc):
|
||||||
if word.orth in doc.vocab.vectors.key2row:
|
if word.orth in doc.vocab.vectors.key2row:
|
||||||
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
||||||
|
@ -486,12 +505,11 @@ def SpacyVectors(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(nr_class, width=64, **cfg):
|
def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
depth = cfg.get('depth', 2)
|
depth = cfg.get("depth", 2)
|
||||||
nr_vector = cfg.get('nr_vector', 5000)
|
nr_vector = cfg.get("nr_vector", 5000)
|
||||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
pretrained_dims = cfg.get("pretrained_dims", 0)
|
||||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
|
||||||
'**': clone}):
|
if cfg.get("low_data") and pretrained_dims:
|
||||||
if cfg.get('low_data') and pretrained_dims:
|
|
||||||
model = (
|
model = (
|
||||||
SpacyVectors
|
SpacyVectors
|
||||||
>> flatten_add_lengths
|
>> flatten_add_lengths
|
||||||
|
@ -505,41 +523,35 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
lower = HashEmbed(width, nr_vector, column=1)
|
lower = HashEmbed(width, nr_vector, column=1)
|
||||||
prefix = HashEmbed(width//2, nr_vector, column=2)
|
prefix = HashEmbed(width // 2, nr_vector, column=2)
|
||||||
suffix = HashEmbed(width//2, nr_vector, column=3)
|
suffix = HashEmbed(width // 2, nr_vector, column=3)
|
||||||
shape = HashEmbed(width//2, nr_vector, column=4)
|
shape = HashEmbed(width // 2, nr_vector, column=4)
|
||||||
|
|
||||||
trained_vectors = (
|
trained_vectors = FeatureExtracter(
|
||||||
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
|
[ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||||
>> with_flatten(
|
) >> with_flatten(
|
||||||
uniqued(
|
uniqued(
|
||||||
(lower | prefix | suffix | shape)
|
(lower | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width+(width//2)*3)),
|
>> LN(Maxout(width, width + (width // 2) * 3)),
|
||||||
column=0
|
column=0,
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if pretrained_dims:
|
if pretrained_dims:
|
||||||
static_vectors = (
|
static_vectors = SpacyVectors >> with_flatten(
|
||||||
SpacyVectors
|
Affine(width, pretrained_dims)
|
||||||
>> with_flatten(Affine(width, pretrained_dims))
|
|
||||||
)
|
)
|
||||||
# TODO Make concatenate support lists
|
# TODO Make concatenate support lists
|
||||||
vectors = concatenate_lists(trained_vectors, static_vectors)
|
vectors = concatenate_lists(trained_vectors, static_vectors)
|
||||||
vectors_width = width*2
|
vectors_width = width * 2
|
||||||
else:
|
else:
|
||||||
vectors = trained_vectors
|
vectors = trained_vectors
|
||||||
vectors_width = width
|
vectors_width = width
|
||||||
static_vectors = None
|
static_vectors = None
|
||||||
tok2vec = (
|
tok2vec = vectors >> with_flatten(
|
||||||
vectors
|
LN(Maxout(width, vectors_width))
|
||||||
>> with_flatten(
|
>> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
|
||||||
LN(Maxout(width, vectors_width))
|
pad=depth,
|
||||||
>> Residual(
|
|
||||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
|
||||||
) ** depth, pad=depth
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
cnn_model = (
|
cnn_model = (
|
||||||
tok2vec
|
tok2vec
|
||||||
|
@ -550,13 +562,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
|
||||||
linear_model = (
|
linear_model = _preprocess_doc >> LinearModel(nr_class)
|
||||||
_preprocess_doc
|
|
||||||
>> LinearModel(nr_class)
|
|
||||||
)
|
|
||||||
model = (
|
model = (
|
||||||
(linear_model | cnn_model)
|
(linear_model | cnn_model)
|
||||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
|
||||||
>> logistic
|
>> logistic
|
||||||
)
|
)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
|
@ -566,9 +575,9 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def flatten(seqs, drop=0.):
|
def flatten(seqs, drop=0.0):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=0)
|
return ops.unflatten(d_X, lengths, pad=0)
|
||||||
|
@ -583,14 +592,14 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
|
||||||
"""
|
"""
|
||||||
if not layers:
|
if not layers:
|
||||||
return noop()
|
return noop()
|
||||||
drop_factor = kwargs.get('drop_factor', 1.0)
|
drop_factor = kwargs.get("drop_factor", 1.0)
|
||||||
ops = layers[0].ops
|
ops = layers[0].ops
|
||||||
layers = [chain(layer, flatten) for layer in layers]
|
layers = [chain(layer, flatten) for layer in layers]
|
||||||
concat = concatenate(*layers)
|
concat = concatenate(*layers)
|
||||||
|
|
||||||
def concatenate_lists_fwd(Xs, drop=0.):
|
def concatenate_lists_fwd(Xs, drop=0.0):
|
||||||
drop *= drop_factor
|
drop *= drop_factor
|
||||||
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
|
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
|
||||||
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
||||||
ys = ops.unflatten(flat_y, lengths)
|
ys = ops.unflatten(flat_y, lengths)
|
||||||
|
|
||||||
|
|
|
@@ -1,16 +1,17 @@
 # inspired from:
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
+# fmt: off

-__title__ = 'spacy-nightly'
-__version__ = '2.1.0a3'
-__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
-__uri__ = 'https://spacy.io'
-__author__ = 'Explosion AI'
-__email__ = 'contact@explosion.ai'
-__license__ = 'MIT'
+__title__ = "spacy-nightly"
+__version__ = "2.1.0a3"
+__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
+__uri__ = "https://spacy.io"
+__author__ = "Explosion AI"
+__email__ = "contact@explosion.ai"
+__license__ = "MIT"
 __release__ = False

-__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
-__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
-__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'
+__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
+__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
@@ -1,14 +1,13 @@
-from .download import download
-from .info import info
-from .link import link
-from .package import package
-from .profile import profile
-from .train import train
-from .pretrain import pretrain
-from .evaluate import evaluate
-from .convert import convert
-from .vocab import make_vocab as vocab
-from .init_model import init_model
-from .validate import validate
-from .ud_train import main as ud_train
-from .conll17_ud_eval import main as ud_evaluate
+from .download import download  # noqa: F401
+from .info import info  # noqa: F401
+from .link import link  # noqa: F401
+from .package import package  # noqa: F401
+from .profile import profile  # noqa: F401
+from .train import train  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
+from .convert import convert  # noqa: F401
+from .init_model import init_model  # noqa: F401
+from .validate import validate  # noqa: F401
+from .ud import ud_train, ud_evaluate  # noqa: F401
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals


+# fmt: off
+
 class Messages(object):
     M001 = ("Download successful but linking failed")
     M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "

@@ -73,3 +75,31 @@ class Messages(object):
     M052 = ("Not a valid meta.json format")
     M053 = ("Expected dict but got: {meta_type}")
     M054 = ("No --lang specified, but tokenization required.")
+    M055 = ("Training pipeline: {pipeline}")
+    M056 = ("Starting with base model '{model}'")
+    M057 = ("Starting with blank model '{model}'")
+    M058 = ("Loading vector from model '{model}'")
+    M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
+    M060 = ("Counting training words (limit={limit})")
+    M061 = ("\nSaving model...")
+    M062 = ("Output directory is not empty.")
+    M063 = ("Incompatible arguments")
+    M064 = ("The -f and -c arguments are deprecated, and not compatible with "
+            "the -j argument, which should specify the same information. "
+            "Either merge the frequencies and clusters data into the "
+            "JSONL-formatted file (recommended), or use only the -f and -c "
+            "files, without the other lexical attributes.")
+    M065 = ("This can lead to unintended side effects when saving the model. "
+            "Please use an empty directory or a different path instead. If "
+            "the specified output path doesn't exist, the directory will be "
+            "created for you.")
+    M066 = ("Saved model to output directory")
+    M067 = ("Can't find lexical data")
+    M068 = ("Sucessfully compiled vocab and vectors, and saved model")
+    M069 = ("Unknown file type: '{name}'")
+    M070 = ("Supported file types: '{options}'")
+    M071 = ("Loaded pretrained tok2vec for: {components}")
+    M072 = ("Model language ('{model_lang}') doesn't match language specified "
+            "as `lang` argument ('{lang}') ")
+
+# fmt: on
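The new M055-M072 entries are plain `str.format` templates that the CLI fills in at runtime, for example (values here are made up):

    from spacy.cli._messages import Messages

    print(Messages.M069.format(name="txt"))
    print(Messages.M070.format(options="json, jsonl"))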
@@ -3,49 +3,91 @@ from __future__ import unicode_literals

 import plac
 from pathlib import Path
+from wasabi import Printer
+
+from ..util import write_jsonl, write_json
+from ..compat import json_dumps, path2str
 from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
 from .converters import ner_jsonl2json
 from ._messages import Messages
-from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
 CONVERTERS = {
-    'conllubio': conllubio2json,
-    'conllu': conllu2json,
-    'conll': conllu2json,
-    'ner': conll_ner2json,
-    'iob': iob2json,
-    'jsonl': ner_jsonl2json
+    "conllubio": conllubio2json,
+    "conllu": conllu2json,
+    "conll": conllu2json,
+    "ner": conll_ner2json,
+    "iob": iob2json,
+    "jsonl": ner_jsonl2json,
 }

+# File types
+FILE_TYPES = ("json", "jsonl")
+

 @plac.annotations(
-    input_file=("input file", "positional", None, str),
-    output_dir=("output directory for converted file", "positional", None, str),
+    input_file=("Input file", "positional", None, str),
+    output_dir=("Output directory for converted file", "positional", None, str),
+    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
     n_sents=("Number of sentences per doc", "option", "n", int),
     converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool))
-def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
-            lang=None):
+    morphology=("Enable appending morphology to tags", "flag", "m", bool),
+)
+def convert(
+    input_file,
+    output_dir="-",
+    file_type="jsonl",
+    n_sents=1,
+    morphology=False,
+    converter="auto",
+    lang=None,
+):
     """
     Convert files into JSON format for use with train command and other
-    experiment management functions.
+    experiment management functions. If no output_dir is specified, the data
+    is written to stdout, so you can pipe them forward to a JSONL file:
+    $ spacy convert some_file.conllu > some_file.jsonl
     """
+    msg = Printer()
     input_path = Path(input_file)
-    output_path = Path(output_dir)
+    if file_type not in FILE_TYPES:
+        msg.fail(
+            Messages.M069.format(name=file_type),
+            Messages.M070.format(options=", ".join(FILE_TYPES)),
+            exits=1,
+        )
     if not input_path.exists():
-        prints(input_path, title=Messages.M028, exits=1)
-    if not output_path.exists():
-        prints(output_path, title=Messages.M029, exits=1)
-    if converter == 'auto':
+        msg.fail(Messages.M028, input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail(Messages.M029, output_dir, exits=1)
+    if converter == "auto":
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-        prints(Messages.M031.format(converter=converter),
-               title=Messages.M030, exits=1)
+        msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
+    # Use converter function to convert data
     func = CONVERTERS[converter]
-    func(input_path, output_path,
-         n_sents=n_sents, use_morphology=morphology, lang=lang)
+    input_data = input_path.open("r", encoding="utf-8").read()
+    data = func(input_data, nsents=n_sents, use_morphology=morphology, lang=lang)
+    if output_dir != "-":
+        # Export data to a file
+        suffix = ".{}".format(file_type)
+        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
+        if file_type == "json":
+            write_json(output_file, data)
+        elif file_type == "jsonl":
+            write_jsonl(output_file, data)
+        msg.good(
+            Messages.M032.format(name=path2str(output_file)),
+            Messages.M033.format(n_docs=len(data)),
+        )
+    else:
+        # Print to stdout
+        if file_type == "json":
+            print(json_dumps(data))
+        elif file_type == "jsonl":
+            for line in data:
+                print(json_dumps(line))
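With the reworked signature, the converter can also be driven from Python rather than the command line; the stdout mode mentioned in the docstring corresponds to the default `output_dir="-"`. A hedged sketch (paths are hypothetical):

    from spacy.cli import convert

    # Write train.json into /tmp/converted, with the conllu converter picked
    # automatically from the file extension.
    convert("train.conllu", "/tmp/converted", file_type="json", n_sents=10)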
@@ -1,5 +1,5 @@
-from .conllu2json import conllu2json
-from .conllubio2json import conllubio2json
-from .iob2json import iob2json
-from .conll_ner2json import conll_ner2json
-from .jsonl2json import ner_jsonl2json
+from .conllu2json import conllu2json  # noqa: F401
+from .conllubio2json import conllubio2json  # noqa: F401
+from .iob2json import iob2json  # noqa: F401
+from .conll_ner2json import conll_ner2json  # noqa: F401
+from .jsonl2json import ner_jsonl2json  # noqa: F401
@@ -1,52 +1,38 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
 from ...gold import iob_to_biluo


-def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+def conll_ner2json(input_data, **kwargs):
     """
     Convert files in the CoNLL-2003 NER format into JSON format for use with
     train cli.
     """
-    docs = read_conll_ner(input_path)
-
-    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
-    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
-
-
-def read_conll_ner(input_path):
-    text = input_path.open('r', encoding='utf-8').read()
-    i = 0
-    delimit_docs = '-DOCSTART- -X- O O'
+    delimit_docs = "-DOCSTART- -X- O O"
     output_docs = []
-    for doc in text.strip().split(delimit_docs):
+    for doc in input_data.strip().split(delimit_docs):
         doc = doc.strip()
         if not doc:
             continue
         output_doc = []
-        for sent in doc.split('\n\n'):
+        for sent in doc.split("\n\n"):
             sent = sent.strip()
             if not sent:
                 continue
-            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            lines = [line.strip() for line in sent.split("\n") if line.strip()]
             words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
             biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append({'tokens': [
-                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
-                zip(words, tags, biluo_ents)
-            ]})
-        output_docs.append({
-            'id': len(output_docs),
-            'paragraphs': [{'sentences': output_doc}]
-        })
+            output_doc.append(
+                {
+                    "tokens": [
+                        {"orth": w, "tag": tag, "ner": ent}
+                        for (w, tag, ent) in zip(words, tags, biluo_ents)
+                    ]
+                }
+            )
+        output_docs.append(
+            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
+        )
        output_doc = []
     return output_docs
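The converter above leans on `iob_to_biluo` to turn per-token IOB tags into the BILUO scheme used by spaCy's training format, e.g.:

    from spacy.gold import iob_to_biluo

    # A two-token PER entity becomes B-PER/L-PER, a single-token LOC becomes U-LOC.
    print(iob_to_biluo(["O", "B-PER", "I-PER", "O", "B-LOC"]))
    # ['O', 'B-PER', 'L-PER', 'O', 'U-LOC']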
@@ -1,34 +1,27 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
-from ...gold import iob_to_biluo
 import re

+from ...gold import iob_to_biluo

-def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+
+def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
     """
     Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
-    """
-    # by @dvsrepo, via #11 explosion/spacy-dev-resources
-
-    """
     Extract NER tags if available and convert them so that they follow
     BILUO and the Wikipedia scheme
     """
+    # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor

     docs = []
     sentences = []
-    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
+    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
     checked_for_ner = False
     has_ner_tags = False

     for i, (raw_text, tokens) in enumerate(conll_tuples):
         sentence, brackets = tokens[0]
         if not checked_for_ner:

@@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
         sentences.append(generate_sentence(sentence, has_ner_tags))
         # Real-sized documents could be extracted using the comments on the
         # conluu document
-        if(len(sentences) % n_sents == 0):
+        if len(sentences) % n_sents == 0:
             doc = create_doc(sentences, i)
             docs.append(doc)
             sentences = []
-    output_filename = input_path.parts[-1].replace(".conll", ".json")
-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
+    return docs


 def is_ner(tag):
-
-    """
-    Check the 10th column of the first token to determine if the file contains
-    NER tags
-    """
-    tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
+    """
+    Check the 10th column of the first token to determine if the file contains
+    NER tags
+    """
+    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
     if tag_match:
         return True
     elif tag == "O":

@@ -67,29 +50,29 @@ def is_ner(tag):
     else:
         return False

-def read_conllx(input_path, use_morphology=False, n=0):
-    text = input_path.open('r', encoding='utf-8').read()
+
+def read_conllx(input_data, use_morphology=False, n=0):
     i = 0
-    for sent in text.strip().split('\n\n'):
-        lines = sent.strip().split('\n')
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
         if lines:
-            while lines[0].startswith('#'):
+            while lines[0].startswith("#"):
                 lines.pop(0)
             tokens = []
             for line in lines:

-                parts = line.split('\t')
+                parts = line.split("\t")
                 id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
-                if '-' in id_ or '.' in id_:
+                if "-" in id_ or "." in id_:
                     continue
                 try:
                     id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != '0' else id_
-                    dep = 'ROOT' if dep == 'root' else dep
-                    tag = pos if tag == '_' else tag
-                    tag = tag+'__'+morph if use_morphology else tag
+                    head = (int(head) - 1) if head != "0" else id_
+                    dep = "ROOT" if dep == "root" else dep
+                    tag = pos if tag == "_" else tag
+                    tag = tag + "__" + morph if use_morphology else tag
                     tokens.append((id_, word, tag, head, dep, iob))
-                except:
+                except:  # noqa: E722
                     print(line)
                     raise
             tuples = [list(t) for t in zip(*tokens)]

@@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
             if n >= 1 and i >= n:
                 break


 def simplify_tags(iob):

     """
     Simplify tags obtained from the dataset in order to follow Wikipedia
     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
     'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
     'MISC'.
     """

     new_iob = []
     for tag in iob:
-        tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
+        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
         if tag_match:
             prefix = tag_match.group(1)
             suffix = tag_match.group(2)
-            if suffix == 'GPE_LOC':
-                suffix = 'LOC'
-            elif suffix == 'GPE_ORG':
-                suffix = 'ORG'
-            elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG':
-                suffix = 'MISC'
-            tag = prefix + '-' + suffix
+            if suffix == "GPE_LOC":
+                suffix = "LOC"
+            elif suffix == "GPE_ORG":
+                suffix = "ORG"
+            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
+                suffix = "MISC"
+            tag = prefix + "-" + suffix
         new_iob.append(tag)
     return new_iob


 def generate_sentence(sent, has_ner_tags):
     (id_, word, tag, head, dep, iob) = sent
     sentence = {}

@@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
     return sentence


-def create_doc(sentences,id):
+def create_doc(sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
@@ -1,65 +1,54 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...compat import json_dumps, path2str
-from ...util import prints
 from ...gold import iob_to_biluo

-def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
     """
     Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources

     docs = []
     sentences = []
-    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
+    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)

     for i, (raw_text, tokens) in enumerate(conll_tuples):
         sentence, brackets = tokens[0]
         sentences.append(generate_sentence(sentence))
         # Real-sized documents could be extracted using the comments on the
         # conluu document
-        if(len(sentences) % n_sents == 0):
+        if len(sentences) % n_sents == 0:
             doc = create_doc(sentences, i)
             docs.append(doc)
             sentences = []
-    output_filename = input_path.parts[-1].replace(".conll", ".json")
-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    return docs


-def read_conllx(input_path, use_morphology=False, n=0):
-    text = input_path.open('r', encoding='utf-8').read()
+def read_conllx(input_data, use_morphology=False, n=0):
     i = 0
-    for sent in text.strip().split('\n\n'):
-        lines = sent.strip().split('\n')
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
         if lines:
-            while lines[0].startswith('#'):
+            while lines[0].startswith("#"):
                 lines.pop(0)
             tokens = []
             for line in lines:
-                parts = line.split('\t')
+                parts = line.split("\t")
                 id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
-                if '-' in id_ or '.' in id_:
+                if "-" in id_ or "." in id_:
                     continue
                 try:
                     id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != '0' else id_
-                    dep = 'ROOT' if dep == 'root' else dep
-                    tag = pos if tag == '_' else tag
-                    tag = tag+'__'+morph if use_morphology else tag
-                    ner = ner if ner else 'O'
+                    head = (int(head) - 1) if head != "0" else id_
+                    dep = "ROOT" if dep == "root" else dep
+                    tag = pos if tag == "_" else tag
+                    tag = tag + "__" + morph if use_morphology else tag
+                    ner = ner if ner else "O"
                     tokens.append((id_, word, tag, head, dep, ner))
-                except:
+                except:  # noqa: E722
                     print(line)
                     raise
             tuples = [list(t) for t in zip(*tokens)]
@@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
             if n >= 1 and i >= n:
                 break


 def generate_sentence(sent):
     (id_, word, tag, head, dep, ner) = sent
     sentence = {}
@@ -85,7 +75,7 @@ def generate_sentence(sent):
     return sentence


-def create_doc(sentences,id):
+def create_doc(sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
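Usage sketch (not part of the diff): after this refactor the converters take the raw file contents as `input_data` and return a list of JSON-style doc dicts, leaving file handling to the calling CLI code. The file names below are made-up examples.

import json
from pathlib import Path

# Read the raw CoNLL-U BIO text and convert it; writing the output is now the caller's job.
input_path = Path("train.conllu")          # hypothetical example path
docs = conllubio2json(input_path.open("r", encoding="utf-8").read(), n_sents=10)
Path("train.json").write_text(json.dumps(docs, indent=2), encoding="utf-8")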
@@ -1,26 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
-from cytoolz import partition_all, concat

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
+from cytoolz import partition_all
 from ...gold import iob_to_biluo


-def iob2json(input_path, output_path, n_sents=10, *a, **k):
+def iob2json(input_data, n_sents=10, *args, **kwargs):
     """
     Convert IOB files into JSON format for use with train cli.
     """
-    with input_path.open('r', encoding='utf8') as file_:
-        sentences = read_iob(file_)
-    docs = merge_sentences(sentences, n_sents)
-    output_filename = input_path.parts[-1].replace(".iob", ".json")
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
+    docs = []
+    for group in partition_all(n_sents, docs):
+        group = list(group)
+        first = group.pop(0)
+        to_extend = first["paragraphs"][0]["sentences"]
+        for sent in group[1:]:
+            to_extend.extend(sent["paragraphs"][0]["sentences"])
+        docs.append(first)
+    return docs


 def read_iob(raw_sents):
@@ -28,30 +26,20 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        tokens = [t.split('|') for t in line.split()]
+        tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
         else:
             words, iob = zip(*tokens)
-            pos = ['-'] * len(words)
+            pos = ["-"] * len(words)
         biluo = iob_to_biluo(iob)
-        sentences.append([
-            {'orth': w, 'tag': p, 'ner': ent}
-            for (w, p, ent) in zip(words, pos, biluo)
-        ])
-    sentences = [{'tokens': sent} for sent in sentences]
-    paragraphs = [{'sentences': [sent]} for sent in sentences]
-    docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
+        sentences.append(
+            [
+                {"orth": w, "tag": p, "ner": ent}
+                for (w, p, ent) in zip(words, pos, biluo)
+            ]
+        )
+    sentences = [{"tokens": sent} for sent in sentences]
+    paragraphs = [{"sentences": [sent]} for sent in sentences]
+    docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
     return docs

-
-def merge_sentences(docs, n_sents):
-    counter = 0
-    merged = []
-    for group in partition_all(n_sents, docs):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first['paragraphs'][0]['sentences']
-        for sent in group[1:]:
-            to_extend.extend(sent['paragraphs'][0]['sentences'])
-        merged.append(first)
-    return merged
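Illustration (not part of the diff): read_iob expects one sentence per line, with whitespace-separated tokens of the form word|tag|iob or word|iob, and converts the IOB tags to BILUO. The sentence below is a made-up example.

# Hypothetical input for read_iob; each token is "word|tag|iob" or "word|iob".
sample = ["Alex|NNP|B-PER works|VBZ|O at|IN|O Acme|NNP|B-ORG .|.|O"]
docs = read_iob(sample)   # one doc dict with tokenized sentences and BILUO "ner" values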
@@ -1,33 +1,21 @@
 # coding: utf8
 from __future__ import unicode_literals
-import ujson as json

+import ujson
+
+from ...util import get_lang_class
 from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints, get_lang_class
-from ...gold import docs_to_json


-def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
+def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
-        prints(Messages.M054, exits=True)
+        raise ValueError(Messages.M054)
     json_docs = []
-    input_tuples = list(read_jsonl(input_path))
+    input_tuples = [ujson.loads(line) for line in input_data]
     nlp = get_lang_class(lang)()
     for i, (raw_text, ents) in enumerate(input_tuples):
         doc = nlp.make_doc(raw_text)
         doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
-        json_docs.append(docs_to_json(i, [doc]))
-    output_filename = input_path.parts[-1].replace(".jsonl", ".json")
-    output_loc = output_path / output_filename
-    with (output_loc).open('w', encoding='utf8') as file_:
-        file_.write(json_dumps(json_docs))
-    prints(Messages.M033.format(n_docs=len(json_docs)),
-           title=Messages.M032.format(name=path2str(output_loc)))
-
-
-def read_jsonl(input_path):
-    with input_path.open('r', encoding='utf8') as file_:
-        for line in file_:
-            yield json.loads(line)
+        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
+        json_docs.append(doc.to_json())
+    return json_docs
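Illustration (not part of the diff): each line of the JSONL input is parsed with ujson.loads into a (raw_text, ents) pair, where ents["entities"] holds (start, end, label) character offsets. The text and offsets below are made-up examples.

# Hypothetical JSONL line: [raw_text, {"entities": [[start, end, label], ...]}]
line = '["Uber is a company", {"entities": [[0, 4, "ORG"]]}]'
docs = ner_jsonl2json([line], lang="en")   # lang is required; None raises ValueError(M054)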
398  spacy/cli/debug_data.py  (new file)
@@ -0,0 +1,398 @@
+# coding: utf8
+from __future__ import unicode_literals, print_function
+
+from pathlib import Path
+from collections import Counter
+import plac
+import sys
+from wasabi import Printer, MESSAGES
+
+from ..gold import GoldCorpus, read_json_object
+from ..util import load_model, get_lang_class, read_json, read_jsonl
+
+# from .schemas import get_schema, validate_json
+from ._messages import Messages
+
+
+# Minimum number of expected occurences of label in data to train new label
+NEW_LABEL_THRESHOLD = 50
+# Minimum number of expected examples to train a blank model
+BLANK_MODEL_MIN_THRESHOLD = 100
+BLANK_MODEL_THRESHOLD = 2000
+
+
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    train_path=("location of JSON-formatted training data", "positional", None, Path),
+    dev_path=("location of JSON-formatted development data", "positional", None, Path),
+    base_model=("name of model to update (optional)", "option", "b", str),
+    pipeline=(
+        "Comma-separated names of pipeline components to train",
+        "option",
+        "p",
+        str,
+    ),
+    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
+    ignore_validation=(
+        "Don't exit if JSON format validation fails",
+        "flag",
+        "IV",
+        bool,
+    ),
+    verbose=("Print additional information and explanations", "flag", "V", bool),
+    no_format=("Don't pretty-print the results", "flag", "NF", bool),
+)
+def debug_data(
+    lang,
+    train_path,
+    dev_path,
+    base_model=None,
+    pipeline="tagger,parser,ner",
+    ignore_warnings=False,
+    ignore_validation=False,
+    verbose=False,
+    no_format=False,
+):
+    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
+
+    # Make sure all files and paths exists if they are needed
+    if not train_path.exists():
+        msg.fail(Messages.M050, train_path, exits=1)
+    if not dev_path.exists():
+        msg.fail(Messages.M051, dev_path, exits=1)
+
+    # Initialize the model and pipeline
+    pipeline = [p.strip() for p in pipeline.split(",")]
+    if base_model:
+        nlp = load_model(base_model)
+    else:
+        lang_cls = get_lang_class(lang)
+        nlp = lang_cls()
+
+    msg.divider("Data format validation")
+    # Load the data in one – might take a while but okay in this case
+    with msg.loading("Loading {}...".format(train_path.parts[-1])):
+        train_data = _load_file(train_path, msg)
+    with msg.loading("Loading {}...".format(dev_path.parts[-1])):
+        dev_data = _load_file(dev_path, msg)
+
+    # Validate data format using the JSON schema
+    # TODO: update once the new format is ready
+    # schema = get_schema("training")
+    train_data_errors = []  # TODO: validate_json(train_data, schema)
+    dev_data_errors = []  # TODO: validate_json(dev_data, schema)
+    if not train_data_errors:
+        msg.good("Training data JSON format is valid")
+    if not dev_data_errors:
+        msg.good("Development data JSON format is valid")
+    for error in train_data_errors:
+        msg.fail("Training data: {}".format(error))
+    for error in dev_data_errors:
+        msg.fail("Develoment data: {}".format(error))
+    if (train_data_errors or dev_data_errors) and not ignore_validation:
+        sys.exit(1)
+
+    # Create the gold corpus to be able to better analyze data
+    with msg.loading("Analyzing corpus..."):
+        train_data = read_json_object(train_data)
+        dev_data = read_json_object(dev_data)
+        corpus = GoldCorpus(train_data, dev_data)
+        train_docs = list(corpus.train_docs(nlp))
+        dev_docs = list(corpus.dev_docs(nlp))
+    msg.good("Corpus is loadable")
+
+    # Create all gold data here to avoid iterating over the train_docs constantly
+    gold_data = _compile_gold(train_docs, pipeline)
+    train_texts = gold_data["texts"]
+    dev_texts = set([doc.text for doc, gold in dev_docs])
+
+    msg.divider("Training stats")
+    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+    for pipe in [p for p in pipeline if p not in nlp.factories]:
+        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+    if base_model:
+        msg.text("Starting with base model '{}'".format(base_model))
+    else:
+        msg.text("Starting with blank model '{}'".format(lang))
+    msg.text("{} training docs".format(len(train_docs)))
+    msg.text("{} evaluation docs".format(len(dev_docs)))
+
+    overlap = len(train_texts.intersection(dev_texts))
+    if overlap:
+        msg.warn("{} training examples also in evaluation data".format(overlap))
+    else:
+        msg.good("No overlap between training and evaluation data")
+    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
+        text = "Low number of examples to train from a blank model ({})".format(
+            len(train_docs)
+        )
+        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
+            msg.fail(text)
+        else:
+            msg.warn(text)
+        msg.text(
+            "It's recommended to use at least {} examples (minimum {})".format(
+                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
+            ),
+            show=verbose,
+        )
+
+    msg.divider("Vocab & Vectors")
+    n_words = gold_data["n_words"]
+    msg.info(
+        "{} total {} in the data ({} unique)".format(
+            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
+        )
+    )
+    most_common_words = gold_data["words"].most_common(10)
+    msg.text(
+        "10 most common words: {}".format(
+            _format_labels(most_common_words, counts=True)
+        ),
+        show=verbose,
+    )
+    if len(nlp.vocab.vectors):
+        msg.info(
+            "{} vectors ({} unique keys, {} dimensions)".format(
+                len(nlp.vocab.vectors),
+                nlp.vocab.vectors.n_keys,
+                nlp.vocab.vectors_length,
+            )
+        )
+    else:
+        msg.info("No word vectors present in the model")
+
+    if "ner" in pipeline:
+        # Get all unique NER labels present in the data
+        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
+        label_counts = gold_data["ner"]
+        model_labels = _get_labels_from_model(nlp, "ner")
+        new_labels = [l for l in labels if l not in model_labels]
+        existing_labels = [l for l in labels if l in model_labels]
+        has_low_data_warning = False
+        has_no_neg_warning = False
+
+        msg.divider("Named Entity Recognition")
+        msg.info(
+            "{} new {}, {} existing {}".format(
+                len(new_labels),
+                "label" if len(new_labels) == 1 else "labels",
+                len(existing_labels),
+                "label" if len(existing_labels) == 1 else "labels",
+            )
+        )
+        missing_values = label_counts["-"]
+        msg.text(
+            "{} missing {} (tokens with '-' label)".format(
+                missing_values, "value" if missing_values == 1 else "values"
+            )
+        )
+        if new_labels:
+            labels_with_counts = [
+                (label, count)
+                for label, count in label_counts.most_common()
+                if label != "-"
+            ]
+            labels_with_counts = _format_labels(labels_with_counts, counts=True)
+            msg.text("New: {}".format(labels_with_counts), show=verbose)
+        if existing_labels:
+            msg.text(
+                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
+            )
+
+        for label in new_labels:
+            if label_counts[label] <= NEW_LABEL_THRESHOLD:
+                msg.warn(
+                    "Low number of examples for new label '{}' ({})".format(
+                        label, label_counts[label]
+                    )
+                )
+                has_low_data_warning = True
+
+                with msg.loading("Analyzing label distribution..."):
+                    neg_docs = _get_examples_without_label(train_docs, label)
+                if neg_docs == 0:
+                    msg.warn(
+                        "No examples for texts WITHOUT new label '{}'".format(label)
+                    )
+                    has_no_neg_warning = True
+
+        if not has_low_data_warning:
+            msg.good("Good amount of examples for all labels")
+        if not has_no_neg_warning:
+            msg.good("Examples without occurences available for all labels")
+
+        if has_low_data_warning:
+            msg.text(
+                "To train a new entity type, your data should include at "
+                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
+                show=verbose,
+            )
+        if has_no_neg_warning:
+            msg.text(
+                "Training data should always include examples of entities "
+                "in context, as well as examples without a given entity "
+                "type.",
+                show=verbose,
+            )
+
+    if "textcat" in pipeline:
+        msg.divider("Text Classification")
+        labels = [label for label in gold_data["textcat"]]
+        model_labels = _get_labels_from_model(nlp, "textcat")
+        new_labels = [l for l in labels if l not in model_labels]
+        existing_labels = [l for l in labels if l in model_labels]
+        msg.info(
+            "Text Classification: {} new label(s), {} existing label(s)".format(
+                len(new_labels), len(existing_labels)
+            )
+        )
+        if new_labels:
+            labels_with_counts = _format_labels(
+                gold_data["textcat"].most_common(), counts=True
+            )
+            msg.text("New: {}".format(labels_with_counts), show=verbose)
+        if existing_labels:
+            msg.text(
+                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
+            )
+
+    if "tagger" in pipeline:
+        msg.divider("Part-of-speech Tagging")
+        labels = [label for label in gold_data["tags"]]
+        tag_map = nlp.Defaults.tag_map
+        msg.info(
+            "{} {} in data ({} {} in tag map)".format(
+                len(labels),
+                "label" if len(labels) == 1 else "labels",
+                len(tag_map),
+                "label" if len(tag_map) == 1 else "labels",
+            )
+        )
+        labels_with_counts = _format_labels(
+            gold_data["tags"].most_common(), counts=True
+        )
+        msg.text(labels_with_counts, show=verbose)
+        non_tagmap = [l for l in labels if l not in tag_map]
+        if not non_tagmap:
+            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
+        for label in non_tagmap:
+            msg.fail(
+                "Label '{}' not found in tag map for language '{}'".format(
+                    label, nlp.lang
+                )
+            )
+
+    if "parser" in pipeline:
+        msg.divider("Dependency Parsing")
+        labels = [label for label in gold_data["deps"]]
+        msg.info(
+            "{} {} in data".format(
+                len(labels), "label" if len(labels) == 1 else "labels"
+            )
+        )
+        labels_with_counts = _format_labels(
+            gold_data["deps"].most_common(), counts=True
+        )
+        msg.text(labels_with_counts, show=verbose)
+
+    msg.divider("Summary")
+    good_counts = msg.counts[MESSAGES.GOOD]
+    warn_counts = msg.counts[MESSAGES.WARN]
+    fail_counts = msg.counts[MESSAGES.FAIL]
+    if good_counts:
+        msg.good(
+            "{} {} passed".format(
+                good_counts, "check" if good_counts == 1 else "checks"
+            )
+        )
+    if warn_counts:
+        msg.warn(
+            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
+        )
+    if fail_counts:
+        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
+
+    if fail_counts:
+        sys.exit(1)
+
+
+def _load_file(file_path, msg):
+    file_name = file_path.parts[-1]
+    if file_path.suffix == ".json":
+        data = read_json(file_path)
+        msg.good("Loaded {}".format(file_name))
+        return data
+    elif file_path.suffix == ".jsonl":
+        data = read_jsonl(file_path)
+        msg.good("Loaded {}".format(file_name))
+        return data
+    msg.fail(
+        "Can't load file extension {}".format(file_path.suffix),
+        "Expected .json or .jsonl",
+        exits=1,
+    )
+
+
+def _compile_gold(train_docs, pipeline):
+    data = {
+        "ner": Counter(),
+        "cats": Counter(),
+        "tags": Counter(),
+        "deps": Counter(),
+        "words": Counter(),
+        "n_words": 0,
+        "texts": set(),
+    }
+    for doc, gold in train_docs:
+        data["words"].update(gold.words)
+        data["n_words"] += len(gold.words)
+        data["texts"].add(doc.text)
+        if "ner" in pipeline:
+            for label in gold.ner:
+                if label.startswith(("B-", "U-")):
+                    combined_label = label.split("-")[1]
+                    data["ner"][combined_label] += 1
+                elif label == "-":
+                    data["ner"]["-"] += 1
+        if "textcat" in pipeline:
+            data["cats"].update(gold.cats)
+        if "tagger" in pipeline:
+            data["tags"].update(gold.tags)
+        if "parser" in pipeline:
+            data["deps"].update(gold.labels)
+    return data
+
+
+def _format_labels(labels, counts=False):
+    if counts:
+        return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
+    return ", ".join(["'{}'".format(l) for l in labels])
+
+
+def _get_ner_counts(data):
+    counter = Counter()
+    for doc, gold in data:
+        for label in gold.ner:
+            if label.startswith(("B-", "U-")):
+                combined_label = label.split("-")[1]
+                counter[combined_label] += 1
+            elif label == "-":
+                counter["-"] += 1
+    return counter
+
+
+def _get_examples_without_label(data, label):
+    count = 0
+    for doc, gold in data:
+        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        if label not in labels:
+            count += 1
+    return count
+
+
+def _get_labels_from_model(nlp, pipe_name):
+    if pipe_name not in nlp.pipe_names:
+        return set()
+    pipe = nlp.get_pipe(pipe_name)
+    return pipe.labels
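Usage sketch (not part of the diff): debug_data() is a plain function behind the new command, so it can also be called directly with the signature added above. The paths and pipeline below are made-up examples; the CLI registration itself is not shown in this hunk.

from pathlib import Path

# Hypothetical direct call; train.json/dev.json are example file names.
debug_data(
    "en",
    Path("train.json"),
    Path("dev.json"),
    pipeline="ner",
    verbose=True,
)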
@@ -6,34 +6,37 @@ import requests
 import os
 import subprocess
 import sys
+from wasabi import Printer

 from ._messages import Messages
 from .link import link
-from ..util import prints, get_package_path
+from ..util import get_package_path
 from .. import about


+msg = Printer()
+
+
 @plac.annotations(
-    model=("model to download, shortcut or name)", "positional", None, str),
-    direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool),
-    pip_args=("additional arguments to be passed to `pip install` when "
-              "installing the model"))
+    model=("Model to download (shortcut or name)", "positional", None, str),
+    direct=("Force direct download of name + version", "flag", "d", bool),
+    pip_args=("additional arguments to be passed to `pip install` on model install"),
+)
 def download(model, direct=False, *pip_args):
     """
     Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
-    with version.
+    with version. For direct downloads, the compatibility check will be skipped.
     """
     if direct:
-        dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
+        dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
     else:
         shortcuts = get_json(about.__shortcuts__, "available shortcuts")
         model_name = shortcuts.get(model, model)
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
-        dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
-                            .format(m=model_name, v=version), pip_args)
+        dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
+        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
         if dl != 0:  # if download subprocess doesn't return 0, exit
             sys.exit(dl)
         try:
@@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
             # subprocess
             package_path = get_package_path(model_name)
             link(model_name, model, force=True, model_path=package_path)
-        except:
+        except:  # noqa: E722
             # Dirty, but since spacy.download and the auto-linking is
             # mostly a convenience wrapper, it's best to show a success
             # message and loading instructions, even if linking fails.
-            prints(Messages.M001, title=Messages.M002.format(name=model_name))
+            msg.warn(Messages.M002.format(name=model_name), Messages.M001)


 def get_json(url, desc):
     r = requests.get(url)
     if r.status_code != 200:
-        prints(Messages.M004.format(desc=desc, version=about.__version__),
-               title=Messages.M003.format(code=r.status_code), exits=1)
+        msg.fail(
+            Messages.M003.format(code=r.status_code),
+            Messages.M004.format(desc=desc, version=about.__version__),
+            exits=1,
+        )
     return r.json()


 def get_compatibility():
     version = about.__version__
-    version = version.rsplit('.dev', 1)[0]
+    version = version.rsplit(".dev", 1)[0]
     comp_table = get_json(about.__compatibility__, "compatibility table")
-    comp = comp_table['spacy']
+    comp = comp_table["spacy"]
     if version not in comp:
-        prints(Messages.M006.format(version=version), title=Messages.M005,
-               exits=1)
+        msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
     return comp[version]


 def get_version(model, comp):
-    model = model.rsplit('.dev', 1)[0]
+    model = model.rsplit(".dev", 1)[0]
     if model not in comp:
-        prints(Messages.M007.format(name=model, version=about.__version__),
-               title=Messages.M005, exits=1)
+        msg.fail(
+            Messages.M005,
+            Messages.M007.format(name=model, version=about.__version__),
+            exits=1,
+        )
     return comp[model][0]


 def download_model(filename, user_pip_args=None):
-    download_url = about.__download_url__ + '/' + filename
-    pip_args = ['--no-cache-dir', '--no-deps']
+    download_url = about.__download_url__ + "/" + filename
+    pip_args = ["--no-cache-dir", "--no-deps"]
     if user_pip_args:
         pip_args.extend(user_pip_args)
-    cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url]
+    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
     return subprocess.call(cmd, env=os.environ.copy())
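Illustration (not part of the diff): the refactor keeps the same egg-fragment download scheme but builds it from the new dl_tpl string. The model name and version below are examples only.

dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
print(dl_tpl.format(m="en_core_web_sm", v="2.0.0"))
# -> en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0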
@@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function

 import plac
 from timeit import default_timer as timer
+from wasabi import Printer

 from ._messages import Messages
 from ..gold import GoldCorpus
-from ..util import prints
 from .. import util
 from .. import displacy


 @plac.annotations(
-    model=("model name or path", "positional", None, str),
-    data_path=("location of JSON-formatted evaluation data", "positional",
-               None, str),
-    gold_preproc=("use gold preprocessing", "flag", "G", bool),
-    gpu_id=("use GPU", "option", "g", int),
-    displacy_path=("directory to output rendered parses as HTML", "option",
-                   "dp", str),
-    displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
-def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None,
-             displacy_limit=25):
+    model=("Model name or path", "positional", None, str),
+    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    gpu_id=("Use GPU", "option", "g", int),
+    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
+    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
+)
+def evaluate(
+    model,
+    data_path,
+    gpu_id=-1,
+    gold_preproc=False,
+    displacy_path=None,
+    displacy_limit=25,
+):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an
     output directory as the displacy_path argument.
     """
+    msg = Printer()
     util.fix_random_seed()
     if gpu_id >= 0:
         util.use_gpu(gpu_id)
@@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
     data_path = util.ensure_path(data_path)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
-        prints(data_path, title=Messages.M034, exits=1)
+        msg.fail(Messages.M034, data_path, exits=1)
     if displacy_path and not displacy_path.exists():
-        prints(displacy_path, title=Messages.M035, exits=1)
+        msg.fail(Messages.M035, displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
     scorer = nlp.evaluate(dev_docs, verbose=False)
     end = timer()
     nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
-    print_results(scorer, time=end - begin, words=nwords,
-                  wps=nwords / (end - begin))
+    results = {
+        "Time": "%.2f s" % end - begin,
+        "Words": nwords,
+        "Words/s": "%.0f" % nwords / (end - begin),
+        "TOK": "%.2f" % scorer.token_acc,
+        "POS": "%.2f" % scorer.tags_acc,
+        "UAS": "%.2f" % scorer.uas,
+        "LAS": "%.2f" % scorer.las,
+        "NER P": "%.2f" % scorer.ents_p,
+        "NER R": "%.2f" % scorer.ents_r,
+        "NER F": "%.2f" % scorer.ents_f,
+    }
+    msg.table(results, title="Results")
+
     if displacy_path:
         docs, golds = zip(*dev_docs)
-        render_deps = 'parser' in nlp.meta.get('pipeline', [])
-        render_ents = 'ner' in nlp.meta.get('pipeline', [])
-        render_parses(docs, displacy_path, model_name=model,
-                      limit=displacy_limit, deps=render_deps, ents=render_ents)
-        prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
+        render_deps = "parser" in nlp.meta.get("pipeline", [])
+        render_ents = "ner" in nlp.meta.get("pipeline", [])
+        render_parses(
+            docs,
+            displacy_path,
+            model_name=model,
+            limit=displacy_limit,
+            deps=render_deps,
+            ents=render_ents,
+        )
+        msg.good(Messages.M036.format(n=displacy_limit), displacy_path)


-def render_parses(docs, output_path, model_name='', limit=250, deps=True,
-                  ents=True):
-    docs[0].user_data['title'] = model_name
+def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
+    docs[0].user_data["title"] = model_name
     if ents:
-        with (output_path / 'entities.html').open('w') as file_:
-            html = displacy.render(docs[:limit], style='ent', page=True)
+        with (output_path / "entities.html").open("w") as file_:
+            html = displacy.render(docs[:limit], style="ent", page=True)
             file_.write(html)
     if deps:
-        with (output_path / 'parses.html').open('w') as file_:
-            html = displacy.render(docs[:limit], style='dep', page=True,
-                                   options={'compact': True})
+        with (output_path / "parses.html").open("w") as file_:
+            html = displacy.render(
+                docs[:limit], style="dep", page=True, options={"compact": True}
+            )
             file_.write(html)


 def print_progress(itn, losses, dev_scores, wps=0.0):
     scores = {}
-    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
-                'ents_p', 'ents_r', 'ents_f', 'wps']:
+    for col in [
+        "dep_loss",
+        "tag_loss",
+        "uas",
+        "tags_acc",
+        "token_acc",
+        "ents_p",
+        "ents_r",
+        "ents_f",
+        "wps",
+    ]:
         scores[col] = 0.0
-    scores['dep_loss'] = losses.get('parser', 0.0)
-    scores['ner_loss'] = losses.get('ner', 0.0)
-    scores['tag_loss'] = losses.get('tagger', 0.0)
+    scores["dep_loss"] = losses.get("parser", 0.0)
+    scores["ner_loss"] = losses.get("ner", 0.0)
+    scores["tag_loss"] = losses.get("tagger", 0.0)
     scores.update(dev_scores)
-    scores['wps'] = wps
-    tpl = '\t'.join((
-        '{:d}',
-        '{dep_loss:.3f}',
-        '{ner_loss:.3f}',
-        '{uas:.3f}',
-        '{ents_p:.3f}',
-        '{ents_r:.3f}',
-        '{ents_f:.3f}',
-        '{tags_acc:.3f}',
-        '{token_acc:.3f}',
-        '{wps:.1f}'))
+    scores["wps"] = wps
+    tpl = "\t".join(
+        (
+            "{:d}",
+            "{dep_loss:.3f}",
+            "{ner_loss:.3f}",
+            "{uas:.3f}",
+            "{ents_p:.3f}",
+            "{ents_r:.3f}",
+            "{ents_f:.3f}",
+            "{tags_acc:.3f}",
+            "{token_acc:.3f}",
+            "{wps:.1f}",
+        )
+    )
     print(tpl.format(itn, **scores))
-
-
-def print_results(scorer, time, words, wps):
-    results = {
-        'Time': '%.2f s' % time,
-        'Words': words,
-        'Words/s': '%.0f' % wps,
-        'TOK': '%.2f' % scorer.token_acc,
-        'POS': '%.2f' % scorer.tags_acc,
-        'UAS': '%.2f' % scorer.uas,
-        'LAS': '%.2f' % scorer.las,
-        'NER P': '%.2f' % scorer.ents_p,
-        'NER R': '%.2f' % scorer.ents_r,
-        'NER F': '%.2f' % scorer.ents_f}
-    util.print_table(results, title="Results")
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import plac
 import platform
 from pathlib import Path
+from wasabi import Printer

 from ._messages import Messages
 from ..compat import path2str
@@ -12,56 +13,65 @@ from .. import about


 @plac.annotations(
-    model=("optional: shortcut link of model", "positional", None, str),
-    markdown=("generate Markdown for GitHub issues", "flag", "md", str),
-    silent=("don't print anything (just return)", "flag", "s"))
+    model=("Optional shortcut link of model", "positional", None, str),
+    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
+    silent=("Don't print anything (just return)", "flag", "s"),
+)
 def info(model=None, markdown=False, silent=False):
-    """Print info about spaCy installation. If a model shortcut link is
+    """
+    Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
+    msg = Printer()
     if model:
         if util.is_package(model):
             model_path = util.get_package_path(model)
         else:
             model_path = util.get_data_path() / model
-        meta_path = model_path / 'meta.json'
+        meta_path = model_path / "meta.json"
         if not meta_path.is_file():
-            util.prints(meta_path, title=Messages.M020, exits=1)
+            msg.fail(Messages.M020, meta_path, exits=1)
         meta = util.read_json(meta_path)
         if model_path.resolve() != model_path:
-            meta['link'] = path2str(model_path)
-            meta['source'] = path2str(model_path.resolve())
+            meta["link"] = path2str(model_path)
+            meta["source"] = path2str(model_path.resolve())
         else:
-            meta['source'] = path2str(model_path)
+            meta["source"] = path2str(model_path)
         if not silent:
-            print_info(meta, 'model %s' % model, markdown)
+            title = "Info about model '{}'".format(model)
+            model_meta = {
+                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
+            }
+            if markdown:
+                util.print_markdown(model_meta, title=title)
+            else:
+                msg.table(model_meta, title=title)
         return meta
-    data = {'spaCy version': about.__version__,
-            'Location': path2str(Path(__file__).parent.parent),
-            'Platform': platform.platform(),
-            'Python version': platform.python_version(),
-            'Models': list_models()}
+    data = {
+        "spaCy version": about.__version__,
+        "Location": path2str(Path(__file__).parent.parent),
+        "Platform": platform.platform(),
+        "Python version": platform.python_version(),
+        "Models": list_models(),
+    }
     if not silent:
-        print_info(data, 'spaCy', markdown)
+        title = "Info about spaCy"
+        if markdown:
+            util.print_markdown(data, title=title)
+        else:
+            msg.table(data, title=title)
     return data


-def print_info(data, title, markdown):
-    title = 'Info about %s' % title
-    if markdown:
-        util.print_markdown(data, title=title)
-    else:
-        util.print_table(data, title=title)
-
-
 def list_models():
     def exclude_dir(dir_name):
         # exclude common cache directories and hidden directories
-        exclude = ['cache', 'pycache', '__pycache__']
-        return dir_name in exclude or dir_name.startswith('.')
+        exclude = ("cache", "pycache", "__pycache__")
+        return dir_name in exclude or dir_name.startswith(".")

     data_path = util.get_data_path()
     if data_path:
         models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
-        return ', '.join([m for m in models if not exclude_dir(m)])
-    return '-'
+        return ", ".join([m for m in models if not exclude_dir(m)])
+    return "-"
@@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
 import tarfile
 import gzip
 import zipfile
-import ujson as json
-from spacy.lexeme import intify_attrs
+from wasabi import Printer

 from ._messages import Messages
 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
-from ..util import prints, ensure_path, get_lang_class
+from ..util import ensure_path, get_lang_class, read_jsonl

 try:
     import ftfy
@@ -25,121 +24,133 @@ except ImportError:
     ftfy = None


+msg = Printer()
+
+
 @plac.annotations(
-    lang=("model language", "positional", None, str),
-    output_dir=("model output directory", "positional", None, Path),
-    freqs_loc=("location of words frequencies file", "option", "f", Path),
-    jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path),
-    clusters_loc=("optional: location of brown clusters data",
-                  "option", "c", str),
-    vectors_loc=("optional: location of vectors file in Word2Vec format "
-                 "(either as .txt or zipped as .zip or .tar.gz)", "option",
-                 "v", str),
-    prune_vectors=("optional: number of vectors to prune to",
-                   "option", "V", int)
+    lang=("Model language", "positional", None, str),
+    output_dir=("Model output directory", "positional", None, Path),
+    freqs_loc=("Location of words frequencies file", "option", "f", Path),
+    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
+    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
+    vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
+    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
 )
-def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None,
-               vectors_loc=None, prune_vectors=-1):
+def init_model(
+    lang,
+    output_dir,
+    freqs_loc=None,
+    clusters_loc=None,
+    jsonl_loc=None,
+    vectors_loc=None,
+    prune_vectors=-1,
+):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
-    and word vectors.
+    and word vectors. If vectors are provided in Word2Vec format, they can
+    be either a .txt or zipped as a .zip or .tar.gz.
     """
     if jsonl_loc is not None:
         if freqs_loc is not None or clusters_loc is not None:
-            settings = ['-j']
+            settings = ["-j"]
             if freqs_loc:
-                settings.append('-f')
+                settings.append("-f")
             if clusters_loc:
-                settings.append('-c')
-            prints(' '.join(settings),
-                   title=(
-                       "The -f and -c arguments are deprecated, and not compatible "
-                       "with the -j argument, which should specify the same information. "
-                       "Either merge the frequencies and clusters data into the "
-                       "jsonl-formatted file (recommended), or use only the -f and "
-                       "-c files, without the other lexical attributes."))
+                settings.append("-c")
+            msg.warn(Messages.M063, Messages.M064)
         jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = (json.loads(line) for line in jsonl_loc.open())
+        lex_attrs = read_jsonl(jsonl_loc)
     else:
         clusters_loc = ensure_path(clusters_loc)
         freqs_loc = ensure_path(freqs_loc)
         if freqs_loc is not None and not freqs_loc.exists():
-            prints(freqs_loc, title=Messages.M037, exits=1)
+            msg.fail(Messages.M037, freqs_loc, exits=1)
         lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

-    nlp = create_model(lang, lex_attrs)
+    with msg.loading("Creating model..."):
+        nlp = create_model(lang, lex_attrs)
+    msg.good("Successfully created model")
     if vectors_loc is not None:
         add_vectors(nlp, vectors_loc, prune_vectors)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
-    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
-           title=Messages.M038)
+    msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)
     return nlp


 def open_file(loc):
-    '''Handle .gz, .tar.gz or unzipped files'''
+    """Handle .gz, .tar.gz or unzipped files"""
     loc = ensure_path(loc)
-    print("Open loc")
     if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), 'r:gz')
-    elif loc.parts[-1].endswith('gz'):
-        return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
-    elif loc.parts[-1].endswith('zip'):
+        return tarfile.open(str(loc), "r:gz")
+    elif loc.parts[-1].endswith("gz"):
+        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+    elif loc.parts[-1].endswith("zip"):
         zip_file = zipfile.ZipFile(str(loc))
         names = zip_file.namelist()
         file_ = zip_file.open(names[0])
-        return (line.decode('utf8') for line in file_)
+        return (line.decode("utf8") for line in file_)
     else:
-        return loc.open('r', encoding='utf8')
+        return loc.open("r", encoding="utf8")


 def read_attrs_from_deprecated(freqs_loc, clusters_loc):
-    probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
-    clusters = read_clusters(clusters_loc) if clusters_loc else {}
+    with msg.loading("Counting frequencies..."):
+        probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
+    msg.good("Counted frequencies")
+    with msg.loading("Reading clusters..."):
+        clusters = read_clusters(clusters_loc) if clusters_loc else {}
+    msg.good("Read clusters")
     lex_attrs = []
     sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
     for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-        attrs = {'orth': word, 'id': i, 'prob': prob}
+        attrs = {"orth": word, "id": i, "prob": prob}
         # Decode as a little-endian string, so that we can do & 15 to get
         # the first 4 bits. See _parse_features.pyx
         if word in clusters:
-            attrs['cluster'] = int(clusters[word][::-1], 2)
+            attrs["cluster"] = int(clusters[word][::-1], 2)
         else:
-            attrs['cluster'] = 0
+            attrs["cluster"] = 0
         lex_attrs.append(attrs)
     return lex_attrs


 def create_model(lang, lex_attrs):
-    print("Creating model...")
     lang_class = get_lang_class(lang)
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = 0
     lex_added = 0
     for attrs in lex_attrs:
-        if 'settings' in attrs:
+        if "settings" in attrs:
             continue
-        lexeme = nlp.vocab[attrs['orth']]
+        lexeme = nlp.vocab[attrs["orth"]]
         lexeme.set_attrs(**attrs)
         lexeme.is_oov = False
         lex_added += 1
         lex_added += 1
     oov_prob = min(lex.prob for lex in nlp.vocab)
-    nlp.vocab.cfg.update({'oov_prob': oov_prob-1})
+    nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
     return nlp


 def add_vectors(nlp, vectors_loc, prune_vectors):
     vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith('.npz'):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb')))
+    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
         for lex in nlp.vocab:
             if lex.rank:
                 nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
-        vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
+        if vectors_loc:
+            with msg.loading("Reading vectors from {}".format(vectors_loc)):
+                vectors_data, vector_keys = read_vectors(vectors_loc)
+            msg.good("Loaded vectors from {}".format(vectors_loc))
+        else:
+            vectors_data, vector_keys = (None, None)
     if vector_keys is not None:
         for word in vector_keys:
             if word not in nlp.vocab:
@@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
                 lexeme.is_oov = False
     if vectors_data is not None:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-        nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang']
-        nlp.meta['vectors']['name'] = nlp.vocab.vectors.name
+        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
+        nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)


 def read_vectors(vectors_loc):
-    print("Reading vectors from %s" % vectors_loc)
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
-    vectors_data = numpy.zeros(shape=shape, dtype='f')
+    vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
         line = line.rstrip()
-        pieces = line.rsplit(' ', vectors_data.shape[1]+1)
+        pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
         word = pieces.pop(0)
         if len(pieces) != vectors_data.shape[1]:
             raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
|
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||||
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||||
vectors_keys.append(word)
|
vectors_keys.append(word)
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
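For context, a sketch of the plain-text layout read_vectors() expects (the words and numbers here are invented): a header line "<rows> <dims>", then one whitespace-separated row per word.

sample = """2 3
apple 0.1 0.2 0.3
banana 0.4 0.5 0.6
"""
lines = sample.splitlines()
shape = tuple(int(size) for size in lines[0].split())          # (2, 3)
rows = [line.rsplit(" ", shape[1] + 1) for line in lines[1:]]
words = [pieces[0] for pieces in rows]                         # ["apple", "banana"]
vectors = [[float(p) for p in pieces[1:]] for pieces in rows]  # 2 rows of 3 floats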
|
|
||||||
|
|
||||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
print("Counting frequencies...")
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
total = 0
|
total = 0
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
for i, line in enumerate(f):
|
for i, line in enumerate(f):
|
||||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||||
freq = int(freq)
|
freq = int(freq)
|
||||||
counts.inc(i + 1, freq)
|
counts.inc(i + 1, freq)
|
||||||
total += freq
|
total += freq
|
||||||
|
@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
probs = {}
|
probs = {}
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
for line in tqdm(f):
|
for line in tqdm(f):
|
||||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||||
doc_freq = int(doc_freq)
|
doc_freq = int(doc_freq)
|
||||||
freq = int(freq)
|
freq = int(freq)
|
||||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||||
|
@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
print("Reading clusters...")
|
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
user_warning(Warnings.W004)
|
||||||
|
@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
|
||||||
if int(freq) >= 3:
|
if int(freq) >= 3:
|
||||||
clusters[word] = cluster
|
clusters[word] = cluster
|
||||||
else:
|
else:
|
||||||
clusters[word] = '0'
|
clusters[word] = "0"
|
||||||
# Expand clusters with re-casing
|
# Expand clusters with re-casing
|
||||||
for word, cluster in list(clusters.items()):
|
for word, cluster in list(clusters.items()):
|
||||||
if word.lower() not in clusters:
|
if word.lower() not in clusters:
|
||||||
|
|
|
@ -3,51 +3,54 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import symlink_to, path2str
|
from ..compat import symlink_to, path2str
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
origin=("package name or local path to model", "positional", None, str),
|
origin=("package name or local path to model", "positional", None, str),
|
||||||
link_name=("name of shortcut link to create", "positional", None, str),
|
link_name=("name of shortcut link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool))
|
force=("force overwriting of existing link", "flag", "f", bool),
|
||||||
|
)
|
||||||
def link(origin, link_name, force=False, model_path=None):
|
def link(origin, link_name, force=False, model_path=None):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
either the name of a pip package, or the local path to the model data
|
either the name of a pip package, or the local path to the model data
|
||||||
directory. Linking models allows loading them via spacy.load(link_name).
|
directory. Linking models allows loading them via spacy.load(link_name).
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
if util.is_package(origin):
|
if util.is_package(origin):
|
||||||
model_path = util.get_package_path(origin)
|
model_path = util.get_package_path(origin)
|
||||||
else:
|
else:
|
||||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
prints(Messages.M009.format(path=path2str(model_path)),
|
msg.fail(
|
||||||
title=Messages.M008, exits=1)
|
Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
|
||||||
|
)
|
||||||
data_path = util.get_data_path()
|
data_path = util.get_data_path()
|
||||||
if not data_path or not data_path.exists():
|
if not data_path or not data_path.exists():
|
||||||
spacy_loc = Path(__file__).parent.parent
|
spacy_loc = Path(__file__).parent.parent
|
||||||
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
|
msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
|
||||||
link_path = util.get_data_path() / link_name
|
link_path = util.get_data_path() / link_name
|
||||||
if link_path.is_symlink() and not force:
|
if link_path.is_symlink() and not force:
|
||||||
prints(Messages.M013, title=Messages.M012.format(name=link_name),
|
msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
|
||||||
exits=1)
|
|
||||||
elif link_path.is_symlink(): # does a symlink exist?
|
elif link_path.is_symlink(): # does a symlink exist?
|
||||||
# NB: It's important to check for is_symlink here and not for exists,
|
# NB: It's important to check for is_symlink here and not for exists,
|
||||||
# because invalid/outdated symlinks would return False otherwise.
|
# because invalid/outdated symlinks would return False otherwise.
|
||||||
link_path.unlink()
|
link_path.unlink()
|
||||||
elif link_path.exists(): # does it exist otherwise?
|
elif link_path.exists(): # does it exist otherwise?
|
||||||
# NB: Check this last because valid symlinks also "exist".
|
# NB: Check this last because valid symlinks also "exist".
|
||||||
prints(Messages.M015, link_path,
|
msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
|
||||||
title=Messages.M014.format(name=link_name), exits=1)
|
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||||
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
|
||||||
try:
|
try:
|
||||||
symlink_to(link_path, model_path)
|
symlink_to(link_path, model_path)
|
||||||
except:
|
except: # noqa: E722
|
||||||
# This is quite dirty, but just making sure other errors are caught.
|
# This is quite dirty, but just making sure other errors are caught.
|
||||||
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
|
msg.fail(Messages.M016.format(name=link_name), Messages.M017)
|
||||||
|
msg.text(details)
|
||||||
raise
|
raise
|
||||||
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
|
msg.good(Messages.M018, details)
|
||||||
|
msg.text(Messages.M019.format(name=link_name))
|
||||||
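A hypothetical usage sketch of what the command above enables (the package and shortcut names are placeholders and assume the package is installed): once the symlink exists, the model loads by its shortcut name.

import spacy
from spacy.cli import link

link("en_core_web_sm", "en_small")   # creates spacy/data/en_small -> installed package
nlp = spacy.load("en_small")         # now resolves through the shortcut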
|
|
|
@ -4,109 +4,106 @@ from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer, get_raw_input
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, json_dumps
|
from ..compat import path2str, json_dumps
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
input_dir=("directory with model data", "positional", None, str),
|
input_dir=("Directory with model data", "positional", None, str),
|
||||||
output_dir=("output parent directory", "positional", None, str),
|
output_dir=("Output parent directory", "positional", None, str),
|
||||||
meta_path=("path to meta.json", "option", "m", str),
|
meta_path=("Path to meta.json", "option", "m", str),
|
||||||
create_meta=("create meta.json, even if one exists in directory – if "
|
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||||
"existing meta is found, entries are shown as defaults in "
|
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||||
"the command line prompt", "flag", "c", bool),
|
)
|
||||||
force=("force overwriting of existing model directory in output directory",
|
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||||
"flag", "f", bool))
|
|
||||||
def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
|
||||||
force=False):
|
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
output directory, and model data will be copied over.
|
output directory, and model data will be copied over. If --create-meta is
|
||||||
|
set and a meta.json already exists in the output directory, the existing
|
||||||
|
values will be used as the defaults in the command-line prompt.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
input_path = util.ensure_path(input_dir)
|
input_path = util.ensure_path(input_dir)
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not input_path or not input_path.exists():
|
if not input_path or not input_path.exists():
|
||||||
prints(input_path, title=Messages.M008, exits=1)
|
msg.fail(Messages.M008, input_path, exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
prints(output_path, title=Messages.M040, exits=1)
|
msg.fail(Messages.M040, output_path, exits=1)
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
|
|
||||||
meta_path = meta_path or input_path / 'meta.json'
|
meta_path = meta_path or input_path / "meta.json"
|
||||||
if meta_path.is_file():
|
if meta_path.is_file():
|
||||||
meta = util.read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
if not create_meta: # only print this if user doesn't want to overwrite
|
if not create_meta: # only print if user doesn't want to overwrite
|
||||||
prints(meta_path, title=Messages.M041)
|
msg.good(Messages.M041, meta_path)
|
||||||
else:
|
else:
|
||||||
meta = generate_meta(input_dir, meta)
|
meta = generate_meta(input_dir, meta, msg)
|
||||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
for key in ("lang", "name", "version"):
|
||||||
model_name = meta['lang'] + '_' + meta['name']
|
if key not in meta or meta[key] == "":
|
||||||
model_name_v = model_name + '-' + meta['version']
|
msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
|
||||||
|
model_name = meta["lang"] + "_" + meta["name"]
|
||||||
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_path / model_name_v
|
main_path = output_path / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
|
||||||
shutil.copytree(path2str(input_path),
|
|
||||||
path2str(package_path / model_name_v))
|
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
|
||||||
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
|
|
||||||
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
|
|
||||||
create_file(package_path / '__init__.py', TEMPLATE_INIT)
|
|
||||||
prints(main_path, Messages.M043,
|
|
||||||
title=Messages.M042.format(name=model_name_v))
|
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(path2str(package_path))
|
shutil.rmtree(path2str(package_path))
|
||||||
else:
|
else:
|
||||||
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
|
msg.fail(
|
||||||
|
Messages.M044,
|
||||||
|
Messages.M045.format(path=path2str(package_path)),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||||
|
create_file(main_path / "meta.json", json_dumps(meta))
|
||||||
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
|
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||||
|
msg.good(Messages.M042.format(name=model_name_v), main_path)
|
||||||
|
msg.text(Messages.M043)
|
||||||
|
|
||||||
|
|
||||||
def create_file(file_path, contents):
|
def create_file(file_path, contents):
|
||||||
file_path.touch()
|
file_path.touch()
|
||||||
file_path.open('w', encoding='utf-8').write(contents)
|
file_path.open("w", encoding="utf-8").write(contents)
|
||||||
|
|
||||||
|
|
||||||
def generate_meta(model_path, existing_meta):
|
def generate_meta(model_path, existing_meta, msg):
|
||||||
meta = existing_meta or {}
|
meta = existing_meta or {}
|
||||||
settings = [('lang', 'Model language', meta.get('lang', 'en')),
|
settings = [
|
||||||
('name', 'Model name', meta.get('name', 'model')),
|
("lang", "Model language", meta.get("lang", "en")),
|
||||||
('version', 'Model version', meta.get('version', '0.0.0')),
|
("name", "Model name", meta.get("name", "model")),
|
||||||
('spacy_version', 'Required spaCy version',
|
("version", "Model version", meta.get("version", "0.0.0")),
|
||||||
'>=%s,<3.0.0' % about.__version__),
|
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||||
('description', 'Model description',
|
("description", "Model description", meta.get("description", False)),
|
||||||
meta.get('description', False)),
|
("author", "Author", meta.get("author", False)),
|
||||||
('author', 'Author', meta.get('author', False)),
|
("email", "Author email", meta.get("email", False)),
|
||||||
('email', 'Author email', meta.get('email', False)),
|
("url", "Author website", meta.get("url", False)),
|
||||||
('url', 'Author website', meta.get('url', False)),
|
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||||
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
|
]
|
||||||
nlp = util.load_model_from_path(Path(model_path))
|
nlp = util.load_model_from_path(Path(model_path))
|
||||||
meta['pipeline'] = nlp.pipe_names
|
meta["pipeline"] = nlp.pipe_names
|
||||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
meta["vectors"] = {
|
||||||
'vectors': len(nlp.vocab.vectors),
|
"width": nlp.vocab.vectors_length,
|
||||||
'keys': nlp.vocab.vectors.n_keys}
|
"vectors": len(nlp.vocab.vectors),
|
||||||
prints(Messages.M047, title=Messages.M046)
|
"keys": nlp.vocab.vectors.n_keys,
|
||||||
|
}
|
||||||
|
msg.divider(Messages.M046)
|
||||||
|
msg.text(Messages.M047)
|
||||||
for setting, desc, default in settings:
|
for setting, desc, default in settings:
|
||||||
response = util.get_raw_input(desc, default)
|
response = get_raw_input(desc, default)
|
||||||
meta[setting] = default if response == '' and default else response
|
meta[setting] = default if response == "" and default else response
|
||||||
if about.__title__ != 'spacy':
|
if about.__title__ != "spacy":
|
||||||
meta['parent_package'] = about.__title__
|
meta["parent_package"] = about.__title__
|
||||||
return meta
|
|
||||||
|
|
||||||
|
|
||||||
def validate_meta(meta, keys):
|
|
||||||
for key in keys:
|
|
||||||
if key not in meta or meta[key] == '':
|
|
||||||
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
|
|
||||||
return meta
|
return meta
|
||||||
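For reference, a minimal meta dictionary that satisfies the lang/name/version check above (all values are placeholders):

meta = {
    "lang": "en",                 # two-letter language code
    "name": "example_model",      # full package name becomes en_example_model
    "version": "0.0.1",
    "description": "Placeholder description",
    "license": "CC BY-SA 3.0",
}
for key in ("lang", "name", "version"):
    assert meta.get(key), "required meta field missing: %s" % key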
|
|
||||||
|
|
||||||
|
|
|
@ -1,66 +1,148 @@
|
||||||
'''This script is experimental.
|
# coding: utf8
|
||||||
|
|
||||||
Try pre-training the CNN component of the text categorizer using a cheap
|
|
||||||
language modelling-like objective. Specifically, we load pre-trained vectors
|
|
||||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
|
||||||
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
|
|
||||||
we're not merely doing compression here, because heavy dropout is applied,
|
|
||||||
including over the input words. This means the model must often (50% of the time)
|
|
||||||
use the context in order to predict the word.
|
|
||||||
|
|
||||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
|
||||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
|
||||||
pre-train with the development data, but also not *so* terrible: we're not using
|
|
||||||
the development labels, after all --- only the unlabelled text.
|
|
||||||
'''
|
|
||||||
from __future__ import print_function, unicode_literals
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import numpy
|
import numpy
|
||||||
import time
|
import time
|
||||||
import ujson as json
|
import ujson
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
import spacy
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.attrs import ID, HEAD
|
|
||||||
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
|
|
||||||
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
|
||||||
from thinc.v2v import Affine, Maxout
|
from thinc.v2v import Affine, Maxout
|
||||||
from thinc.api import wrap
|
from thinc.api import wrap
|
||||||
from thinc.misc import LayerNorm as LN
|
from thinc.misc import LayerNorm as LN
|
||||||
|
from thinc.neural.util import prefer_gpu
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ..tokens import Doc
|
||||||
|
from ..attrs import ID, HEAD
|
||||||
|
from ..compat import json_dumps
|
||||||
|
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def prefer_gpu():
|
@plac.annotations(
|
||||||
used = spacy.util.use_gpu(0)
|
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
||||||
if used is None:
|
vectors_model=("Name or path to vectors model to learn from"),
|
||||||
return False
|
output_dir=("Directory to write models each epoch", "positional", None, str),
|
||||||
else:
|
width=("Width of CNN layers", "option", "cw", int),
|
||||||
import cupy.random
|
depth=("Depth of CNN layers", "option", "cd", int),
|
||||||
cupy.random.seed(0)
|
embed_rows=("Embedding rows", "option", "er", int),
|
||||||
return True
|
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||||
|
dropout=("Dropout", "option", "d", float),
|
||||||
|
seed=("Seed for random number generators", "option", "s", float),
|
||||||
|
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||||
|
)
|
||||||
|
def pretrain(
|
||||||
|
texts_loc,
|
||||||
|
vectors_model,
|
||||||
|
output_dir,
|
||||||
|
width=96,
|
||||||
|
depth=4,
|
||||||
|
embed_rows=2000,
|
||||||
|
use_vectors=False,
|
||||||
|
dropout=0.2,
|
||||||
|
nr_iter=1000,
|
||||||
|
seed=0,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||||
|
using an approximate language-modelling objective. Specifically, we load
|
||||||
|
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||||
|
vectors which match the pre-trained ones. The weights are saved to a directory
|
||||||
|
after each epoch. You can then pass a path to one of these pre-trained weights
|
||||||
|
files to the 'spacy train' command.
|
||||||
|
|
||||||
|
This technique may be especially helpful if you have little labelled data.
|
||||||
|
However, it's still quite experimental, so your mileage may vary.
|
||||||
|
|
||||||
def load_texts(path):
|
To load the weights back in during 'spacy train', you need to ensure
|
||||||
'''Load inputs from a jsonl file.
|
all settings are the same between pretraining and training. The API and
|
||||||
|
errors around this need some improvement.
|
||||||
Each line should be a dict like {"text": "..."}
|
"""
|
||||||
'''
|
config = dict(locals())
|
||||||
path = ensure_path(path)
|
msg = Printer()
|
||||||
with path.open('r', encoding='utf8') as file_:
|
util.fix_random_seed(seed)
|
||||||
texts = [json.loads(line) for line in file_]
|
|
||||||
random.shuffle(texts)
|
has_gpu = prefer_gpu()
|
||||||
return texts
|
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||||
|
|
||||||
|
output_dir = Path(output_dir)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
msg.good("Created output directory")
|
||||||
|
util.write_json(output_dir / "config.json", config)
|
||||||
|
msg.good("Saved settings to config.json")
|
||||||
|
|
||||||
|
# Load texts from file or stdin
|
||||||
|
if texts_loc != "-": # reading from a file
|
||||||
|
texts_loc = Path(texts_loc)
|
||||||
|
if not texts_loc.exists():
|
||||||
|
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||||
|
with msg.loading("Loading input texts..."):
|
||||||
|
texts = list(util.read_jsonl(texts_loc))
|
||||||
|
msg.good("Loaded input texts")
|
||||||
|
random.shuffle(texts)
|
||||||
|
else: # reading from stdin
|
||||||
|
msg.text("Reading input text from stdin...")
|
||||||
|
texts = stream_texts()
|
||||||
|
|
||||||
|
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||||
|
nlp = util.load_model(vectors_model)
|
||||||
|
msg.good("Loaded model '{}'".format(vectors_model))
|
||||||
|
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||||
|
model = create_pretraining_model(
|
||||||
|
nlp,
|
||||||
|
Tok2Vec(
|
||||||
|
width,
|
||||||
|
embed_rows,
|
||||||
|
conv_depth=depth,
|
||||||
|
pretrained_vectors=pretrained_vectors,
|
||||||
|
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||||
|
cnn_maxout_pieces=2, # You can try setting this higher
|
||||||
|
subword_features=True,
|
||||||
|
),
|
||||||
|
) # subword_features (above): set to False for character models, e.g. Chinese
|
||||||
|
optimizer = create_default_optimizer(model.ops)
|
||||||
|
tracker = ProgressTracker()
|
||||||
|
msg.divider("Pre-training tok2vec layer")
|
||||||
|
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||||
|
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||||
|
for epoch in range(nr_iter):
|
||||||
|
for batch in util.minibatch_by_words(
|
||||||
|
((text, None) for text in texts), size=5000
|
||||||
|
):
|
||||||
|
docs = make_docs(nlp, [text for (text, _) in batch])
|
||||||
|
loss = make_update(model, docs, optimizer, drop=dropout)
|
||||||
|
progress = tracker.update(epoch, loss, docs)
|
||||||
|
if progress:
|
||||||
|
msg.row(progress, **row_settings)
|
||||||
|
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||||
|
break
|
||||||
|
with model.use_params(optimizer.averages):
|
||||||
|
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
|
||||||
|
file_.write(model.tok2vec.to_bytes())
|
||||||
|
log = {
|
||||||
|
"nr_word": tracker.nr_word,
|
||||||
|
"loss": tracker.loss,
|
||||||
|
"epoch_loss": tracker.epoch_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
}
|
||||||
|
with (output_dir / "log.jsonl").open("a") as file_:
|
||||||
|
file_.write(json_dumps(log) + "\n")
|
||||||
|
tracker.epoch_loss = 0.0
|
||||||
|
if texts_loc != "-":
|
||||||
|
# Reshuffle the texts if texts were loaded from a file
|
||||||
|
random.shuffle(texts)
|
||||||
|
|
||||||
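A hedged end-to-end sketch of feeding the command above. The file name, model name and iteration count are invented, and it assumes pretrain is importable from spacy.cli like the other commands and that the vectors model is installed.

import ujson
from spacy.cli import pretrain

# One {"text": ...} object per line -- the shape util.read_jsonl() expects
with open("texts.jsonl", "w") as f:
    for text in ["This is a raw sentence.", "Another unlabelled text."]:
        f.write(ujson.dumps({"text": text}) + "\n")

# Roughly: python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain_out -i 2
pretrain("texts.jsonl", "en_vectors_web_lg", "./pretrain_out", nr_iter=2)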
|
|
||||||
def stream_texts():
|
def stream_texts():
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
yield json.loads(line)
|
yield ujson.loads(line)
|
||||||
|
|
||||||
|
|
||||||
def make_update(model, docs, optimizer, drop=0.):
|
def make_update(model, docs, optimizer, drop=0.0):
|
||||||
"""Perform an update over a single batch of documents.
|
"""Perform an update over a single batch of documents.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
|
||||||
# Don't want to return a cupy object here
|
# Don't want to return a cupy object here
|
||||||
# The gradients are modified in-place by the BERT MLM,
|
# The gradients are modified in-place by the BERT MLM,
|
||||||
# so we get an accurate loss
|
# so we get an accurate loss
|
||||||
loss = float((gradients**2).mean())
|
loss = float((gradients ** 2).mean())
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
|
|
||||||
|
@ -98,7 +180,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
|
||||||
|
|
||||||
def get_vectors_loss(ops, docs, prediction):
|
def get_vectors_loss(ops, docs, prediction):
|
||||||
"""Compute a mean-squared error loss between the documents' vectors and
|
"""Compute a mean-squared error loss between the documents' vectors and
|
||||||
the prediction.
|
the prediction.
|
||||||
|
|
||||||
Note that this is ripe for customization! We could compute the vectors
|
Note that this is ripe for customization! We could compute the vectors
|
||||||
in some other way, e.g. with an LSTM language model, or use some other
|
in some other way, e.g. with an LSTM language model, or use some other
|
||||||
|
@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
|
||||||
|
|
||||||
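A minimal sketch of the objective get_vectors_loss() describes: mean-squared error between each token's pretrained vector and the model's prediction. The shapes and plain-numpy arrays are assumptions for illustration; the real function gathers the target vectors from the docs.

import numpy

def mse_vectors_loss(target, prediction):
    # target, prediction: float arrays of shape (n_tokens, width)
    d_target = prediction - target        # gradient of the squared error
    loss = (d_target ** 2).sum()
    return loss, d_target

target = numpy.zeros((4, 3), dtype="f")
prediction = numpy.ones((4, 3), dtype="f")
loss, grad = mse_vectors_loss(target, prediction)   # loss == 12.0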
|
|
||||||
def create_pretraining_model(nlp, tok2vec):
|
def create_pretraining_model(nlp, tok2vec):
|
||||||
'''Define a network for the pretraining. We simply add an output layer onto
|
"""Define a network for the pretraining. We simply add an output layer onto
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||||
Each array in the output needs to have one row per token in the doc.
|
Each array in the output needs to have one row per token in the doc.
|
||||||
'''
|
"""
|
||||||
output_size = nlp.vocab.vectors.data.shape[1]
|
output_size = nlp.vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
LN(Maxout(300, pieces=3)),
|
LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
|
||||||
zero_init(Affine(output_size, drop_factor=0.0))
|
|
||||||
)
|
)
|
||||||
# This is annoying, but the parser etc have the flatten step after
|
# This is annoying, but the parser etc have the flatten step after
|
||||||
# the tok2vec. To load the weights in cleanly, we need to match
|
# the tok2vec. To load the weights in cleanly, we need to match
|
||||||
# the shape of the models' components exactly. So what we call
|
# the shape of the models' components exactly. So what we call
|
||||||
# "tok2vec" has to be the same set of processes as what the components do.
|
# "tok2vec" has to be the same set of processes as what the components do.
|
||||||
tok2vec = chain(tok2vec, flatten)
|
tok2vec = chain(tok2vec, flatten)
|
||||||
model = chain(
|
model = chain(tok2vec, output_layer)
|
||||||
tok2vec,
|
|
||||||
output_layer
|
|
||||||
)
|
|
||||||
model = masked_language_model(nlp.vocab, model)
|
model = masked_language_model(nlp.vocab, model)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
model.output_layer = output_layer
|
model.output_layer = output_layer
|
||||||
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
|
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def masked_language_model(vocab, model, mask_prob=0.15):
|
def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
'''Convert a model into a BERT-style masked language model'''
|
"""Convert a model into a BERT-style masked language model"""
|
||||||
|
|
||||||
random_words = RandomWords(vocab)
|
random_words = RandomWords(vocab)
|
||||||
def mlm_forward(docs, drop=0.):
|
|
||||||
|
def mlm_forward(docs, drop=0.0):
|
||||||
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
||||||
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
||||||
output, backprop = model.begin_update(docs, drop=drop)
|
output, backprop = model.begin_update(docs, drop=drop)
|
||||||
|
|
||||||
def mlm_backward(d_output, sgd=None):
|
def mlm_backward(d_output, sgd=None):
|
||||||
d_output *= 1-mask
|
d_output *= 1 - mask
|
||||||
return backprop(d_output, sgd=sgd)
|
return backprop(d_output, sgd=sgd)
|
||||||
|
|
||||||
return output, mlm_backward
|
return output, mlm_backward
|
||||||
|
@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
|
|
||||||
def apply_mask(docs, random_words, mask_prob=0.15):
|
def apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
N = sum(len(doc) for doc in docs)
|
N = sum(len(doc) for doc in docs)
|
||||||
mask = numpy.random.uniform(0., 1.0, (N,))
|
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
||||||
mask = mask >= mask_prob
|
mask = mask >= mask_prob
|
||||||
i = 0
|
i = 0
|
||||||
masked_docs = []
|
masked_docs = []
|
||||||
|
@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
return mask, masked_docs
|
return mask, masked_docs
|
||||||
|
|
||||||
|
|
||||||
def replace_word(word, random_words, mask='[MASK]'):
|
def replace_word(word, random_words, mask="[MASK]"):
|
||||||
roll = random.random()
|
roll = random.random()
|
||||||
if roll < 0.8:
|
if roll < 0.8:
|
||||||
return mask
|
return mask
|
||||||
|
@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'):
|
||||||
else:
|
else:
|
||||||
return word
|
return word
|
||||||
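A sketch of the replacement policy replace_word() implements. The exact probability split sits in the elided hunk above, so the usual BERT-style 80/10/10 scheme is assumed here rather than quoted from the source.

import random

def sketch_replace(word, random_word, mask="[MASK]"):
    roll = random.random()
    if roll < 0.8:
        return mask          # mostly: substitute the mask symbol
    elif roll < 0.9:
        return random_word   # sometimes: substitute a random vocabulary word
    else:
        return word          # otherwise: keep the original word unchanged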
|
|
||||||
|
|
||||||
class RandomWords(object):
|
class RandomWords(object):
|
||||||
def __init__(self, vocab):
|
def __init__(self, vocab):
|
||||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||||
self.words = self.words[:10000]
|
self.words = self.words[:10000]
|
||||||
self.probs = self.probs[:10000]
|
self.probs = self.probs[:10000]
|
||||||
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
|
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
|
||||||
self.probs /= self.probs.sum()
|
self.probs /= self.probs.sum()
|
||||||
self._cache = []
|
self._cache = []
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
if not self._cache:
|
if not self._cache:
|
||||||
self._cache.extend(numpy.random.choice(len(self.words), 10000,
|
self._cache.extend(
|
||||||
p=self.probs))
|
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||||
|
)
|
||||||
index = self._cache.pop()
|
index = self._cache.pop()
|
||||||
return self.words[index]
|
return self.words[index]
|
||||||
|
|
||||||
|
|
||||||
class ProgressTracker(object):
|
class ProgressTracker(object):
|
||||||
def __init__(self, frequency=1000000):
|
def __init__(self, frequency=1000000):
|
||||||
|
@ -245,76 +326,3 @@ class ProgressTracker(object):
|
||||||
return status
|
return status
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
|
||||||
vectors_model=("Name or path to vectors model to learn from"),
|
|
||||||
output_dir=("Directory to write models each epoch", "positional", None, str),
|
|
||||||
width=("Width of CNN layers", "option", "cw", int),
|
|
||||||
depth=("Depth of CNN layers", "option", "cd", int),
|
|
||||||
embed_rows=("Embedding rows", "option", "er", int),
|
|
||||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
|
||||||
dropout=("Dropout", "option", "d", float),
|
|
||||||
seed=("Seed for random number generators", "option", "s", float),
|
|
||||||
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
|
||||||
)
|
|
||||||
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
|
|
||||||
embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
|
|
||||||
"""
|
|
||||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
|
||||||
using an approximate language-modelling objective. Specifically, we load
|
|
||||||
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
|
||||||
vectors which match the pre-trained ones. The weights are saved to a directory
|
|
||||||
after each epoch. You can then pass a path to one of these pre-trained weights
|
|
||||||
files to the 'spacy train' command.
|
|
||||||
|
|
||||||
This technique may be especially helpful if you have little labelled data.
|
|
||||||
However, it's still quite experimental, so your mileage may vary.
|
|
||||||
|
|
||||||
To load the weights back in during 'spacy train', you need to ensure
|
|
||||||
all settings are the same between pretraining and training. The API and
|
|
||||||
errors around this need some improvement.
|
|
||||||
"""
|
|
||||||
config = dict(locals())
|
|
||||||
output_dir = ensure_path(output_dir)
|
|
||||||
random.seed(seed)
|
|
||||||
numpy.random.seed(seed)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
with (output_dir / 'config.json').open('w') as file_:
|
|
||||||
file_.write(json.dumps(config))
|
|
||||||
has_gpu = prefer_gpu()
|
|
||||||
print("Use GPU?", has_gpu)
|
|
||||||
nlp = spacy.load(vectors_model)
|
|
||||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
|
||||||
model = create_pretraining_model(nlp,
|
|
||||||
Tok2Vec(width, embed_rows,
|
|
||||||
conv_depth=depth,
|
|
||||||
pretrained_vectors=pretrained_vectors,
|
|
||||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
|
||||||
cnn_maxout_pieces=2, # You can try setting this higher
|
|
||||||
subword_features=True)) # Set to False for character models, e.g. Chinese
|
|
||||||
optimizer = create_default_optimizer(model.ops)
|
|
||||||
tracker = ProgressTracker()
|
|
||||||
print('Epoch', '#Words', 'Loss', 'w/s')
|
|
||||||
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
|
|
||||||
for epoch in range(nr_iter):
|
|
||||||
for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
|
|
||||||
docs = make_docs(nlp, [text for (text, _) in batch])
|
|
||||||
loss = make_update(model, docs, optimizer, drop=dropout)
|
|
||||||
progress = tracker.update(epoch, loss, docs)
|
|
||||||
if progress:
|
|
||||||
print(*progress)
|
|
||||||
if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
|
|
||||||
break
|
|
||||||
with model.use_params(optimizer.averages):
|
|
||||||
with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
|
|
||||||
file_.write(model.tok2vec.to_bytes())
|
|
||||||
with (output_dir / 'log.jsonl').open('a') as file_:
|
|
||||||
file_.write(json.dumps({'nr_word': tracker.nr_word,
|
|
||||||
'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
|
|
||||||
'epoch': epoch}) + '\n')
|
|
||||||
tracker.epoch_loss = 0.0
|
|
||||||
if texts_loc != '-':
|
|
||||||
texts = load_texts(texts_loc)
|
|
||||||
|
|
|
@ -6,45 +6,64 @@ from pathlib import Path
|
||||||
import ujson
|
import ujson
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
|
|
||||||
import spacy
|
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ..util import load_model
|
||||||
def read_inputs(loc):
|
|
||||||
if loc is None:
|
|
||||||
file_ = sys.stdin
|
|
||||||
file_ = (line.encode('utf8') for line in file_)
|
|
||||||
else:
|
|
||||||
file_ = Path(loc).open()
|
|
||||||
for line in file_:
|
|
||||||
data = ujson.loads(line)
|
|
||||||
text = data['text']
|
|
||||||
yield text
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model/language", "positional", None, str),
|
model=("Model to load", "positional", None, str),
|
||||||
inputs=("Location of input file", "positional", None, read_inputs))
|
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||||
def profile(lang, inputs=None):
|
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||||
|
)
|
||||||
|
def profile(model, inputs=None, n_texts=10000):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
|
Input should be formatted as one JSON object per line with a key "text".
|
||||||
|
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||||
|
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
|
if inputs is not None:
|
||||||
|
inputs = _read_inputs(inputs, msg)
|
||||||
if inputs is None:
|
if inputs is None:
|
||||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
n_inputs = 25000
|
||||||
inputs, _ = zip(*imdb_train)
|
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||||
inputs = inputs[:25000]
|
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||||
nlp = spacy.load(lang)
|
inputs, _ = zip(*imdb_train)
|
||||||
texts = list(cytoolz.take(10000, inputs))
|
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
inputs = inputs[:n_inputs]
|
||||||
"Profile.prof")
|
with msg.loading("Loading model '{}'...".format(model)):
|
||||||
|
nlp = load_model(model)
|
||||||
|
msg.good("Loaded model '{}'".format(model))
|
||||||
|
texts = list(cytoolz.take(n_texts, inputs))
|
||||||
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
|
msg.divider("Profile stats")
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp, texts):
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
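A hypothetical invocation of the command above (the model and file names are placeholders; it assumes profile is importable from spacy.cli). The input is a JSONL file of {"text": ...} objects, "-" for stdin, or nothing at all to fall back to the IMDB sample loaded via Thinc.

from spacy.cli import profile

# Same idea as: python -m spacy profile en_core_web_sm texts.jsonl -n 1000
profile("en_core_web_sm", "texts.jsonl", n_texts=1000)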
|
|
||||||
|
|
||||||
|
def _read_inputs(loc, msg):
|
||||||
|
if loc == "-":
|
||||||
|
msg.info("Reading input from sys.stdin")
|
||||||
|
file_ = sys.stdin
|
||||||
|
file_ = (line.encode("utf8") for line in file_)
|
||||||
|
else:
|
||||||
|
input_path = Path(loc)
|
||||||
|
if not input_path.exists() or not input_path.is_file():
|
||||||
|
msg.fail("Not a valid input data file", loc, exits=1)
|
||||||
|
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||||
|
file_ = input_path.open()
|
||||||
|
for line in file_:
|
||||||
|
data = ujson.loads(line)
|
||||||
|
text = data["text"]
|
||||||
|
yield text
|
||||||
|
|
51
spacy/cli/schemas/__init__.py
Normal file
51
spacy/cli/schemas/__init__.py
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from jsonschema import Draft4Validator
|
||||||
|
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...util import read_json
|
||||||
|
|
||||||
|
|
||||||
|
SCHEMAS = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_schema(name):
|
||||||
|
"""Get the JSON schema for a given name. Looks for a .json file in
|
||||||
|
spacy.cli.schemas, validates the schema and raises ValueError if not found.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> schema = get_schema('training')
|
||||||
|
|
||||||
|
name (unicode): The name of the schema.
|
||||||
|
RETURNS (dict): The JSON schema.
|
||||||
|
"""
|
||||||
|
if name not in SCHEMAS:
|
||||||
|
schema_path = Path(__file__).parent / "{}.json".format(name)
|
||||||
|
if not schema_path.exists():
|
||||||
|
raise ValueError(Errors.E104.format(name=name))
|
||||||
|
schema = read_json(schema_path)
|
||||||
|
# TODO: replace with (stable) Draft6Validator, if available
|
||||||
|
validator = Draft4Validator(schema)
|
||||||
|
validator.check_schema(schema)
|
||||||
|
SCHEMAS[name] = schema
|
||||||
|
return SCHEMAS[name]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_json(data, schema):
|
||||||
|
"""Validate data against a given JSON schema (see https://json-schema.org).
|
||||||
|
|
||||||
|
data: JSON-serializable data to validate.
|
||||||
|
schema (dict): The JSON schema.
|
||||||
|
RETURNS (list): A list of error messages, if available.
|
||||||
|
"""
|
||||||
|
validator = Draft4Validator(schema)
|
||||||
|
errors = []
|
||||||
|
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
|
||||||
|
if err.path:
|
||||||
|
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
|
||||||
|
else:
|
||||||
|
err_path = ""
|
||||||
|
errors.append(err.message + " " + err_path)
|
||||||
|
return errors
|
128
spacy/cli/schemas/meta.json
Normal file
128
spacy/cli/schemas/meta.json
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-06/schema",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"lang": {
|
||||||
|
"title": "Two-letter language code, e.g. 'en'",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 2,
|
||||||
|
"maxLength": 2,
|
||||||
|
"pattern": "^[a-z]*$"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"title": "Model name",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[a-z_]*$"
|
||||||
|
},
|
||||||
|
"version": {
|
||||||
|
"title": "Model version",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[0-9a-z.-]*$"
|
||||||
|
},
|
||||||
|
"spacy_version": {
|
||||||
|
"title": "Compatible spaCy version identifier",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[0-9a-z.-><=]*$"
|
||||||
|
},
|
||||||
|
"parent_package": {
|
||||||
|
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"default": "spacy"
|
||||||
|
},
|
||||||
|
"pipeline": {
|
||||||
|
"title": "Names of pipeline components",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": {
|
||||||
|
"title": "Model description",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"license": {
|
||||||
|
"title": "Model license",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"author": {
|
||||||
|
"title": "Model author name",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"email": {
|
||||||
|
"title": "Model author email",
|
||||||
|
"type": "string",
|
||||||
|
"format": "email"
|
||||||
|
},
|
||||||
|
"url": {
|
||||||
|
"title": "Model author URL",
|
||||||
|
"type": "string",
|
||||||
|
"format": "uri"
|
||||||
|
},
|
||||||
|
"sources": {
|
||||||
|
"title": "Training data sources",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"vectors": {
|
||||||
|
"title": "Included word vectors",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"keys": {
|
||||||
|
"title": "Number of unique keys",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"vectors": {
|
||||||
|
"title": "Number of unique vectors",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"width": {
|
||||||
|
"title": "Number of dimensions",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"accuracy": {
|
||||||
|
"title": "Accuracy numbers",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"speed": {
|
||||||
|
"title": "Speed evaluation numbers",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"lang",
|
||||||
|
"name",
|
||||||
|
"version"
|
||||||
|
]
|
||||||
|
}
|
146
spacy/cli/schemas/training.json
Normal file
146
spacy/cli/schemas/training.json
Normal file
|
@ -0,0 +1,146 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-06/schema",
|
||||||
|
"title": "Training data for spaCy models",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {
|
||||||
|
"title": "The text of the training example",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"ents": {
|
||||||
|
"title": "Named entity spans in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"label": {
|
||||||
|
"title": "Entity label",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[A-Z0-9]*$"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end",
|
||||||
|
"label"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sents": {
|
||||||
|
"title": "Sentence spans in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cats": {
|
||||||
|
"title": "Text categories for the text classifier",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"title": "A text category",
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"propertyNames": {
|
||||||
|
"pattern": "^[A-Z0-9]*$",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tokens": {
|
||||||
|
"title": "The tokens in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"minProperties": 1,
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"title": "Token ID, usually token index",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the token",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the token",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"title": "Coarse-grained part-of-speech tag",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"tag": {
|
||||||
|
"title": "Fine-grained part-of-speech tag",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"dep": {
|
||||||
|
"title": "Dependency label",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"head": {
|
||||||
|
"title": "Index of the token's head",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"_": {
|
||||||
|
"title": "Custom user space",
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"text"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -6,213 +6,296 @@ from pathlib import Path
|
||||||
import tqdm
|
import tqdm
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import json
|
|
||||||
import shutil
|
import shutil
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
|
from .._ml import create_default_optimizer
|
||||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
from ..util import prints, minibatch, minibatch_by_words
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
from .. import displacy
|
|
||||||
from ..compat import json_dumps
|
|
||||||
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
|
# at the beginning of training.
|
||||||
|
dropout_rates = util.decaying(
|
||||||
|
util.env_opt("dropout_from", 0.2),
|
||||||
|
util.env_opt("dropout_to", 0.2),
|
||||||
|
util.env_opt("dropout_decay", 0.0),
|
||||||
|
)
|
||||||
|
batch_sizes = util.compounding(
|
||||||
|
util.env_opt("batch_from", 1000),
|
||||||
|
util.env_opt("batch_to", 1000),
|
||||||
|
util.env_opt("batch_compound", 1.001),
|
||||||
|
)
|
||||||
|
|
||||||
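A small sketch of how the two hyper-parameter generators above are consumed: one value is drawn at a time during training, and util.env_opt lets each setting be overridden through an environment variable. The decay and compounding rules themselves live in spacy.util and are not shown in this hunk.

from spacy import util

dropout_rates = util.decaying(0.2, 0.2, 0.0)
batch_sizes = util.compounding(1000, 1000, 1.001)
dropout = next(dropout_rates)     # 0.2 with these defaults
batch_size = next(batch_sizes)    # 1000 with these defaults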
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model language", "positional", None, str),
|
lang=("Model language", "positional", None, str),
|
||||||
output_dir=("output directory to store model in", "positional", None, str),
|
output_path=("Output directory to store model in", "positional", None, Path),
|
||||||
train_data=("location of JSON-formatted training data", "positional",
|
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||||
None, str),
|
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||||
dev_data=("location of JSON-formatted development data (optional)",
|
base_model=("Name of model to update (optional)", "option", "b", str),
|
||||||
"positional", None, str),
|
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
vectors=("Model to load vectors from", "option", "v", str),
|
||||||
n_sents=("number of sentences", "option", "ns", int),
|
n_iter=("Number of iterations", "option", "n", int),
|
||||||
|
n_examples=("Number of examples", "option", "ns", int),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
vectors=("Model to load vectors from", "option", "v"),
|
|
||||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
|
||||||
no_parser=("Don't train parser", "flag", "P", bool),
|
|
||||||
no_entities=("Don't train NER", "flag", "N", bool),
|
|
||||||
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
|
|
||||||
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
|
|
||||||
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
|
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
|
||||||
version=("Model version", "option", "V", str),
|
version=("Model version", "option", "V", str),
|
||||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
||||||
"overwritten.", "option", "m", Path),
|
init_tok2vec=(
|
||||||
init_tok2vec=("Path to pretrained weights for the token-to-vector parts "
|
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||||
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
|
"option",
|
||||||
verbose=("Display more information for debug", "option", None, bool))
|
"t2v",
|
||||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
Path,
|
||||||
parser_multitasks='', entity_multitasks='', init_tok2vec=None,
|
),
|
||||||
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0,
|
parser_multitasks=(
|
||||||
no_parser=False, no_entities=False, gold_preproc=False,
|
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
|
||||||
version="0.0.0", meta_path=None, verbose=False):
|
"option",
|
||||||
|
"pt",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
entity_multitasks=(
|
||||||
|
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
|
||||||
|
"option",
|
||||||
|
"et",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
|
||||||
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
|
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
|
||||||
|
verbose=("Display more information for debug", "flag", "VV", bool),
|
||||||
|
debug=("Run data diagnostics before training", "flag", "D", bool),
|
||||||
|
)
|
||||||
|
def train(
|
||||||
|
lang,
|
||||||
|
output_path,
|
||||||
|
train_path,
|
||||||
|
dev_path,
|
||||||
|
base_model=None,
|
||||||
|
pipeline="tagger,parser,ner",
|
||||||
|
vectors=None,
|
||||||
|
n_iter=30,
|
||||||
|
n_examples=0,
|
||||||
|
use_gpu=-1,
|
||||||
|
version="0.0.0",
|
||||||
|
meta_path=None,
|
||||||
|
init_tok2vec=None,
|
||||||
|
parser_multitasks="",
|
||||||
|
entity_multitasks="",
|
||||||
|
noise_level=0.0,
|
||||||
|
gold_preproc=False,
|
||||||
|
learn_tokens=False,
|
||||||
|
verbose=False,
|
||||||
|
debug=False,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||||
|
JSON format. To convert data from other formats, use the `spacy convert`
|
||||||
|
command.
|
||||||
"""
|
"""
|
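For orientation, a minimal sketch of calling the refactored entry point directly from Python, roughly equivalent to running "python -m spacy train ..." with the options annotated above. The paths, data files and values below are placeholders, not part of this commit.

    from pathlib import Path
    from spacy.cli import train

    train(
        "en",                      # lang
        Path("/tmp/models"),       # output_path
        Path("train.json"),        # train_path: spaCy-JSON training data
        Path("dev.json"),          # dev_path: spaCy-JSON development data
        pipeline="tagger,parser",  # train only these components
        n_iter=10,
    )

On the command line the same settings map to the short flags declared in the annotations, e.g. -p for the pipeline, -b for a base model to update and -n for the number of iterations.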
||||||
|
msg = Printer()
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
util.set_env_log(True)
|
util.set_env_log(verbose)
|
||||||
n_sents = n_sents or None
|
|
||||||
output_path = util.ensure_path(output_dir)
|
# Make sure all files and paths exist if they are needed
|
||||||
train_path = util.ensure_path(train_data)
|
train_path = util.ensure_path(train_path)
|
||||||
dev_path = util.ensure_path(dev_data)
|
dev_path = util.ensure_path(dev_path)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not train_path.exists():
|
if not train_path or not train_path.exists():
|
||||||
prints(train_path, title=Messages.M050, exits=1)
|
msg.fail(Messages.M050, train_path, exits=1)
|
||||||
if dev_path and not dev_path.exists():
|
if not dev_path or not dev_path.exists():
|
||||||
prints(dev_path, title=Messages.M051, exits=1)
|
msg.fail(Messages.M051, dev_path, exits=1)
|
||||||
if meta_path is not None and not meta_path.exists():
|
if meta_path is not None and not meta_path.exists():
|
||||||
prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
meta = util.read_json(meta_path) if meta_path else {}
|
meta = util.read_json(meta_path) if meta_path else {}
|
||||||
if not isinstance(meta, dict):
|
if not isinstance(meta, dict):
|
||||||
prints(Messages.M053.format(meta_type=type(meta)),
|
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
||||||
title=Messages.M052, exits=1)
|
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||||
meta.setdefault('lang', lang)
|
msg.fail(Messages.M062, Messages.M065)
|
||||||
meta.setdefault('name', 'unnamed')
|
|
||||||
|
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
|
|
||||||
print("Counting training words (limit=%s" % n_sents)
|
# Set up the base model and pipeline. If a base model is specified, load
|
||||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
# the model and make sure the pipeline matches the pipeline setting. If
|
||||||
n_train_words = corpus.count_train()
|
# training starts from a blank model, initialize the language class.
|
||||||
print(n_train_words)
|
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
msg.text(Messages.M055.format(pipeline=pipeline))
|
||||||
if no_tagger and 'tagger' in pipeline:
|
if base_model:
|
||||||
pipeline.remove('tagger')
|
msg.text(Messages.M056.format(model=base_model))
|
||||||
if no_parser and 'parser' in pipeline:
|
nlp = util.load_model(base_model)
|
||||||
pipeline.remove('parser')
|
if nlp.lang != lang:
|
||||||
if no_entities and 'ner' in pipeline:
|
msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
|
||||||
pipeline.remove('ner')
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
|
||||||
|
nlp.disable_pipes(*other_pipes)
|
||||||
|
for pipe in pipeline:
|
||||||
|
if pipe not in nlp.pipe_names:
|
||||||
|
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||||
|
else:
|
||||||
|
msg.text(Messages.M057.format(model=lang))
|
||||||
|
lang_cls = util.get_lang_class(lang)
|
||||||
|
nlp = lang_cls()
|
||||||
|
for pipe in pipeline:
|
||||||
|
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||||
|
|
||||||
|
if learn_tokens:
|
||||||
|
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
||||||
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
# at the beginning of training.
|
# at the beginning of training.
|
||||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
|
dropout_rates = util.decaying(
|
||||||
util.env_opt('dropout_to', 0.1),
|
util.env_opt("dropout_from", 0.1),
|
||||||
util.env_opt('dropout_decay', 0.0))
|
util.env_opt("dropout_to", 0.1),
|
||||||
batch_sizes = util.compounding(util.env_opt('batch_from', 750),
|
util.env_opt("dropout_decay", 0.0),
|
||||||
util.env_opt('batch_to', 750),
|
)
|
||||||
util.env_opt('batch_compound', 1.001))
|
batch_sizes = util.compounding(
|
||||||
|
util.env_opt("batch_from", 750),
|
||||||
|
util.env_opt("batch_to", 750),
|
||||||
|
util.env_opt("batch_compound", 1.001),
|
||||||
|
)
|
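Both schedules are infinite generators that the loop advances with next(). Below is a simplified sketch of what util.decaying and util.compounding yield; this is not the actual implementation, which also honours the env-var overrides read above.

    import itertools

    def decaying(start, stop, decay):
        # Simplified sketch: linear decay from start towards a floor of stop.
        for step in itertools.count():
            yield max(stop, start - decay * step)

    def compounding(start, stop, compound):
        # Simplified sketch (growing case): multiply by a compounding factor,
        # capped at stop.
        value = float(start)
        while True:
            yield min(stop, value)
            value *= compound

    dropout_rates = decaying(0.1, 0.1, 0.0)         # constant 0.1 with these defaults
    batch_sizes = compounding(750.0, 750.0, 1.001)  # likewise constant 750
    print(next(dropout_rates), next(batch_sizes))

Note that with the defaults in this hunk both schedules are effectively constant; the comment about batch sizes starting at 1 and growing only applies when batch_from and batch_to differ.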
||||||
lang_class = util.get_lang_class(lang)
|
lang_class = util.get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
meta['pipeline'] = pipeline
|
meta["pipeline"] = pipeline
|
||||||
nlp.meta.update(meta)
|
nlp.meta.update(meta)
|
||||||
if vectors:
|
if vectors:
|
||||||
print("Load vectors model", vectors)
|
msg.text(Messages.M058.format(model=vectors))
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
_load_vectors(nlp, vectors)
|
||||||
for lex in nlp.vocab:
|
|
||||||
values = {}
|
# Multitask objectives
|
||||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
||||||
# These attrs are expected to be set by data. Others should
|
for pipe_name, multitasks in multitask_options:
|
||||||
# be set by calling the language functions.
|
if multitasks:
|
||||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
if pipe_name not in pipeline:
|
||||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
msg.fail(Messages.M059.format(pipe=pipe_name))
|
||||||
lex.set_attrs(**values)
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
lex.is_oov = False
|
for objective in multitasks.split(","):
|
||||||
for name in pipeline:
|
pipe.add_multitask_objective(objective)
|
||||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
|
||||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
# Prepare training corpus
|
||||||
if parser_multitasks:
|
msg.text(Messages.M060.format(limit=n_examples))
|
||||||
for objective in parser_multitasks.split(','):
|
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
|
||||||
nlp.parser.add_multitask_objective(objective)
|
n_train_words = corpus.count_train()
|
||||||
if entity_multitasks:
|
|
||||||
for objective in entity_multitasks.split(','):
|
if base_model:
|
||||||
nlp.entity.add_multitask_objective(objective)
|
# Start with an existing model, use default optimizer
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
optimizer = create_default_optimizer(Model.ops)
|
||||||
if init_tok2vec is not None:
|
else:
|
||||||
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
# Start with a blank model, call begin_training
|
||||||
print("Loaded pretrained tok2vec for:", loaded)
|
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||||
nlp._optimizer = None
|
nlp._optimizer = None
|
||||||
|
|
||||||
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
|
# Load in pre-trained weights
|
||||||
|
if init_tok2vec is not None:
|
||||||
|
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||||
|
msg.text(Messages.M071.format(components=components))
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
|
train_docs = corpus.train_docs(
|
||||||
gold_preproc=gold_preproc, max_length=0)
|
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
|
||||||
|
)
|
||||||
words_seen = 0
|
words_seen = 0
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
losses = {}
|
losses = {}
|
||||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
|
||||||
if not batch:
|
if not batch:
|
||||||
continue
|
continue
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
nlp.update(docs, golds, sgd=optimizer,
|
nlp.update(
|
||||||
drop=next(dropout_rates), losses=losses)
|
docs,
|
||||||
|
golds,
|
||||||
|
sgd=optimizer,
|
||||||
|
drop=next(dropout_rates),
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
pbar.update(sum(len(doc) for doc in docs))
|
||||||
words_seen += sum(len(doc) for doc in docs)
|
words_seen += sum(len(doc) for doc in docs)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
util.set_env_log(False)
|
util.set_env_log(False)
|
||||||
epoch_model_path = output_path / ('model%d' % i)
|
epoch_model_path = output_path / ("model%d" % i)
|
||||||
nlp.to_disk(epoch_model_path)
|
nlp.to_disk(epoch_model_path)
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
dev_docs = list(corpus.dev_docs(
|
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
|
||||||
nlp_loaded,
|
|
||||||
gold_preproc=gold_preproc))
|
|
||||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
scorer = nlp_loaded.evaluate(dev_docs, debug)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
if use_gpu < 0:
|
if use_gpu < 0:
|
||||||
gpu_wps = None
|
gpu_wps = None
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
else:
|
else:
|
||||||
gpu_wps = nwords/(end_time-start_time)
|
gpu_wps = nwords / (end_time - start_time)
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device("cpu"):
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
dev_docs = list(corpus.dev_docs(
|
dev_docs = list(
|
||||||
nlp_loaded, gold_preproc=gold_preproc))
|
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
|
||||||
|
)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs)
|
scorer = nlp_loaded.evaluate(dev_docs)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
||||||
with acc_loc.open('w') as file_:
|
util.write_json(acc_loc, scorer.scores)
|
||||||
file_.write(json_dumps(scorer.scores))
|
|
||||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
|
||||||
meta['accuracy'] = scorer.scores
|
|
||||||
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
|
|
||||||
'gpu': gpu_wps}
|
|
||||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
|
||||||
'vectors': len(nlp.vocab.vectors),
|
|
||||||
'keys': nlp.vocab.vectors.n_keys}
|
|
||||||
meta['lang'] = nlp.lang
|
|
||||||
meta['pipeline'] = pipeline
|
|
||||||
meta['spacy_version'] = '>=%s' % about.__version__
|
|
||||||
meta.setdefault('name', 'model%d' % i)
|
|
||||||
meta.setdefault('version', version)
|
|
||||||
|
|
||||||
with meta_loc.open('w') as file_:
|
# Update model meta.json
|
||||||
file_.write(json_dumps(meta))
|
meta["lang"] = nlp.lang
|
||||||
util.set_env_log(True)
|
meta["pipeline"] = nlp.pipe_names
|
||||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
meta["spacy_version"] = ">=%s" % about.__version__
|
||||||
gpu_wps=gpu_wps)
|
meta["accuracy"] = scorer.scores
|
||||||
|
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
|
||||||
|
meta["vectors"] = {
|
||||||
|
"width": nlp.vocab.vectors_length,
|
||||||
|
"vectors": len(nlp.vocab.vectors),
|
||||||
|
"keys": nlp.vocab.vectors.n_keys,
|
||||||
|
}
|
||||||
|
meta.setdefault("name", "model%d" % i)
|
||||||
|
meta.setdefault("version", version)
|
||||||
|
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
||||||
|
util.write_json(meta_loc, meta)
|
||||||
|
|
||||||
|
util.set_env_log(verbose)
|
||||||
|
|
||||||
|
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||||
finally:
|
finally:
|
||||||
print("Saving model...")
|
with msg.loading(Messages.M061):
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
final_model_path = output_path / 'model-final'
|
final_model_path = output_path / "model-final"
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
components = []
|
msg.good(Messages.M066, util.path2str(final_model_path))
|
||||||
if not no_parser:
|
|
||||||
components.append('parser')
|
_collate_best_model(meta, output_path, nlp.pipe_names)
|
||||||
if not no_tagger:
|
|
||||||
components.append('tagger')
|
|
||||||
if not no_entities:
|
def _load_vectors(nlp, vectors):
|
||||||
components.append('ner')
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
_collate_best_model(meta, output_path, components)
|
for lex in nlp.vocab:
|
||||||
|
values = {}
|
||||||
|
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||||
|
# These attrs are expected to be set by data. Others should
|
||||||
|
# be set by calling the language functions.
|
||||||
|
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
||||||
|
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||||
|
lex.set_attrs(**values)
|
||||||
|
lex.is_oov = False
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
def _load_pretrained_tok2vec(nlp, loc):
|
||||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
with loc.open('rb') as file_:
|
with loc.open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
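A hedged sketch of calling this helper on its own, purely to show the call shape. The model name and weights path are placeholders; the weights file is assumed to be the output of 'spacy pretrain'.

    from pathlib import Path
    import spacy

    nlp = spacy.load("en_core_web_sm")  # placeholder pipeline with tok2vec layers
    components = _load_pretrained_tok2vec(nlp, Path("pretrain_weights.bin"))
    print("Loaded pretrained tok2vec weights into:", components)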
||||||
|
@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
|
||||||
bests = {}
|
bests = {}
|
||||||
for component in components:
|
for component in components:
|
||||||
bests[component] = _find_best(output_path, component)
|
bests[component] = _find_best(output_path, component)
|
||||||
best_dest = output_path / 'model-best'
|
best_dest = output_path / "model-best"
|
||||||
shutil.copytree(output_path / 'model-final', best_dest)
|
shutil.copytree(output_path / "model-final", best_dest)
|
||||||
for component, best_component_src in bests.items():
|
for component, best_component_src in bests.items():
|
||||||
shutil.rmtree(best_dest / component)
|
shutil.rmtree(best_dest / component)
|
||||||
shutil.copytree(best_component_src / component, best_dest / component)
|
shutil.copytree(best_component_src / component, best_dest / component)
|
||||||
with (best_component_src / 'accuracy.json').open() as file_:
|
accs = util.read_json(best_component_src / "accuracy.json")
|
||||||
accs = json.load(file_)
|
|
||||||
for metric in _get_metrics(component):
|
for metric in _get_metrics(component):
|
||||||
meta['accuracy'][metric] = accs[metric]
|
meta["accuracy"][metric] = accs[metric]
|
||||||
with (best_dest / 'meta.json').open('w') as file_:
|
util.write_json(best_dest / "meta.json", meta)
|
||||||
file_.write(json_dumps(meta))
|
|
||||||
|
|
||||||
|
|
||||||
def _find_best(experiment_dir, component):
|
def _find_best(experiment_dir, component):
|
||||||
accuracies = []
|
accuracies = []
|
||||||
for epoch_model in experiment_dir.iterdir():
|
for epoch_model in experiment_dir.iterdir():
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||||
accs = json.load((epoch_model / "accuracy.json").open())
|
accs = util.read_json(epoch_model / "accuracy.json")
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||||
accuracies.append((scores, epoch_model))
|
accuracies.append((scores, epoch_model))
|
||||||
if accuracies:
|
if accuracies:
|
||||||
|
@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _get_metrics(component):
|
def _get_metrics(component):
|
||||||
if component == "parser":
|
if component == "parser":
|
||||||
return ("las", "uas", "token_acc")
|
return ("las", "uas", "token_acc")
|
||||||
|
@ -257,50 +339,40 @@ def _get_metrics(component):
|
||||||
return ("token_acc",)
|
return ("token_acc",)
|
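To make the collation concrete, a small hedged illustration of the directory layout these helpers assume; the file names come from the code above, the scores are invented.

    from pathlib import Path

    # Given an output directory such as
    #   output/
    #     model0/accuracy.json   -> {"las": 80.1, "uas": 83.0, "token_acc": 99.1, ...}
    #     model1/accuracy.json   -> {"las": 82.4, "uas": 84.9, "token_acc": 99.1, ...}
    #     model-final/
    # _find_best(Path("output"), "parser") compares the (las, uas, token_acc)
    # tuples per epoch directory and, given the selection line elided from this
    # hunk, should return the path to model1. _collate_best_model then copies
    # model-final to model-best, swaps in model1's "parser" subdirectory and
    # merges its parser metrics into meta["accuracy"].
    best_parser_dir = _find_best(Path("output"), "parser")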
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
|
||||||
with Path('/tmp/entities.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='ent', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
||||||
scores = {}
|
scores = {}
|
||||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
for col in [
|
||||||
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
|
"dep_loss",
|
||||||
|
"tag_loss",
|
||||||
|
"uas",
|
||||||
|
"tags_acc",
|
||||||
|
"token_acc",
|
||||||
|
"ents_p",
|
||||||
|
"ents_r",
|
||||||
|
"ents_f",
|
||||||
|
"cpu_wps",
|
||||||
|
"gpu_wps",
|
||||||
|
]:
|
||||||
scores[col] = 0.0
|
scores[col] = 0.0
|
||||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||||
scores.update(dev_scores)
|
scores.update(dev_scores)
|
||||||
scores['cpu_wps'] = cpu_wps
|
scores["cpu_wps"] = cpu_wps
|
||||||
scores['gpu_wps'] = gpu_wps or 0.0
|
scores["gpu_wps"] = gpu_wps or 0.0
|
||||||
tpl = ''.join((
|
tpl = "".join(
|
||||||
'{:<6d}',
|
(
|
||||||
'{dep_loss:<10.3f}',
|
"{:<6d}",
|
||||||
'{ner_loss:<10.3f}',
|
"{dep_loss:<10.3f}",
|
||||||
'{uas:<8.3f}',
|
"{ner_loss:<10.3f}",
|
||||||
'{ents_p:<8.3f}',
|
"{uas:<8.3f}",
|
||||||
'{ents_r:<8.3f}',
|
"{ents_p:<8.3f}",
|
||||||
'{ents_f:<8.3f}',
|
"{ents_r:<8.3f}",
|
||||||
'{tags_acc:<8.3f}',
|
"{ents_f:<8.3f}",
|
||||||
'{token_acc:<9.3f}',
|
"{tags_acc:<8.3f}",
|
||||||
'{cpu_wps:<9.1f}',
|
"{token_acc:<9.3f}",
|
||||||
'{gpu_wps:.1f}',
|
"{cpu_wps:<9.1f}",
|
||||||
))
|
"{gpu_wps:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **scores))
|
print(tpl.format(itn, **scores))
|
||||||
|
|
||||||
|
|
||||||
def print_results(scorer):
|
|
||||||
results = {
|
|
||||||
'TOK': '%.2f' % scorer.token_acc,
|
|
||||||
'POS': '%.2f' % scorer.tags_acc,
|
|
||||||
'UAS': '%.2f' % scorer.uas,
|
|
||||||
'LAS': '%.2f' % scorer.las,
|
|
||||||
'NER P': '%.2f' % scorer.ents_p,
|
|
||||||
'NER R': '%.2f' % scorer.ents_r,
|
|
||||||
'NER F': '%.2f' % scorer.ents_f}
|
|
||||||
util.print_table(results, title="Results")
|
|
||||||
|
|
2
spacy/cli/ud/__init__.py
Normal file
2
spacy/cli/ud/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
|
||||||
|
from .ud_train import main as ud_train # noqa: F401
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# flake8: noqa
|
||||||
|
|
||||||
# CoNLL 2017 UD Parsing evaluation script.
|
# CoNLL 2017 UD Parsing evaluation script.
|
||||||
#
|
#
|
||||||
|
@ -214,7 +215,7 @@ def load_conllu(file):
|
||||||
start, end = map(int, columns[ID].split("-"))
|
start, end = map(int, columns[ID].split("-"))
|
||||||
except:
|
except:
|
||||||
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
||||||
|
|
||||||
for _ in range(start, end + 1):
|
for _ in range(start, end + 1):
|
||||||
word_line = file.readline().rstrip("\r\n")
|
word_line = file.readline().rstrip("\r\n")
|
||||||
word_columns = word_line.split("\t")
|
word_columns = word_line.split("\t")
|
|
@ -1,7 +1,9 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
# flake8: noqa
|
||||||
|
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
.conllu format for development data, allowing the official scorer to be used.
|
||||||
'''
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -11,15 +13,17 @@ import json
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
from ..tokens import Token, Doc
|
from ...tokens import Token, Doc
|
||||||
from ..gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ..util import compounding, minibatch_by_words
|
from ...util import compounding, minibatch_by_words
|
||||||
from ..syntax.nonproj import projectivize
|
from ...syntax.nonproj import projectivize
|
||||||
from ..matcher import Matcher
|
from ...matcher import Matcher
|
||||||
#from ..morphology import Fused_begin, Fused_inside
|
|
||||||
from .. import displacy
|
# from ...morphology import Fused_begin, Fused_inside
|
||||||
|
from ... import displacy
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
Fused_begin = None
|
Fused_begin = None
|
||||||
Fused_inside = None
|
Fused_inside = None
|
||||||
|
|
||||||
|
@ -30,43 +34,45 @@ import cytoolz
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
from . import conll17_ud_eval
|
||||||
|
|
||||||
from .. import lang
|
from ... import lang
|
||||||
from .. import lang
|
from ...lang import zh
|
||||||
from ..lang import zh
|
from ...lang import ja
|
||||||
from ..lang import ja
|
from ...lang import ru
|
||||||
from ..lang import ru
|
|
||||||
|
|
||||||
|
|
||||||
################
|
################
|
||||||
# Data reading #
|
# Data reading #
|
||||||
################
|
################
|
||||||
|
|
||||||
space_re = re.compile('\s+')
|
space_re = re.compile("\s+")
|
||||||
|
|
||||||
|
|
||||||
def split_text(text):
|
def split_text(text):
|
||||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||||
|
|
||||||
|
|
||||||
##############
|
##############
|
||||||
# Evaluation #
|
# Evaluation #
|
||||||
##############
|
##############
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
|
def read_conllu(file_):
|
||||||
docs = []
|
docs = []
|
||||||
sent = []
|
sent = []
|
||||||
doc = []
|
doc = []
|
||||||
for line in file_:
|
for line in file_:
|
||||||
if line.startswith('# newdoc'):
|
if line.startswith("# newdoc"):
|
||||||
if doc:
|
if doc:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
doc = []
|
doc = []
|
||||||
elif line.startswith('#'):
|
elif line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
elif not line.strip():
|
elif not line.strip():
|
||||||
if sent:
|
if sent:
|
||||||
doc.append(sent)
|
doc.append(sent)
|
||||||
sent = []
|
sent = []
|
||||||
else:
|
else:
|
||||||
sent.append(list(line.strip().split('\t')))
|
sent.append(list(line.strip().split("\t")))
|
||||||
if len(sent[-1]) != 10:
|
if len(sent[-1]) != 10:
|
||||||
print(repr(line))
|
print(repr(line))
|
||||||
raise ValueError
|
raise ValueError
|
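A tiny hedged example of the nested structure read_conllu builds: a list of documents, each a list of sentences, each a list of 10-column rows. The sample below is invented; whether the trailing document is also appended depends on the tail of the function, which is elided from this hunk.

    from io import StringIO

    sample = (
        "# newdoc id = doc1\n"
        "# sent_id = 1\n"
        "1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\t_\n"
        "2\tworld\tworld\tNOUN\tNN\t_\t1\tvocative\t_\t_\n"
        "\n"
        "# newdoc id = doc2\n"
        "# sent_id = 2\n"
        "1\tHi\thi\tINTJ\tUH\t_\t0\troot\t_\t_\n"
        "\n"
    )
    docs = read_conllu(StringIO(sample))
    # The first document is closed off as soon as the second "# newdoc" is seen.
    print(len(docs[0]))        # 1 sentence
    print(len(docs[0][0]))     # 2 rows
    print(len(docs[0][0][0]))  # 10 columns per row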
||||||
|
@ -78,7 +84,7 @@ def read_conllu(file_):
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
if text_loc.parts[-1].endswith('.conllu'):
|
if text_loc.parts[-1].endswith(".conllu"):
|
||||||
docs = []
|
docs = []
|
||||||
with text_loc.open() as file_:
|
with text_loc.open() as file_:
|
||||||
for conllu_doc in read_conllu(file_):
|
for conllu_doc in read_conllu(file_):
|
||||||
|
@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
docs = list(component.pipe(docs))
|
docs = list(component.pipe(docs))
|
||||||
else:
|
else:
|
||||||
with text_loc.open('r', encoding='utf8') as text_file:
|
with text_loc.open("r", encoding="utf8") as text_file:
|
||||||
texts = split_text(text_file.read())
|
texts = split_text(text_file.read())
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts))
|
||||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||||
write_conllu(docs, out_file)
|
write_conllu(docs, out_file)
|
||||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||||
return docs, scores
|
return docs, scores
|
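A hedged sketch of driving this evaluation round-trip by hand; the model path and treebank files are placeholders for CoNLL 2017 shared-task data.

    from pathlib import Path
    import spacy

    nlp = spacy.load("/path/to/experiments/UD_English/best-model")  # placeholder
    docs, scores = evaluate(
        nlp,
        Path("en-ud-test.txt"),         # raw text (a .conllu path would be parsed instead)
        Path("en-ud-test.conllu"),      # gold annotations
        Path("en-ud-test-sys.conllu"),  # where the system output is written
    )
    print(scores["LAS"].f1, scores["UAS"].f1)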
||||||
|
@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
# TODO: This shouldn't be necessary? Should be handled in merge
|
# TODO: This shouldn't be necessary? Should be handled in merge
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i == word.head.i:
|
if word.i == word.head.i:
|
||||||
word.dep_ = 'ROOT'
|
word.dep_ = "ROOT"
|
||||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
|
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
|
||||||
file_.write('\n')
|
file_.write("\n")
|
||||||
for word in sent:
|
for word in sent:
|
||||||
if word.head.i == word.i and word.dep_ == 'ROOT':
|
if word.head.i == word.i and word.dep_ == "ROOT":
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print("Rootless sentence!")
|
print("Rootless sentence!")
|
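The merge at the top of this function undoes the parser's subtoken splits. Repackaged as a hedged standalone helper, with the reason for collecting character offsets first spelled out:

    from spacy.matcher import Matcher

    def merge_subtok_spans(doc):
        # Find maximal runs of tokens attached with the "subtok" dependency
        # (the label used for split subtokens) and merge each run back into a
        # single token, mirroring the code above.
        merger = Matcher(doc.vocab)
        merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        # Character offsets stay valid while token indices shift during merging,
        # which is why the offsets are collected before any merge happens.
        offsets = [(span.start_char, span.end_char) for span in spans]
        for start_char, end_char in offsets:
            doc.merge(start_char, end_char)
        return doc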
||||||
|
@ -134,24 +140,34 @@ def write_conllu(docs, file_):
|
||||||
|
|
||||||
|
|
||||||
def _get_token_conllu(token, k, sent_len):
|
def _get_token_conllu(token, k, sent_len):
|
||||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||||
n = 1
|
n = 1
|
||||||
text = [token.text]
|
text = [token.text]
|
||||||
while token.nbor(n).check_morph(Fused_inside):
|
while token.nbor(n).check_morph(Fused_inside):
|
||||||
text.append(token.nbor(n).text)
|
text.append(token.nbor(n).text)
|
||||||
n += 1
|
n += 1
|
||||||
id_ = '%d-%d' % (k+1, (k+n))
|
id_ = "%d-%d" % (k + 1, (k + n))
|
||||||
fields = [id_, ''.join(text)] + ['_'] * 8
|
fields = [id_, "".join(text)] + ["_"] * 8
|
||||||
lines = ['\t'.join(fields)]
|
lines = ["\t".join(fields)]
|
||||||
else:
|
else:
|
||||||
lines = []
|
lines = []
|
||||||
if token.head.i == token.i:
|
if token.head.i == token.i:
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = k + (token.head.i - token.i) + 1
|
head = k + (token.head.i - token.i) + 1
|
||||||
fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
fields = [
|
||||||
str(head), token.dep_.lower(), '_', '_']
|
str(k + 1),
|
||||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
token.text,
|
||||||
|
token.lemma_,
|
||||||
|
token.pos_,
|
||||||
|
token.tag_,
|
||||||
|
"_",
|
||||||
|
str(head),
|
||||||
|
token.dep_.lower(),
|
||||||
|
"_",
|
||||||
|
"_",
|
||||||
|
]
|
||||||
|
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||||
if k == 0:
|
if k == 0:
|
||||||
fields[1] = token.norm_[0].upper() + token.norm_[1:]
|
fields[1] = token.norm_[0].upper() + token.norm_[1:]
|
||||||
else:
|
else:
|
||||||
|
@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):
|
||||||
split_end = token._.split_end
|
split_end = token._.split_end
|
||||||
split_len = (split_end.i - split_start.i) + 1
|
split_len = (split_end.i - split_start.i) + 1
|
||||||
n_in_split = token.i - split_start.i
|
n_in_split = token.i - split_start.i
|
||||||
subtokens = guess_fused_orths(split_start.text, [''] * split_len)
|
subtokens = guess_fused_orths(split_start.text, [""] * split_len)
|
||||||
fields[1] = subtokens[n_in_split]
|
fields[1] = subtokens[n_in_split]
|
||||||
|
|
||||||
lines.append('\t'.join(fields))
|
lines.append("\t".join(fields))
|
||||||
return '\n'.join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def guess_fused_orths(word, ud_forms):
|
def guess_fused_orths(word, ud_forms):
|
||||||
'''The UD data 'fused tokens' don't necessarily expand to keys that match
|
"""The UD data 'fused tokens' don't necessarily expand to keys that match
|
||||||
the form. We need orths that exactly match the string. Here we make a best
|
the form. We need orths that exactly match the string. Here we make a best
|
||||||
effort to divide up the word.'''
|
effort to divide up the word."""
|
||||||
if word == ''.join(ud_forms):
|
if word == "".join(ud_forms):
|
||||||
# Happy case: we get a perfect split, with each letter accounted for.
|
# Happy case: we get a perfect split, with each letter accounted for.
|
||||||
return ud_forms
|
return ud_forms
|
||||||
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
|
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
|
||||||
|
@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):
|
||||||
remain = word
|
remain = word
|
||||||
for subtoken in ud_forms:
|
for subtoken in ud_forms:
|
||||||
assert len(subtoken) >= 1
|
assert len(subtoken) >= 1
|
||||||
output.append(remain[:len(subtoken)])
|
output.append(remain[: len(subtoken)])
|
||||||
remain = remain[len(subtoken):]
|
remain = remain[len(subtoken) :]
|
||||||
assert len(remain) == 0, (word, ud_forms, remain)
|
assert len(remain) == 0, (word, ud_forms, remain)
|
||||||
return output
|
return output
|
||||||
else:
|
else:
|
||||||
# Let's say word is 6 long, and there are three subtokens. The orths
|
# Let's say word is 6 long, and there are three subtokens. The orths
|
||||||
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
|
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
|
||||||
first = word[:len(word)-(len(ud_forms)-1)]
|
first = word[: len(word) - (len(ud_forms) - 1)]
|
||||||
output = [first]
|
output = [first]
|
||||||
remain = word[len(first):]
|
remain = word[len(first) :]
|
||||||
for i in range(1, len(ud_forms)):
|
for i in range(1, len(ud_forms)):
|
||||||
assert remain
|
assert remain
|
||||||
output.append(remain[:1])
|
output.append(remain[:1])
|
||||||
|
@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms):
|
||||||
return output
|
return output
|
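A couple of hedged examples of how the branches above behave; the inputs are artificial and chosen only to exercise each branch.

    # Branch 1: the subtoken forms concatenate exactly to the surface string.
    assert guess_fused_orths("dela", ["de", "la"]) == ["de", "la"]
    # Branch 2: the letters differ but the lengths add up, so the surface
    # string is sliced into chunks of the subtoken lengths.
    assert guess_fused_orths("abcd", ["ax", "cd"]) == ["ab", "cd"]
    # Branch 3 (lengths don't add up) keeps a long first chunk and then, given
    # the tail of that branch elided from this hunk, one character per
    # remaining subtoken, e.g. "vamos" with ["vamos", "nos"] -> ["vamo", "s"].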
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(name, ud_scores):
|
def print_results(name, ud_scores):
|
||||||
fields = {}
|
fields = {}
|
||||||
if ud_scores is not None:
|
if ud_scores is not None:
|
||||||
fields.update({
|
fields.update(
|
||||||
'words': ud_scores['Words'].f1 * 100,
|
{
|
||||||
'sents': ud_scores['Sentences'].f1 * 100,
|
"words": ud_scores["Words"].f1 * 100,
|
||||||
'tags': ud_scores['XPOS'].f1 * 100,
|
"sents": ud_scores["Sentences"].f1 * 100,
|
||||||
'uas': ud_scores['UAS'].f1 * 100,
|
"tags": ud_scores["XPOS"].f1 * 100,
|
||||||
'las': ud_scores['LAS'].f1 * 100,
|
"uas": ud_scores["UAS"].f1 * 100,
|
||||||
})
|
"las": ud_scores["LAS"].f1 * 100,
|
||||||
|
}
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
fields.update({
|
fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0})
|
||||||
'words': 0.0,
|
tpl = "\t".join(
|
||||||
'sents': 0.0,
|
(name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
|
||||||
'tags': 0.0,
|
)
|
||||||
'uas': 0.0,
|
|
||||||
'las': 0.0
|
|
||||||
})
|
|
||||||
tpl = '\t'.join((
|
|
||||||
name,
|
|
||||||
'{las:.1f}',
|
|
||||||
'{uas:.1f}',
|
|
||||||
'{tags:.1f}',
|
|
||||||
'{sents:.1f}',
|
|
||||||
'{words:.1f}',
|
|
||||||
))
|
|
||||||
print(tpl.format(**fields))
|
print(tpl.format(**fields))
|
||||||
return fields
|
return fields
|
||||||
|
|
||||||
|
|
||||||
def get_token_split_start(token):
|
def get_token_split_start(token):
|
||||||
if token.text == '':
|
if token.text == "":
|
||||||
assert token.i != 0
|
assert token.i != 0
|
||||||
i = -1
|
i = -1
|
||||||
while token.nbor(i).text == '':
|
while token.nbor(i).text == "":
|
||||||
i -= 1
|
i -= 1
|
||||||
return token.nbor(i)
|
return token.nbor(i)
|
||||||
elif (token.i+1) < len(token.doc) and token.nbor(1).text == '':
|
elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "":
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_token_split_end(token):
|
def get_token_split_end(token):
|
||||||
if (token.i+1) == len(token.doc):
|
if (token.i + 1) == len(token.doc):
|
||||||
return token if token.text == '' else None
|
return token if token.text == "" else None
|
||||||
elif token.text != '' and token.nbor(1).text != '':
|
elif token.text != "" and token.nbor(1).text != "":
|
||||||
return None
|
return None
|
||||||
i = 1
|
i = 1
|
||||||
while (token.i+i) < len(token.doc) and token.nbor(i).text == '':
|
while (token.i + i) < len(token.doc) and token.nbor(i).text == "":
|
||||||
i += 1
|
i += 1
|
||||||
return token.nbor(i-1)
|
return token.nbor(i - 1)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
# Initialization #
|
# Initialization #
|
||||||
|
@ -262,54 +268,73 @@ def get_token_split_end(token):
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(experiments_dir, corpus):
|
def load_nlp(experiments_dir, corpus):
|
||||||
nlp = spacy.load(experiments_dir / corpus / 'best-model')
|
nlp = spacy.load(experiments_dir / corpus / "best-model")
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path),
|
test_data_dir=(
|
||||||
|
"Path to Universal Dependencies test data",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
Path,
|
||||||
|
),
|
||||||
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
||||||
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
|
corpus=(
|
||||||
|
"UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def main(test_data_dir, experiment_dir, corpus):
|
def main(test_data_dir, experiment_dir, corpus):
|
||||||
Token.set_extension('split_start', getter=get_token_split_start)
|
Token.set_extension("split_start", getter=get_token_split_start)
|
||||||
Token.set_extension('split_end', getter=get_token_split_end)
|
Token.set_extension("split_end", getter=get_token_split_end)
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("inside_fused", default=False)
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
||||||
|
|
||||||
nlp = load_nlp(experiment_dir, corpus)
|
nlp = load_nlp(experiment_dir, corpus)
|
||||||
|
|
||||||
treebank_code = nlp.meta['treebank']
|
|
||||||
for section in ('test', 'dev'):
|
|
||||||
if section == 'dev':
|
|
||||||
section_dir = 'conll17-ud-development-2017-03-19'
|
|
||||||
else:
|
|
||||||
section_dir = 'conll17-ud-test-2017-05-09'
|
|
||||||
text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt')
|
|
||||||
udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu')
|
|
||||||
gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu')
|
|
||||||
|
|
||||||
header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
treebank_code = nlp.meta["treebank"]
|
||||||
print('\t'.join(header))
|
for section in ("test", "dev"):
|
||||||
inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path}
|
if section == "dev":
|
||||||
for input_type in ('udp', 'raw'):
|
section_dir = "conll17-ud-development-2017-03-19"
|
||||||
|
else:
|
||||||
|
section_dir = "conll17-ud-test-2017-05-09"
|
||||||
|
text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
|
||||||
|
udpipe_path = (
|
||||||
|
test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
|
||||||
|
)
|
||||||
|
gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
|
||||||
|
|
||||||
|
header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
|
print("\t".join(header))
|
||||||
|
inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
|
||||||
|
for input_type in ("udp", "raw"):
|
||||||
input_path = inputs[input_type]
|
input_path = inputs[input_type]
|
||||||
output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section)
|
output_path = (
|
||||||
|
experiment_dir / corpus / "{section}.conllu".format(section=section)
|
||||||
|
)
|
||||||
|
|
||||||
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
||||||
|
|
||||||
accuracy = print_results(input_type, test_scores)
|
accuracy = print_results(input_type, test_scores)
|
||||||
acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section)
|
acc_path = (
|
||||||
with open(acc_path, 'w') as file_:
|
experiment_dir
|
||||||
|
/ corpus
|
||||||
|
/ "{section}-accuracy.json".format(section=section)
|
||||||
|
)
|
||||||
|
with open(acc_path, "w") as file_:
|
||||||
file_.write(json.dumps(accuracy, indent=2))
|
file_.write(json.dumps(accuracy, indent=2))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
|
@ -1,7 +1,9 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
# flake8: noqa
|
||||||
|
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
.conllu format for development data, allowing the official scorer to be used.
|
||||||
'''
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -11,12 +13,12 @@ import json
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
from ..tokens import Token, Doc
|
from ...tokens import Token, Doc
|
||||||
from ..gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ..util import compounding, minibatch, minibatch_by_words
|
from ...util import compounding, minibatch, minibatch_by_words
|
||||||
from ..syntax.nonproj import projectivize
|
from ...syntax.nonproj import projectivize
|
||||||
from ..matcher import Matcher
|
from ...matcher import Matcher
|
||||||
from .. import displacy
|
from ... import displacy
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
|
@ -27,10 +29,9 @@ import cytoolz
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
from . import conll17_ud_eval
|
||||||
|
|
||||||
from .. import lang
|
from ... import lang
|
||||||
from .. import lang
|
from ...lang import zh
|
||||||
from ..lang import zh
|
from ...lang import ja
|
||||||
from ..lang import ja
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch
|
import torch
|
||||||
|
@ -42,17 +43,26 @@ except ImportError:
|
||||||
# Data reading #
|
# Data reading #
|
||||||
################
|
################
|
||||||
|
|
||||||
space_re = re.compile('\s+')
|
space_re = re.compile("\s+")
|
||||||
def split_text(text):
|
|
||||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
|
||||||
max_doc_length=None, limit=None):
|
def split_text(text):
|
||||||
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||||
|
|
||||||
|
|
||||||
|
def read_data(
|
||||||
|
nlp,
|
||||||
|
conllu_file,
|
||||||
|
text_file,
|
||||||
|
raw_text=True,
|
||||||
|
oracle_segments=False,
|
||||||
|
max_doc_length=None,
|
||||||
|
limit=None,
|
||||||
|
):
|
||||||
|
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||||
include Doc objects created using nlp.make_doc and then aligned against
|
include Doc objects created using nlp.make_doc and then aligned against
|
||||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||||
created from the gold-standard segments. At least one must be True.'''
|
created from the gold-standard segments. At least one must be True."""
|
||||||
if not raw_text and not oracle_segments:
|
if not raw_text and not oracle_segments:
|
||||||
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
||||||
paragraphs = split_text(text_file.read())
|
paragraphs = split_text(text_file.read())
|
||||||
|
@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
||||||
for cs in cd:
|
for cs in cd:
|
||||||
sent = defaultdict(list)
|
sent = defaultdict(list)
|
||||||
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
||||||
if '.' in id_:
|
if "." in id_:
|
||||||
continue
|
continue
|
||||||
if '-' in id_:
|
if "-" in id_:
|
||||||
continue
|
continue
|
||||||
id_ = int(id_)-1
|
id_ = int(id_) - 1
|
||||||
head = int(head)-1 if head != '0' else id_
|
head = int(head) - 1 if head != "0" else id_
|
||||||
sent['words'].append(word)
|
sent["words"].append(word)
|
||||||
sent['tags'].append(tag)
|
sent["tags"].append(tag)
|
||||||
sent['heads'].append(head)
|
sent["heads"].append(head)
|
||||||
sent['deps'].append('ROOT' if dep == 'root' else dep)
|
sent["deps"].append("ROOT" if dep == "root" else dep)
|
||||||
sent['spaces'].append(space_after == '_')
|
sent["spaces"].append(space_after == "_")
|
||||||
sent['entities'] = ['-'] * len(sent['words'])
|
sent["entities"] = ["-"] * len(sent["words"])
|
||||||
sent['heads'], sent['deps'] = projectivize(sent['heads'],
|
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
|
||||||
sent['deps'])
|
|
||||||
if oracle_segments:
|
if oracle_segments:
|
||||||
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
|
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||||
golds.append(GoldParse(docs[-1], **sent))
|
golds.append(GoldParse(docs[-1], **sent))
|
||||||
|
|
||||||
sent_annots.append(sent)
|
sent_annots.append(sent)
|
||||||
|
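A hedged usage sketch; the paths are placeholders, and the return of the accumulated docs and golds sits in the part of the function elided between these hunks.

    import spacy

    nlp = spacy.blank("en")
    with open("en_ewt-ud-train.conllu") as conllu_file, \
            open("en_ewt-ud-train.txt") as text_file:
        docs, golds = read_data(
            nlp,
            conllu_file,
            text_file,
            oracle_segments=True,  # trust the gold sentence/token boundaries
            raw_text=False,        # don't also align against raw paragraphs
            max_doc_length=10,     # assumed cap on sentences grouped per Doc
            limit=1000,            # placeholder cap on examples
        )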
@ -107,18 +116,18 @@ def read_conllu(file_):
|
||||||
sent = []
|
sent = []
|
||||||
doc = []
|
doc = []
|
||||||
for line in file_:
|
for line in file_:
|
||||||
if line.startswith('# newdoc'):
|
if line.startswith("# newdoc"):
|
||||||
if doc:
|
if doc:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
doc = []
|
doc = []
|
||||||
elif line.startswith('#'):
|
elif line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
elif not line.strip():
|
elif not line.strip():
|
||||||
if sent:
|
if sent:
|
||||||
doc.append(sent)
|
doc.append(sent)
|
||||||
sent = []
|
sent = []
|
||||||
else:
|
else:
|
||||||
sent.append(list(line.strip().split('\t')))
|
sent.append(list(line.strip().split("\t")))
|
||||||
if len(sent[-1]) != 10:
|
if len(sent[-1]) != 10:
|
||||||
print(repr(line))
|
print(repr(line))
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
flat = defaultdict(list)
|
flat = defaultdict(list)
|
||||||
sent_starts = []
|
sent_starts = []
|
||||||
for sent in sent_annots:
|
for sent in sent_annots:
|
||||||
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
|
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
|
||||||
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
|
for field in ["words", "tags", "deps", "entities", "spaces"]:
|
||||||
flat[field].extend(sent[field])
|
flat[field].extend(sent[field])
|
||||||
sent_starts.append(True)
|
sent_starts.append(True)
|
||||||
sent_starts.extend([False] * (len(sent['words'])-1))
|
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||||
# Construct text if necessary
|
# Construct text if necessary
|
||||||
assert len(flat['words']) == len(flat['spaces'])
|
assert len(flat["words"]) == len(flat["spaces"])
|
||||||
if text is None:
|
if text is None:
|
||||||
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
|
text = "".join(
|
||||||
|
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||||
|
)
|
||||||
doc = nlp.make_doc(text)
|
doc = nlp.make_doc(text)
|
||||||
flat.pop('spaces')
|
flat.pop("spaces")
|
||||||
gold = GoldParse(doc, **flat)
|
gold = GoldParse(doc, **flat)
|
||||||
gold.sent_starts = sent_starts
|
gold.sent_starts = sent_starts
|
||||||
for i in range(len(gold.heads)):
|
for i in range(len(gold.heads)):
|
||||||
|
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
|
|
||||||
return doc, gold
|
return doc, gold
|
||||||
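A quick illustration of the head re-indexing above when two sentences are flattened into one document; the toy annotations are invented.

    from collections import defaultdict

    # Sentence-local head indices are shifted by the number of words already
    # collected, so the second sentence's heads [1, 1] become [3, 3].
    sent_annots = [
        {"words": ["I", "slept"], "heads": [1, 1]},
        {"words": ["You", "ran"], "heads": [1, 1]},
    ]
    flat = defaultdict(list)
    for sent in sent_annots:
        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
        flat["words"].extend(sent["words"])
    print(flat["heads"])  # [1, 1, 3, 3]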
|
|
||||||
|
|
||||||
#############################
|
#############################
|
||||||
# Data transforms for spaCy #
|
# Data transforms for spaCy #
|
||||||
#############################
|
#############################
|
||||||
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
|
def golds_to_gold_tuples(docs, golds):
|
||||||
'''Get out the annoying 'tuples' format used by begin_training, given the
|
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||||
GoldParse objects.'''
|
GoldParse objects."""
|
||||||
tuples = []
|
tuples = []
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
text = doc.text
|
text = doc.text
|
||||||
|
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
|
||||||
# Evaluation #
|
# Evaluation #
|
||||||
##############
|
##############
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
if text_loc.parts[-1].endswith('.conllu'):
|
if text_loc.parts[-1].endswith(".conllu"):
|
||||||
docs = []
|
docs = []
|
||||||
with text_loc.open() as file_:
|
with text_loc.open() as file_:
|
||||||
for conllu_doc in read_conllu(file_):
|
for conllu_doc in read_conllu(file_):
|
||||||
|
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
docs = list(component.pipe(docs))
|
docs = list(component.pipe(docs))
|
||||||
else:
|
else:
|
||||||
with text_loc.open('r', encoding='utf8') as text_file:
|
with text_loc.open("r", encoding="utf8") as text_file:
|
||||||
texts = split_text(text_file.read())
|
texts = split_text(text_file.read())
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts))
|
||||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||||
write_conllu(docs, out_file)
|
write_conllu(docs, out_file)
|
||||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||||
return docs, scores
|
return docs, scores
|
||||||
|
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
|
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
||||||
for word in doc[sent[0].i-10 : sent[0].i]:
|
for word in doc[sent[0].i - 10 : sent[0].i]:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
for word in sent:
|
for word in sent:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
for word in doc[sent[-1].i : sent[-1].i+10]:
|
for word in doc[sent[-1].i : sent[-1].i + 10]:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text)
|
raise ValueError(
|
||||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
"Invalid parse: head outside sentence (%s)" % token.text
|
||||||
file_.write('\n')
|
)
|
||||||
|
file_.write(token._.get_conllu_lines(k) + "\n")
|
||||||
|
file_.write("\n")
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
|
def print_progress(itn, losses, ud_scores):
|
||||||
fields = {
|
fields = {
|
||||||
'dep_loss': losses.get('parser', 0.0),
|
"dep_loss": losses.get("parser", 0.0),
|
||||||
'tag_loss': losses.get('tagger', 0.0),
|
"tag_loss": losses.get("tagger", 0.0),
|
||||||
'words': ud_scores['Words'].f1 * 100,
|
"words": ud_scores["Words"].f1 * 100,
|
||||||
'sents': ud_scores['Sentences'].f1 * 100,
|
"sents": ud_scores["Sentences"].f1 * 100,
|
||||||
'tags': ud_scores['XPOS'].f1 * 100,
|
"tags": ud_scores["XPOS"].f1 * 100,
|
||||||
'uas': ud_scores['UAS'].f1 * 100,
|
"uas": ud_scores["UAS"].f1 * 100,
|
||||||
'las': ud_scores['LAS'].f1 * 100,
|
"las": ud_scores["LAS"].f1 * 100,
|
||||||
}
|
}
|
||||||
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
if itn == 0:
|
if itn == 0:
|
||||||
print('\t'.join(header))
|
print("\t".join(header))
|
||||||
tpl = '\t'.join((
|
tpl = "\t".join(
|
||||||
'{:d}',
|
(
|
||||||
'{dep_loss:.1f}',
|
"{:d}",
|
||||||
'{las:.1f}',
|
"{dep_loss:.1f}",
|
||||||
'{uas:.1f}',
|
"{las:.1f}",
|
||||||
'{tags:.1f}',
|
"{uas:.1f}",
|
||||||
'{sents:.1f}',
|
"{tags:.1f}",
|
||||||
'{words:.1f}',
|
"{sents:.1f}",
|
||||||
))
|
"{words:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **fields))
|
print(tpl.format(itn, **fields))
|
||||||
|
|
||||||
#def get_sent_conllu(sent, sent_id):
|
|
||||||
|
# def get_sent_conllu(sent, sent_id):
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||||
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
|
def get_token_conllu(token, i):
|
||||||
if token._.begins_fused:
|
if token._.begins_fused:
|
||||||
n = 1
|
n = 1
|
||||||
while token.nbor(n)._.inside_fused:
|
while token.nbor(n)._.inside_fused:
|
||||||
n += 1
|
n += 1
|
||||||
id_ = '%d-%d' % (i, i+n)
|
id_ = "%d-%d" % (i, i + n)
|
||||||
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
|
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
|
||||||
else:
|
else:
|
||||||
lines = []
|
lines = []
|
||||||
if token.head.i == token.i:
|
if token.head.i == token.i:
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = i + (token.head.i - token.i) + 1
|
head = i + (token.head.i - token.i) + 1
|
||||||
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
fields = [
|
||||||
str(head), token.dep_.lower(), '_', '_']
|
str(i + 1),
|
||||||
lines.append('\t'.join(fields))
|
token.text,
|
||||||
return '\n'.join(lines)
|
token.lemma_,
|
||||||
|
token.pos_,
|
||||||
|
token.tag_,
|
||||||
|
"_",
|
||||||
|
str(head),
|
||||||
|
token.dep_.lower(),
|
||||||
|
"_",
|
||||||
|
"_",
|
||||||
|
]
|
||||||
|
lines.append("\t".join(fields))
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
|
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config, vectors=None):
|
def load_nlp(corpus, config, vectors=None):
|
||||||
lang = corpus.split('_')[0]
|
lang = corpus.split("_")[0]
|
||||||
nlp = spacy.blank(lang)
|
nlp = spacy.blank(lang)
|
||||||
if config.vectors:
|
if config.vectors:
|
||||||
if not vectors:
|
if not vectors:
|
||||||
raise ValueError("config asks for vectors, but no vectors "
|
raise ValueError(
|
||||||
"directory set on command line (use -v)")
|
"config asks for vectors, but no vectors "
|
||||||
|
"directory set on command line (use -v)"
|
||||||
|
)
|
||||||
if (Path(vectors) / corpus).exists():
|
if (Path(vectors) / corpus).exists():
|
||||||
nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab')
|
nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
|
||||||
nlp.meta['treebank'] = corpus
|
nlp.meta["treebank"] = corpus
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
if config.multitask_tag:
|
if config.multitask_tag:
|
||||||
nlp.parser.add_multitask_objective('tag')
|
nlp.parser.add_multitask_objective("tag")
|
||||||
if config.multitask_sent:
|
if config.multitask_sent:
|
||||||
nlp.parser.add_multitask_objective('sent_start')
|
nlp.parser.add_multitask_objective("sent_start")
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.tags:
|
for tag in gold.tags:
|
||||||
if tag is not None:
|
if tag is not None:
|
||||||
nlp.tagger.add_label(tag)
|
nlp.tagger.add_label(tag)
|
||||||
if torch is not None and device != -1:
|
if torch is not None and device != -1:
|
||||||
torch.set_default_tensor_type('torch.cuda.FloatTensor')
|
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||||
optimizer = nlp.begin_training(
|
optimizer = nlp.begin_training(
|
||||||
lambda: golds_to_gold_tuples(docs, golds), device=device,
|
lambda: golds_to_gold_tuples(docs, golds),
|
||||||
subword_features=config.subword_features, conv_depth=config.conv_depth,
|
device=device,
|
||||||
bilstm_depth=config.bilstm_depth)
|
subword_features=config.subword_features,
|
||||||
|
conv_depth=config.conv_depth,
|
||||||
|
bilstm_depth=config.bilstm_depth,
|
||||||
|
)
|
||||||
if config.pretrained_tok2vec:
|
if config.pretrained_tok2vec:
|
||||||
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
||||||
return optimizer
|
return optimizer
|
||||||
|
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
|
||||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
with Path(loc).open('rb') as file_:
|
with Path(loc).open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# Command line helpers #
|
# Command line helpers #
|
||||||
########################
|
########################
|
||||||
|
|
||||||
|
|
||||||
class Config(object):
|
class Config(object):
|
||||||
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False,
|
def __init__(
|
||||||
multitask_sent=False, multitask_dep=False, multitask_vectors=None,
|
self,
|
||||||
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750,
|
vectors=None,
|
||||||
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True,
|
max_doc_length=10,
|
||||||
vectors_dir=None, pretrained_tok2vec=None):
|
multitask_tag=False,
|
||||||
|
multitask_sent=False,
|
||||||
|
multitask_dep=False,
|
||||||
|
multitask_vectors=None,
|
||||||
|
bilstm_depth=0,
|
||||||
|
nr_epoch=30,
|
||||||
|
min_batch_size=100,
|
||||||
|
max_batch_size=1000,
|
||||||
|
batch_by_words=True,
|
||||||
|
dropout=0.2,
|
||||||
|
conv_depth=4,
|
||||||
|
subword_features=True,
|
||||||
|
vectors_dir=None,
|
||||||
|
pretrained_tok2vec=None,
|
||||||
|
):
|
||||||
if vectors_dir is not None:
|
if vectors_dir is not None:
|
||||||
if vectors is None:
|
if vectors is None:
|
||||||
vectors = True
|
vectors = True
|
||||||
|
@ -346,13 +396,13 @@ class Config(object):
|
||||||
multitask_vectors = True
|
multitask_vectors = True
|
||||||
for key, value in locals().items():
|
for key, value in locals().items():
|
||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, loc, vectors_dir=None):
|
def load(cls, loc, vectors_dir=None):
|
||||||
with Path(loc).open('r', encoding='utf8') as file_:
|
with Path(loc).open("r", encoding="utf8") as file_:
|
||||||
cfg = json.load(file_)
|
cfg = json.load(file_)
|
||||||
if vectors_dir is not None:
|
if vectors_dir is not None:
|
||||||
cfg['vectors_dir'] = vectors_dir
|
cfg["vectors_dir"] = vectors_dir
|
||||||
return cls(**cfg)
|
return cls(**cfg)
|
||||||
|
|
||||||
|
|
||||||
|
@ -364,43 +414,59 @@ class Dataset(object):
|
||||||
self.text = None
|
self.text = None
|
||||||
for file_path in self.path.iterdir():
|
for file_path in self.path.iterdir():
|
||||||
name = file_path.parts[-1]
|
name = file_path.parts[-1]
|
||||||
if section in name and name.endswith('conllu'):
|
if section in name and name.endswith("conllu"):
|
||||||
self.conllu = file_path
|
self.conllu = file_path
|
||||||
elif section in name and name.endswith('txt'):
|
elif section in name and name.endswith("txt"):
|
||||||
self.text = file_path
|
self.text = file_path
|
||||||
if self.conllu is None:
|
if self.conllu is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
raise IOError(msg.format(section=section, path=path))
|
raise IOError(msg.format(section=section, path=path))
|
||||||
if self.text is None:
|
if self.text is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
|
class TreebankPaths(object):
|
||||||
def __init__(self, ud_path, treebank, **cfg):
|
def __init__(self, ud_path, treebank, **cfg):
|
||||||
self.train = Dataset(ud_path / treebank, 'train')
|
self.train = Dataset(ud_path / treebank, "train")
|
||||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
self.dev = Dataset(ud_path / treebank, "dev")
|
||||||
self.lang = self.train.lang
|
self.lang = self.train.lang
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
corpus=(
|
||||||
"positional", None, str),
|
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||||
config=("Path to json formatted config file", "option", "C", Path),
|
config=("Path to json formatted config file", "option", "C", Path),
|
||||||
limit=("Size limit", "option", "n", int),
|
limit=("Size limit", "option", "n", int),
|
||||||
gpu_device=("Use GPU", "option", "g", int),
|
gpu_device=("Use GPU", "option", "g", int),
|
||||||
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
||||||
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
|
vectors_dir=(
|
||||||
"option", "v", Path),
|
"Path to directory with pre-trained vectors, named e.g. en/",
|
||||||
|
"option",
|
||||||
|
"v",
|
||||||
|
Path,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None,
|
def main(
|
||||||
use_oracle_segments=False):
|
ud_dir,
|
||||||
|
parses_dir,
|
||||||
|
corpus,
|
||||||
|
config=None,
|
||||||
|
limit=0,
|
||||||
|
gpu_device=-1,
|
||||||
|
vectors_dir=None,
|
||||||
|
use_oracle_segments=False,
|
||||||
|
):
|
||||||
spacy.util.fix_random_seed()
|
spacy.util.fix_random_seed()
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
|
|
||||||
if config is not None:
|
if config is not None:
|
||||||
config = Config.load(config, vectors_dir=vectors_dir)
|
config = Config.load(config, vectors_dir=vectors_dir)
|
||||||
else:
|
else:
|
||||||
|
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
||||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||||
|
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length,
|
nlp,
|
||||||
limit=limit)
|
paths.train.conllu.open(),
|
||||||
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
)
|
||||||
|
|
||||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||||
|
|
||||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||||
for i in range(config.nr_epoch):
|
for i in range(config.nr_epoch):
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length, limit=limit,
|
nlp,
|
||||||
oracle_segments=use_oracle_segments,
|
paths.train.conllu.open(),
|
||||||
raw_text=not use_oracle_segments)
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
oracle_segments=use_oracle_segments,
|
||||||
|
raw_text=not use_oracle_segments,
|
||||||
|
)
|
||||||
Xs = list(zip(docs, golds))
|
Xs = list(zip(docs, golds))
|
||||||
random.shuffle(Xs)
|
random.shuffle(Xs)
|
||||||
if config.batch_by_words:
|
if config.batch_by_words:
|
||||||
|
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_docs, batch_gold = zip(*batch)
|
batch_docs, batch_gold = zip(*batch)
|
||||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||||
nlp.parser.cfg['beam_update_prob'] = next(beam_prob)
|
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
nlp.update(
|
||||||
drop=config.dropout, losses=losses)
|
batch_docs,
|
||||||
|
batch_gold,
|
||||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
sgd=optimizer,
|
||||||
|
drop=config.dropout,
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
if use_oracle_segments:
|
if use_oracle_segments:
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
|
parsed_docs, scores = evaluate(
|
||||||
paths.dev.conllu, out_path)
|
nlp, paths.dev.conllu, paths.dev.conllu, out_path
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.text,
|
parsed_docs, scores = evaluate(
|
||||||
paths.dev.conllu, out_path)
|
nlp, paths.dev.text, paths.dev.conllu, out_path
|
||||||
|
)
|
||||||
print_progress(i, losses, scores)
|
print_progress(i, losses, scores)
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
def _render_parses(i, to_render):
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
to_render[0].user_data["title"] = "Batch %d" % i
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
with Path("/tmp/parses.html").open("w") as file_:
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
html = displacy.render(to_render[:5], style="dep", page=True)
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
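A hedged sketch of calling the entry point programmatically instead of via plac on the command line; the treebank name and directory paths are assumptions for illustration only:

    from pathlib import Path

    main(
        ud_dir=Path("/data/ud-treebanks-v2.2"),   # assumed corpus location
        parses_dir=Path("/tmp/parses"),
        corpus="en_ewt",                          # assumed treebank name
        config=None,
        limit=1000,
        gpu_device=-1,
        use_oracle_segments=False,
    )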
|
|
@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
|
||||||
import requests
|
import requests
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, locale_escape
|
from ..compat import path2str
|
||||||
from ..util import prints, get_data_path, read_json
|
from ..util import get_data_path, read_json
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def validate():
|
def validate():
|
||||||
"""Validate that the currently installed version of spaCy is compatible
|
"""
|
||||||
|
Validate that the currently installed version of spaCy is compatible
|
||||||
with the installed models. Should be run after `pip install -U spacy`.
|
with the installed models. Should be run after `pip install -U spacy`.
|
||||||
"""
|
"""
|
||||||
r = requests.get(about.__compatibility__)
|
msg = Printer()
|
||||||
if r.status_code != 200:
|
with msg.loading("Loading compatibility table..."):
|
||||||
prints(Messages.M021, title=Messages.M003.format(code=r.status_code),
|
r = requests.get(about.__compatibility__)
|
||||||
exits=1)
|
if r.status_code != 200:
|
||||||
compat = r.json()['spacy']
|
msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
|
||||||
|
msg.good("Loaded compatibility table")
|
||||||
|
compat = r.json()["spacy"]
|
||||||
current_compat = compat.get(about.__version__)
|
current_compat = compat.get(about.__version__)
|
||||||
if not current_compat:
|
if not current_compat:
|
||||||
prints(about.__compatibility__, exits=1,
|
msg.fail(
|
||||||
title=Messages.M022.format(version=about.__version__))
|
Messages.M022.format(version=about.__version__),
|
||||||
|
about.__compatibility__,
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
all_models = set()
|
all_models = set()
|
||||||
for spacy_v, models in dict(compat).items():
|
for spacy_v, models in dict(compat).items():
|
||||||
all_models.update(models.keys())
|
all_models.update(models.keys())
|
||||||
|
@ -33,33 +39,38 @@ def validate():
|
||||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||||
model_links = get_model_links(current_compat)
|
model_links = get_model_links(current_compat)
|
||||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||||
if not d['compat']}
|
incompat_models.update(
|
||||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||||
if not d['compat']])
|
)
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
|
spacy_dir = Path(__file__).parent.parent
|
||||||
|
|
||||||
|
msg.divider(Messages.M023.format(version=about.__version__))
|
||||||
|
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||||
|
|
||||||
prints(path2str(Path(__file__).parent.parent),
|
|
||||||
title=Messages.M023.format(version=about.__version__))
|
|
||||||
if model_links or model_pkgs:
|
if model_links or model_pkgs:
|
||||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||||
|
rows = []
|
||||||
for name, data in model_pkgs.items():
|
for name, data in model_pkgs.items():
|
||||||
print(get_model_row(current_compat, name, data, 'package'))
|
rows.append(get_model_row(current_compat, name, data, msg))
|
||||||
for name, data in model_links.items():
|
for name, data in model_links.items():
|
||||||
print(get_model_row(current_compat, name, data, 'link'))
|
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||||
|
msg.table(rows, header=header)
|
||||||
else:
|
else:
|
||||||
prints(Messages.M024, exits=0)
|
msg.text(Messages.M024, exits=0)
|
||||||
if update_models:
|
if update_models:
|
||||||
cmd = ' python -m spacy download {}'
|
msg.divider("Install updates")
|
||||||
print("\n " + Messages.M025)
|
cmd = "python -m spacy download {}"
|
||||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||||
if na_models:
|
if na_models:
|
||||||
prints(Messages.M025.format(version=about.__version__,
|
msg.text(
|
||||||
models=', '.join(na_models)))
|
Messages.M025.format(version=about.__version__, models=", ".join(na_models))
|
||||||
|
)
|
||||||
if incompat_links:
|
if incompat_links:
|
||||||
prints(Messages.M027.format(path=path2str(get_data_path())))
|
msg.text(Messages.M027.format(path=path2str(get_data_path())))
|
||||||
if incompat_models or incompat_links:
|
if incompat_models or incompat_links:
|
||||||
sys.exit(1)
|
sys.exit(1)
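The refactored command is invoked exactly as before, either as `python -m spacy validate` on the command line or from Python:

    from spacy.cli import validate

    # Prints the wasabi-formatted compatibility table and exits non-zero
    # if incompatible models or links are found.
    validate()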
|
||||||
|
|
||||||
|
@ -70,50 +81,48 @@ def get_model_links(compat):
|
||||||
if data_path:
|
if data_path:
|
||||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||||
for model in models:
|
for model in models:
|
||||||
meta_path = Path(model) / 'meta.json'
|
meta_path = Path(model) / "meta.json"
|
||||||
if not meta_path.exists():
|
if not meta_path.exists():
|
||||||
continue
|
continue
|
||||||
meta = read_json(meta_path)
|
meta = read_json(meta_path)
|
||||||
link = model.parts[-1]
|
link = model.parts[-1]
|
||||||
name = meta['lang'] + '_' + meta['name']
|
name = meta["lang"] + "_" + meta["name"]
|
||||||
links[link] = {'name': name, 'version': meta['version'],
|
links[link] = {
|
||||||
'compat': is_compat(compat, name, meta['version'])}
|
"name": name,
|
||||||
|
"version": meta["version"],
|
||||||
|
"compat": is_compat(compat, name, meta["version"]),
|
||||||
|
}
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def get_model_pkgs(compat, all_models):
|
def get_model_pkgs(compat, all_models):
|
||||||
pkgs = {}
|
pkgs = {}
|
||||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||||
package = pkg_name.replace('-', '_')
|
package = pkg_name.replace("-", "_")
|
||||||
if package in all_models:
|
if package in all_models:
|
||||||
version = pkg_data.version
|
version = pkg_data.version
|
||||||
pkgs[pkg_name] = {'name': package, 'version': version,
|
pkgs[pkg_name] = {
|
||||||
'compat': is_compat(compat, package, version)}
|
"name": package,
|
||||||
|
"version": version,
|
||||||
|
"compat": is_compat(compat, package, version),
|
||||||
|
}
|
||||||
return pkgs
|
return pkgs
|
||||||
|
|
||||||
|
|
||||||
def get_model_row(compat, name, data, type='package'):
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']:
comp = tpl_green.format(locale_escape('✔', errors='ignore'))
version = tpl_green.format(data['version'])
else:
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
version = tpl_red.format(data['version'])
return get_row(type, name, data['name'], version, comp)
|
def get_model_row(compat, name, data, msg, model_type="package"):
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
return (model_type, name, data["name"], version, comp)
|
||||||
|
|
||||||
|
|
||||||
def get_row(*args):
|
|
||||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
|
||||||
return tpl_row.format(*args)
|
|
||||||
|
|
||||||
|
|
||||||
def is_model_path(model_path):
|
def is_model_path(model_path):
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ["cache", "pycache", "__pycache__"]
|
||||||
name = model_path.parts[-1]
|
name = model_path.parts[-1]
|
||||||
return (model_path.is_dir() and name not in exclude
|
return model_path.is_dir() and name not in exclude and not name.startswith(".")
|
||||||
and not name.startswith('.'))
|
|
||||||
|
|
||||||
|
|
||||||
def is_compat(compat, name, version):
|
def is_compat(compat, name, version):
|
||||||
|
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
|
||||||
|
|
||||||
def reformat_version(version):
|
def reformat_version(version):
|
||||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||||
if version.endswith('-alpha'):
|
if version.endswith("-alpha"):
|
||||||
return version.replace('-alpha', 'a0')
|
return version.replace("-alpha", "a0")
|
||||||
return version.replace('-alpha', 'a')
|
return version.replace("-alpha", "a")
|
||||||
|
|
|
@ -1,59 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
import spacy
|
|
||||||
import numpy
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ..vectors import Vectors
|
|
||||||
from ..util import prints, ensure_path
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
lang=("model language", "positional", None, str),
|
|
||||||
output_dir=("model output directory", "positional", None, Path),
|
|
||||||
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
|
|
||||||
None, Path),
|
|
||||||
vectors_loc=("optional: location of vectors data, as numpy .npz",
|
|
||||||
"positional", None, str),
|
|
||||||
prune_vectors=("optional: number of vectors to prune to.",
|
|
||||||
"option", "V", int)
|
|
||||||
)
|
|
||||||
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
|
|
||||||
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
|
|
||||||
if not lexemes_loc.exists():
|
|
||||||
prints(lexemes_loc, title="Can't find lexical data", exits=1)
|
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
|
||||||
nlp = spacy.blank(lang)
|
|
||||||
for word in nlp.vocab:
|
|
||||||
word.rank = 0
|
|
||||||
lex_added = 0
|
|
||||||
with lexemes_loc.open() as file_:
|
|
||||||
for line in file_:
|
|
||||||
if line.strip():
|
|
||||||
attrs = json.loads(line)
|
|
||||||
if 'settings' in attrs:
|
|
||||||
nlp.vocab.cfg.update(attrs['settings'])
|
|
||||||
else:
|
|
||||||
lex = nlp.vocab[attrs['orth']]
|
|
||||||
lex.set_attrs(**attrs)
|
|
||||||
assert lex.rank == attrs['id']
|
|
||||||
lex_added += 1
|
|
||||||
if vectors_loc is not None:
|
|
||||||
vector_data = numpy.load(vectors_loc.open('rb'))
|
|
||||||
nlp.vocab.vectors = Vectors(data=vector_data)
|
|
||||||
for word in nlp.vocab:
|
|
||||||
if word.rank:
|
|
||||||
nlp.vocab.vectors.add(word.orth, row=word.rank)
|
|
||||||
|
|
||||||
if prune_vectors >= 1:
|
|
||||||
remap = nlp.vocab.prune_vectors(prune_vectors)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
nlp.to_disk(output_dir)
|
|
||||||
vec_added = len(nlp.vocab.vectors)
|
|
||||||
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
|
|
||||||
title="Sucessfully compiled vocab and vectors, and saved model")
|
|
||||||
return nlp
|
|
|
@ -1,11 +1,10 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
import ujson
|
||||||
import itertools
|
import itertools
|
||||||
import locale
|
|
||||||
import os
|
|
||||||
|
|
||||||
from thinc.neural.util import copy_array
|
from thinc.neural.util import copy_array
|
||||||
|
|
||||||
|
@ -30,9 +29,9 @@ except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from thinc.neural.optimizers import Optimizer
|
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from thinc.neural.optimizers import Adam as Optimizer
|
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
|
||||||
|
|
||||||
pickle = pickle
|
pickle = pickle
|
||||||
copy_reg = copy_reg
|
copy_reg = copy_reg
|
||||||
|
@ -136,12 +135,3 @@ def import_file(name, loc):
|
||||||
module = importlib.util.module_from_spec(spec)
|
module = importlib.util.module_from_spec(spec)
|
||||||
spec.loader.exec_module(module)
|
spec.loader.exec_module(module)
|
||||||
return module
|
return module
|
||||||
|
|
||||||
|
|
||||||
def locale_escape(string, errors="replace"):
|
|
||||||
"""
|
|
||||||
Mangle non-supported characters, for savages with ascii terminals.
|
|
||||||
"""
|
|
||||||
encoding = locale.getpreferredencoding()
|
|
||||||
string = string.encode(encoding, errors).decode("utf8")
|
|
||||||
return string
|
|
||||||
|
|
|
@ -5,15 +5,22 @@ from .render import DependencyRenderer, EntityRenderer
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..compat import b_to_str
|
from ..compat import b_to_str
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings, user_warning
|
||||||
from ..util import prints, is_in_jupyter
|
from ..util import is_in_jupyter
|
||||||
|
|
||||||
|
|
||||||
_html = {}
|
_html = {}
|
||||||
IS_JUPYTER = is_in_jupyter()
|
IS_JUPYTER = is_in_jupyter()
|
||||||
|
|
||||||
|
|
||||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
def render(
|
||||||
options={}, manual=False):
|
docs,
|
||||||
|
style="dep",
|
||||||
|
page=False,
|
||||||
|
minify=False,
|
||||||
|
jupyter=IS_JUPYTER,
|
||||||
|
options={},
|
||||||
|
manual=False,
|
||||||
|
):
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -25,8 +32,10 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
factories = {
|
||||||
'ent': (EntityRenderer, parse_ents)}
|
"dep": (DependencyRenderer, parse_deps),
|
||||||
|
"ent": (EntityRenderer, parse_ents),
|
||||||
|
}
|
||||||
if style not in factories:
|
if style not in factories:
|
||||||
raise ValueError(Errors.E087.format(style=style))
|
raise ValueError(Errors.E087.format(style=style))
|
||||||
if isinstance(docs, (Doc, Span, dict)):
|
if isinstance(docs, (Doc, Span, dict)):
|
||||||
|
@ -37,16 +46,18 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
renderer, converter = factories[style]
|
renderer, converter = factories[style]
|
||||||
renderer = renderer(options=options)
|
renderer = renderer(options=options)
|
||||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||||
html = _html['parsed']
|
html = _html["parsed"]
|
||||||
if jupyter: # return HTML rendered by IPython display()
|
if jupyter: # return HTML rendered by IPython display()
|
||||||
from IPython.core.display import display, HTML
|
from IPython.core.display import display, HTML
|
||||||
|
|
||||||
return display(HTML(html))
|
return display(HTML(html))
|
||||||
return html
|
return html
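Typical use of the public API whose signature is being reformatted above; the model name is an assumption and any installed pipeline with a parser would do:

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")  # assumed installed model
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
    html = displacy.render(doc, style="dep", page=True, jupyter=False)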
|
||||||
|
|
||||||
|
|
||||||
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
def serve(
|
||||||
port=5000):
|
docs, style="dep", page=True, minify=False, options={}, manual=False, port=5000
|
||||||
|
):
|
||||||
"""Serve displaCy visualisation.
|
"""Serve displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -58,25 +69,24 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
render(docs, style=style, page=page, minify=minify, options=options,
|
|
||||||
manual=manual)
|
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
httpd = simple_server.make_server("0.0.0.0", port, app)
|
||||||
prints("Using the '{}' visualizer".format(style),
|
print("\nUsing the '{}' visualizer".format(style))
|
||||||
title="Serving on port {}...".format(port))
|
print("Serving on port {}...\n".format(port))
|
||||||
try:
|
try:
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
prints("Shutting down server on port {}.".format(port))
|
print("Shutting down server on port {}.".format(port))
|
||||||
finally:
|
finally:
|
||||||
httpd.server_close()
|
httpd.server_close()
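And the corresponding blocking server call, which now reports its status with plain print() instead of util.prints():

    from spacy import displacy

    # Serves on 0.0.0.0:8080 until interrupted; `doc` as in the render example above.
    displacy.serve(doc, style="ent", port=8080)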
|
||||||
|
|
||||||
|
|
||||||
def app(environ, start_response):
|
def app(environ, start_response):
|
||||||
# headers and status need to be bytes in Python 2, see #1227
|
# Headers and status need to be bytes in Python 2, see #1227
|
||||||
headers = [(b_to_str(b'Content-type'),
|
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
|
||||||
b_to_str(b'text/html; charset=utf-8'))]
|
start_response(b_to_str(b"200 OK"), headers)
|
||||||
start_response(b_to_str(b'200 OK'), headers)
|
res = _html["parsed"].encode(encoding="utf-8")
|
||||||
res = _html['parsed'].encode(encoding='utf-8')
|
|
||||||
return [res]
|
return [res]
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,11 +99,10 @@ def parse_deps(orig_doc, options={}):
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
user_warning(Warnings.W005)
|
||||||
if options.get('collapse_phrases', False):
|
if options.get("collapse_phrases", False):
|
||||||
for np in list(doc.noun_chunks):
|
for np in list(doc.noun_chunks):
|
||||||
np.merge(tag=np.root.tag_, lemma=np.root.lemma_,
|
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
|
||||||
ent_type=np.root.ent_type_)
|
if options.get("collapse_punct", True):
|
||||||
if options.get('collapse_punct', True):
|
|
||||||
spans = []
|
spans = []
|
||||||
for word in doc[:-1]:
|
for word in doc[:-1]:
|
||||||
if word.is_punct or not word.nbor(1).is_punct:
|
if word.is_punct or not word.nbor(1).is_punct:
|
||||||
|
@ -103,23 +112,31 @@ def parse_deps(orig_doc, options={}):
|
||||||
while end < len(doc) and doc[end].is_punct:
|
while end < len(doc) and doc[end].is_punct:
|
||||||
end += 1
|
end += 1
|
||||||
span = doc[start:end]
|
span = doc[start:end]
|
||||||
spans.append((span.start_char, span.end_char, word.tag_,
|
spans.append(
|
||||||
word.lemma_, word.ent_type_))
|
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
|
||||||
|
)
|
||||||
for start, end, tag, lemma, ent_type in spans:
|
for start, end, tag, lemma, ent_type in spans:
|
||||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
||||||
if options.get('fine_grained'):
|
if options.get("fine_grained"):
|
||||||
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
||||||
else:
|
else:
|
||||||
words = [{'text': w.text, 'tag': w.pos_} for w in doc]
|
words = [{"text": w.text, "tag": w.pos_} for w in doc]
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i < word.head.i:
|
if word.i < word.head.i:
|
||||||
arcs.append({'start': word.i, 'end': word.head.i,
|
arcs.append(
|
||||||
'label': word.dep_, 'dir': 'left'})
|
{"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
|
||||||
|
)
|
||||||
elif word.i > word.head.i:
|
elif word.i > word.head.i:
|
||||||
arcs.append({'start': word.head.i, 'end': word.i,
|
arcs.append(
|
||||||
'label': word.dep_, 'dir': 'right'})
|
{
|
||||||
return {'words': words, 'arcs': arcs}
|
"start": word.head.i,
|
||||||
|
"end": word.i,
|
||||||
|
"label": word.dep_,
|
||||||
|
"dir": "right",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return {"words": words, "arcs": arcs}
|
||||||
|
|
||||||
|
|
||||||
def parse_ents(doc, options={}):
|
def parse_ents(doc, options={}):
|
||||||
|
@ -128,10 +145,11 @@ def parse_ents(doc, options={}):
|
||||||
doc (Doc): Document to parse.
|
doc (Doc): Document to parse.
|
||||||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||||
"""
|
"""
|
||||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
ents = [
|
||||||
for ent in doc.ents]
|
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
|
||||||
|
for ent in doc.ents
|
||||||
|
]
|
||||||
if not ents:
|
if not ents:
|
||||||
user_warning(Warnings.W006)
|
user_warning(Warnings.W006)
|
||||||
title = (doc.user_data.get('title', None)
if hasattr(doc, 'user_data') else None)
|
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||||
return {"text": doc.text, "ents": ents, "title": title}
|
||||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
|
||||||
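The dicts returned by parse_deps() and parse_ents() are exactly the format displacy.render() accepts with manual=True; a small hedged example (the character offsets are specific to this sentence):

    ent_input = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
        "title": None,
    }
    html = displacy.render(ent_input, style="ent", manual=True, jupyter=False)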
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import random
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
|
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
|
||||||
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from ..util import minify_html, escape_html
|
from ..util import minify_html, escape_html
|
||||||
|
@ -8,7 +10,8 @@ from ..util import minify_html, escape_html
|
||||||
|
|
||||||
class DependencyRenderer(object):
|
class DependencyRenderer(object):
|
||||||
"""Render dependency parses as SVGs."""
|
"""Render dependency parses as SVGs."""
|
||||||
style = 'dep'
|
|
||||||
|
style = "dep"
|
||||||
|
|
||||||
def __init__(self, options={}):
|
def __init__(self, options={}):
|
||||||
"""Initialise dependency renderer.
|
"""Initialise dependency renderer.
|
||||||
|
@ -17,18 +20,16 @@ class DependencyRenderer(object):
|
||||||
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
||||||
color, bg, font)
|
color, bg, font)
|
||||||
"""
|
"""
|
||||||
self.compact = options.get('compact', False)
|
self.compact = options.get("compact", False)
|
||||||
self.word_spacing = options.get('word_spacing', 45)
|
self.word_spacing = options.get("word_spacing", 45)
|
||||||
self.arrow_spacing = options.get('arrow_spacing',
|
self.arrow_spacing = options.get("arrow_spacing", 12 if self.compact else 20)
|
||||||
12 if self.compact else 20)
|
self.arrow_width = options.get("arrow_width", 6 if self.compact else 10)
|
||||||
self.arrow_width = options.get('arrow_width',
|
self.arrow_stroke = options.get("arrow_stroke", 2)
|
||||||
6 if self.compact else 10)
|
self.distance = options.get("distance", 150 if self.compact else 175)
|
||||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
self.offset_x = options.get("offset_x", 50)
|
||||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
self.color = options.get("color", "#000000")
|
||||||
self.offset_x = options.get('offset_x', 50)
|
self.bg = options.get("bg", "#ffffff")
|
||||||
self.color = options.get('color', '#000000')
|
self.font = options.get("font", "Arial")
|
||||||
self.bg = options.get('bg', '#ffffff')
|
|
||||||
self.font = options.get('font', 'Arial')
|
|
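The options read above are the same dict users pass through displacy.render(); the colour and font values here are arbitrary examples, and `doc` is any parsed document:

    options = {"compact": True, "color": "#09a3d5", "bg": "#ffffff", "font": "Arial"}
    html = displacy.render(doc, style="dep", options=options, jupyter=False)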
||||||
|
|
||||||
def render(self, parsed, page=False, minify=False):
|
def render(self, parsed, page=False, minify=False):
|
||||||
"""Render complete markup.
|
"""Render complete markup.
|
||||||
|
@ -38,14 +39,18 @@ class DependencyRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
RETURNS (unicode): Rendered SVG or HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_svg(i, p['words'], p['arcs'])
|
# Create a random ID prefix to make sure parses don't receive the
|
||||||
for i, p in enumerate(parsed)]
|
# same ID, even if they're identical
|
||||||
|
id_prefix = random.randint(0, 999)
|
||||||
|
rendered = [
|
||||||
|
self.render_svg("{}-{}".format(id_prefix, i), p["words"], p["arcs"])
|
||||||
|
for i, p in enumerate(parsed)
|
||||||
|
]
|
||||||
if page:
|
if page:
|
||||||
content = ''.join([TPL_FIGURE.format(content=svg)
|
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
||||||
for svg in rendered])
|
|
||||||
markup = TPL_PAGE.format(content=content)
|
markup = TPL_PAGE.format(content=content)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = "".join(rendered)
|
||||||
if minify:
|
if minify:
|
||||||
return minify_html(markup)
|
return minify_html(markup)
|
||||||
return markup
|
return markup
|
||||||
|
@ -60,19 +65,25 @@ class DependencyRenderer(object):
|
||||||
"""
|
"""
|
||||||
self.levels = self.get_levels(arcs)
|
self.levels = self.get_levels(arcs)
|
||||||
self.highest_level = len(self.levels)
|
self.highest_level = len(self.levels)
|
||||||
self.offset_y = self.distance/2*self.highest_level+self.arrow_stroke
|
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
|
||||||
self.width = self.offset_x+len(words)*self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y+3*self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w['text'], w['tag'], i)
|
words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
|
||||||
for i, w in enumerate(words)]
|
arcs = [
|
||||||
arcs = [self.render_arrow(a['label'], a['start'],
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
a['end'], a['dir'], i)
|
for i, a in enumerate(arcs)
|
||||||
for i, a in enumerate(arcs)]
|
]
|
||||||
content = ''.join(words) + ''.join(arcs)
|
content = "".join(words) + "".join(arcs)
|
||||||
return TPL_DEP_SVG.format(id=self.id, width=self.width,
|
return TPL_DEP_SVG.format(
|
||||||
height=self.height, color=self.color,
|
id=self.id,
|
||||||
bg=self.bg, font=self.font, content=content)
|
width=self.width,
|
||||||
|
height=self.height,
|
||||||
|
color=self.color,
|
||||||
|
bg=self.bg,
|
||||||
|
font=self.font,
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, i):
|
def render_word(self, text, tag, i):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
@ -82,12 +93,11 @@ class DependencyRenderer(object):
|
||||||
i (int): Unique ID, typically word index.
|
i (int): Unique ID, typically word index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (unicode): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
y = self.offset_y+self.word_spacing
|
y = self.offset_y + self.word_spacing
|
||||||
x = self.offset_x+i*self.distance
|
x = self.offset_x + i * self.distance
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
"""Render indivicual arrow.
|
"""Render indivicual arrow.
|
||||||
|
|
||||||
|
@ -98,20 +108,30 @@ class DependencyRenderer(object):
|
||||||
i (int): Unique ID, typically arrow index.
|
i (int): Unique ID, typically arrow index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (unicode): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
level = self.levels.index(end-start)+1
|
level = self.levels.index(end - start) + 1
|
||||||
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
||||||
y = self.offset_y
|
y = self.offset_y
|
||||||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
x_end = (
|
||||||
- self.arrow_spacing*(self.highest_level-level)/4)
|
self.offset_x
|
||||||
y_curve = self.offset_y-level*self.distance/2
|
+ (end - start) * self.distance
|
||||||
|
+ start * self.distance
|
||||||
|
- self.arrow_spacing * (self.highest_level - level) / 4
|
||||||
|
)
|
||||||
|
y_curve = self.offset_y - level * self.distance / 2
|
||||||
if self.compact:
|
if self.compact:
|
||||||
y_curve = self.offset_y-level*self.distance/6
|
y_curve = self.offset_y - level * self.distance / 6
|
||||||
if y_curve == 0 and len(self.levels) > 5:
|
if y_curve == 0 and len(self.levels) > 5:
|
||||||
y_curve = -self.distance
|
y_curve = -self.distance
|
||||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||||
arc = self.get_arc(x_start, y, y_curve, x_end)
|
arc = self.get_arc(x_start, y, y_curve, x_end)
|
||||||
return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke,
|
return TPL_DEP_ARCS.format(
|
||||||
head=arrowhead, label=label, arc=arc)
|
id=self.id,
|
||||||
|
i=i,
|
||||||
|
stroke=self.arrow_stroke,
|
||||||
|
head=arrowhead,
|
||||||
|
label=label,
|
||||||
|
arc=arc,
|
||||||
|
)
|
||||||
|
|
||||||
def get_arc(self, x_start, y, y_curve, x_end):
|
def get_arc(self, x_start, y, y_curve, x_end):
|
||||||
"""Render individual arc.
|
"""Render individual arc.
|
||||||
|
@ -136,13 +156,22 @@ class DependencyRenderer(object):
|
||||||
end (int): X-coordinate of arrow end point.
|
end (int): X-coordinate of arrow end point.
|
||||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||||
"""
|
"""
|
||||||
if direction == 'left':
|
if direction == "left":
|
||||||
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||||
else:
|
else:
|
||||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
|
pos1, pos2, pos3 = (
|
||||||
end-self.arrow_width+2)
|
end,
|
||||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
|
end + self.arrow_width - 2,
|
||||||
y-self.arrow_width)
|
end - self.arrow_width + 2,
|
||||||
|
)
|
||||||
|
arrowhead = (
|
||||||
|
pos1,
|
||||||
|
y + 2,
|
||||||
|
pos2,
|
||||||
|
y - self.arrow_width,
|
||||||
|
pos3,
|
||||||
|
y - self.arrow_width,
|
||||||
|
)
|
||||||
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
||||||
|
|
||||||
def get_levels(self, arcs):
|
def get_levels(self, arcs):
|
||||||
|
@ -152,30 +181,44 @@ class DependencyRenderer(object):
|
||||||
args (list): Individual arcs and their start, end, direction and label.
|
args (list): Individual arcs and their start, end, direction and label.
|
||||||
RETURNS (list): Arc levels sorted from lowest to highest.
|
RETURNS (list): Arc levels sorted from lowest to highest.
|
||||||
"""
|
"""
|
||||||
levels = set(map(lambda arc: arc['end'] - arc['start'], arcs))
|
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
|
||||||
return sorted(list(levels))
|
return sorted(list(levels))
|
||||||
|
|
||||||
|
|
||||||
class EntityRenderer(object):
|
class EntityRenderer(object):
|
||||||
"""Render named entities as HTML."""
|
"""Render named entities as HTML."""
|
||||||
style = 'ent'
|
|
||||||
|
style = "ent"
|
||||||
|
|
||||||
def __init__(self, options={}):
|
def __init__(self, options={}):
|
||||||
"""Initialise dependency renderer.
|
"""Initialise dependency renderer.
|
||||||
|
|
||||||
options (dict): Visualiser-specific options (colors, ents)
|
options (dict): Visualiser-specific options (colors, ents)
|
||||||
"""
|
"""
|
||||||
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
colors = {
|
||||||
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
"ORG": "#7aecec",
|
||||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
|
"PRODUCT": "#bfeeb7",
|
||||||
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
|
"GPE": "#feca74",
|
||||||
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
|
"LOC": "#ff9561",
|
||||||
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
"PERSON": "#aa9cfc",
|
||||||
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
"NORP": "#c887fb",
|
||||||
colors.update(options.get('colors', {}))
|
"FACILITY": "#9cc9cc",
|
||||||
self.default_color = '#ddd'
|
"EVENT": "#ffeb80",
|
||||||
|
"LAW": "#ff8197",
|
||||||
|
"LANGUAGE": "#ff8197",
|
||||||
|
"WORK_OF_ART": "#f0d0ff",
|
||||||
|
"DATE": "#bfe1d9",
|
||||||
|
"TIME": "#bfe1d9",
|
||||||
|
"MONEY": "#e4e7d2",
|
||||||
|
"QUANTITY": "#e4e7d2",
|
||||||
|
"ORDINAL": "#e4e7d2",
|
||||||
|
"CARDINAL": "#e4e7d2",
|
||||||
|
"PERCENT": "#e4e7d2",
|
||||||
|
}
|
||||||
|
colors.update(options.get("colors", {}))
|
||||||
|
self.default_color = "#ddd"
|
||||||
self.colors = colors
|
self.colors = colors
|
||||||
self.ents = options.get('ents', None)
|
self.ents = options.get("ents", None)
|
||||||
|
|
||||||
def render(self, parsed, page=False, minify=False):
|
def render(self, parsed, page=False, minify=False):
|
||||||
"""Render complete markup.
|
"""Render complete markup.
|
||||||
|
@ -185,14 +228,14 @@ class EntityRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_ents(p['text'], p['ents'],
|
rendered = [
|
||||||
p.get('title', None)) for p in parsed]
|
self.render_ents(p["text"], p["ents"], p.get("title", None)) for p in parsed
|
||||||
|
]
|
||||||
if page:
|
if page:
|
||||||
docs = ''.join([TPL_FIGURE.format(content=doc)
|
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||||
for doc in rendered])
|
|
||||||
markup = TPL_PAGE.format(content=docs)
|
markup = TPL_PAGE.format(content=docs)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = "".join(rendered)
|
||||||
if minify:
|
if minify:
|
||||||
return minify_html(markup)
|
return minify_html(markup)
|
||||||
return markup
|
return markup
|
||||||
|
@ -204,18 +247,18 @@ class EntityRenderer(object):
|
||||||
spans (list): Individual entity spans and their start, end and label.
|
spans (list): Individual entity spans and their start, end and label.
|
||||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||||
"""
|
"""
|
||||||
markup = ''
|
markup = ""
|
||||||
offset = 0
|
offset = 0
|
||||||
for span in spans:
|
for span in spans:
|
||||||
label = span['label']
|
label = span["label"]
|
||||||
start = span['start']
|
start = span["start"]
|
||||||
end = span['end']
|
end = span["end"]
|
||||||
entity = text[start:end]
|
entity = text[start:end]
|
||||||
fragments = text[offset:start].split('\n')
|
fragments = text[offset:start].split("\n")
|
||||||
for i, fragment in enumerate(fragments):
|
for i, fragment in enumerate(fragments):
|
||||||
markup += fragment
|
markup += fragment
|
||||||
if len(fragments) > 1 and i != len(fragments)-1:
|
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||||
markup += '</br>'
|
markup += "</br>"
|
||||||
if self.ents is None or label.upper() in self.ents:
|
if self.ents is None or label.upper() in self.ents:
|
||||||
color = self.colors.get(label.upper(), self.default_color)
|
color = self.colors.get(label.upper(), self.default_color)
|
||||||
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# setting explicit height and max-width: none on the SVG is required for
|
# Setting explicit height and max-width: none on the SVG is required for
|
||||||
# Jupyter to render it properly in a cell
|
# Jupyter to render it properly in a cell
|
||||||
|
|
||||||
TPL_DEP_SVG = """
|
TPL_DEP_SVG = """
|
||||||
|
|
|
@ -8,13 +8,17 @@ import inspect
|
||||||
|
|
||||||
def add_codes(err_cls):
|
def add_codes(err_cls):
|
||||||
"""Add error codes to string messages via class attribute names."""
|
"""Add error codes to string messages via class attribute names."""
|
||||||
|
|
||||||
class ErrorsWithCodes(object):
|
class ErrorsWithCodes(object):
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = getattr(err_cls, code)
|
msg = getattr(err_cls, code)
|
||||||
return '[{code}] {msg}'.format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
|
||||||
return ErrorsWithCodes()
|
return ErrorsWithCodes()
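A small sketch of the behaviour @add_codes provides; the DemoErrors class is hypothetical, for illustration only:

    @add_codes
    class DemoErrors(object):
        E001 = "Something went wrong: {detail}"

    print(DemoErrors.E001.format(detail="bad input"))
    # -> [E001] Something went wrong: bad input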
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Warnings(object):
|
class Warnings(object):
|
||||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
||||||
|
@ -260,7 +264,7 @@ class Errors(object):
|
||||||
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
||||||
"error. Are you writing to a default function argument?")
|
"error. Are you writing to a default function argument?")
|
||||||
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
||||||
"Span objects, or dicts if set to manual=True.")
|
"Span objects, or dicts if set to manual=True.")
|
||||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||||
"phrase pattern (string) but got:\n{pattern}")
|
"phrase pattern (string) but got:\n{pattern}")
|
||||||
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
||||||
|
@ -270,6 +274,19 @@ class Errors(object):
|
||||||
"NBOR_RELOP.")
|
"NBOR_RELOP.")
|
||||||
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
||||||
"have been declared in previous edges.")
|
"have been declared in previous edges.")
|
||||||
|
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of tokens to merge")
|
||||||
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
||||||
|
" can only be part of one entity, so make sure the entities you're "
|
||||||
|
"setting don't overlap.")
|
||||||
|
E104 = ("Can't find JSON schema for '{name}'.")
|
||||||
|
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
||||||
|
"Doc.json() instead.")
|
||||||
|
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||||
|
"settings: {opts}")
|
||||||
|
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||||
|
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
||||||
|
"in favor of the pipe name `sentencizer`, which does the same "
|
||||||
|
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -289,55 +306,57 @@ class TempErrors(object):
|
||||||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
class ModelsWarning(UserWarning):
|
class ModelsWarning(UserWarning):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
WARNINGS = {
|
WARNINGS = {
|
||||||
'user': UserWarning,
|
"user": UserWarning,
|
||||||
'deprecation': DeprecationWarning,
|
"deprecation": DeprecationWarning,
|
||||||
'models': ModelsWarning,
|
"models": ModelsWarning,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_types(arg):
|
def _get_warn_types(arg):
|
||||||
if arg == '': # don't show any warnings
|
if arg == "": # don't show any warnings
|
||||||
return []
|
return []
|
||||||
if not arg or arg == 'all': # show all available warnings
|
if not arg or arg == "all": # show all available warnings
|
||||||
return WARNINGS.keys()
|
return WARNINGS.keys()
|
||||||
return [w_type.strip() for w_type in arg.split(',')
|
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
||||||
if w_type.strip() in WARNINGS]
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_excl(arg):
|
def _get_warn_excl(arg):
|
||||||
if not arg:
|
if not arg:
|
||||||
return []
|
return []
|
||||||
return [w_id.strip() for w_id in arg.split(',')]
|
return [w_id.strip() for w_id in arg.split(",")]
|
||||||
|
|
||||||
|
|
||||||
SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER')
|
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
|
||||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
|
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
|
||||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE'))
|
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
||||||
|
|
||||||
|
|
||||||
def user_warning(message):
|
def user_warning(message):
|
||||||
_warn(message, 'user')
|
_warn(message, "user")
|
||||||
|
|
||||||
|
|
||||||
def deprecation_warning(message):
|
def deprecation_warning(message):
|
||||||
_warn(message, 'deprecation')
|
_warn(message, "deprecation")
|
||||||
|
|
||||||
|
|
||||||
def models_warning(message):
|
def models_warning(message):
|
||||||
_warn(message, 'models')
|
_warn(message, "models")
|
||||||
|
|
||||||
|
|
||||||
def _warn(message, warn_type='user'):
|
def _warn(message, warn_type="user"):
|
||||||
"""
|
"""
|
||||||
message (unicode): The message to display.
|
message (unicode): The message to display.
|
||||||
category (Warning): The Warning to show.
|
category (Warning): The Warning to show.
|
||||||
"""
|
"""
|
||||||
w_id = message.split('[', 1)[1].split(']', 1)[0] # get ID from string
|
w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
|
||||||
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
||||||
category = WARNINGS[warn_type]
|
category = WARNINGS[warn_type]
|
||||||
stack = inspect.stack()[-1]
|
stack = inspect.stack()[-1]
|
||||||
|
|
|
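The module-level reads above mean warning behaviour is configured through environment variables before spaCy is imported. A hedged sketch (SPACY_WARNING_FILTER's exact use isn't visible in this hunk and is assumed to be a warnings filter action):

import os

os.environ["SPACY_WARNING_TYPES"] = "user,models"  # only emit these warning types
os.environ["SPACY_WARNING_IGNORE"] = "W001"        # silence a specific warning ID
os.environ["SPACY_WARNING_FILTER"] = "once"        # assumption: a warnings filter action

import spacy  # noqa: E402 - values are read in spacy.errors at import time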
@ -21,294 +21,272 @@ GLOSSARY = {
|
||||||
# POS tags
|
# POS tags
|
||||||
# Universal POS Tags
|
# Universal POS Tags
|
||||||
# http://universaldependencies.org/u/pos/
|
# http://universaldependencies.org/u/pos/
|
||||||
|
"ADJ": "adjective",
|
||||||
'ADJ': 'adjective',
|
"ADP": "adposition",
|
||||||
'ADP': 'adposition',
|
"ADV": "adverb",
|
||||||
'ADV': 'adverb',
|
"AUX": "auxiliary",
|
||||||
'AUX': 'auxiliary',
|
"CONJ": "conjunction",
|
||||||
'CONJ': 'conjunction',
|
"CCONJ": "coordinating conjunction",
|
||||||
'CCONJ': 'coordinating conjunction',
|
"DET": "determiner",
|
||||||
'DET': 'determiner',
|
"INTJ": "interjection",
|
||||||
'INTJ': 'interjection',
|
"NOUN": "noun",
|
||||||
'NOUN': 'noun',
|
"NUM": "numeral",
|
||||||
'NUM': 'numeral',
|
"PART": "particle",
|
||||||
'PART': 'particle',
|
"PRON": "pronoun",
|
||||||
'PRON': 'pronoun',
|
"PROPN": "proper noun",
|
||||||
'PROPN': 'proper noun',
|
"PUNCT": "punctuation",
|
||||||
'PUNCT': 'punctuation',
|
"SCONJ": "subordinating conjunction",
|
||||||
'SCONJ': 'subordinating conjunction',
|
"SYM": "symbol",
|
||||||
'SYM': 'symbol',
|
"VERB": "verb",
|
||||||
'VERB': 'verb',
|
"X": "other",
|
||||||
'X': 'other',
|
"EOL": "end of line",
|
||||||
'EOL': 'end of line',
|
"SPACE": "space",
|
||||||
'SPACE': 'space',
|
|
||||||
|
|
||||||
|
|
||||||
# POS tags (English)
|
# POS tags (English)
|
||||||
# OntoNotes 5 / Penn Treebank
|
# OntoNotes 5 / Penn Treebank
|
||||||
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
||||||
|
".": "punctuation mark, sentence closer",
|
||||||
'.': 'punctuation mark, sentence closer',
|
",": "punctuation mark, comma",
|
||||||
',': 'punctuation mark, comma',
|
"-LRB-": "left round bracket",
|
||||||
'-LRB-': 'left round bracket',
|
"-RRB-": "right round bracket",
|
||||||
'-RRB-': 'right round bracket',
|
"``": "opening quotation mark",
|
||||||
'``': 'opening quotation mark',
|
'""': "closing quotation mark",
|
||||||
'""': 'closing quotation mark',
|
"''": "closing quotation mark",
|
||||||
"''": 'closing quotation mark',
|
":": "punctuation mark, colon or ellipsis",
|
||||||
':': 'punctuation mark, colon or ellipsis',
|
"$": "symbol, currency",
|
||||||
'$': 'symbol, currency',
|
"#": "symbol, number sign",
|
||||||
'#': 'symbol, number sign',
|
"AFX": "affix",
|
||||||
'AFX': 'affix',
|
"CC": "conjunction, coordinating",
|
||||||
'CC': 'conjunction, coordinating',
|
"CD": "cardinal number",
|
||||||
'CD': 'cardinal number',
|
"DT": "determiner",
|
||||||
'DT': 'determiner',
|
"EX": "existential there",
|
||||||
'EX': 'existential there',
|
"FW": "foreign word",
|
||||||
'FW': 'foreign word',
|
"HYPH": "punctuation mark, hyphen",
|
||||||
'HYPH': 'punctuation mark, hyphen',
|
"IN": "conjunction, subordinating or preposition",
|
||||||
'IN': 'conjunction, subordinating or preposition',
|
"JJ": "adjective",
|
||||||
'JJ': 'adjective',
|
"JJR": "adjective, comparative",
|
||||||
'JJR': 'adjective, comparative',
|
"JJS": "adjective, superlative",
|
||||||
'JJS': 'adjective, superlative',
|
"LS": "list item marker",
|
||||||
'LS': 'list item marker',
|
"MD": "verb, modal auxiliary",
|
||||||
'MD': 'verb, modal auxiliary',
|
"NIL": "missing tag",
|
||||||
'NIL': 'missing tag',
|
"NN": "noun, singular or mass",
|
||||||
'NN': 'noun, singular or mass',
|
"NNP": "noun, proper singular",
|
||||||
'NNP': 'noun, proper singular',
|
"NNPS": "noun, proper plural",
|
||||||
'NNPS': 'noun, proper plural',
|
"NNS": "noun, plural",
|
||||||
'NNS': 'noun, plural',
|
"PDT": "predeterminer",
|
||||||
'PDT': 'predeterminer',
|
"POS": "possessive ending",
|
||||||
'POS': 'possessive ending',
|
"PRP": "pronoun, personal",
|
||||||
'PRP': 'pronoun, personal',
|
"PRP$": "pronoun, possessive",
|
||||||
'PRP$': 'pronoun, possessive',
|
"RB": "adverb",
|
||||||
'RB': 'adverb',
|
"RBR": "adverb, comparative",
|
||||||
'RBR': 'adverb, comparative',
|
"RBS": "adverb, superlative",
|
||||||
'RBS': 'adverb, superlative',
|
"RP": "adverb, particle",
|
||||||
'RP': 'adverb, particle',
|
"TO": "infinitival to",
|
||||||
'TO': 'infinitival to',
|
"UH": "interjection",
|
||||||
'UH': 'interjection',
|
"VB": "verb, base form",
|
||||||
'VB': 'verb, base form',
|
"VBD": "verb, past tense",
|
||||||
'VBD': 'verb, past tense',
|
"VBG": "verb, gerund or present participle",
|
||||||
'VBG': 'verb, gerund or present participle',
|
"VBN": "verb, past participle",
|
||||||
'VBN': 'verb, past participle',
|
"VBP": "verb, non-3rd person singular present",
|
||||||
'VBP': 'verb, non-3rd person singular present',
|
"VBZ": "verb, 3rd person singular present",
|
||||||
'VBZ': 'verb, 3rd person singular present',
|
"WDT": "wh-determiner",
|
||||||
'WDT': 'wh-determiner',
|
"WP": "wh-pronoun, personal",
|
||||||
'WP': 'wh-pronoun, personal',
|
"WP$": "wh-pronoun, possessive",
|
||||||
'WP$': 'wh-pronoun, possessive',
|
"WRB": "wh-adverb",
|
||||||
'WRB': 'wh-adverb',
|
"SP": "space",
|
||||||
'SP': 'space',
|
"ADD": "email",
|
||||||
'ADD': 'email',
|
"NFP": "superfluous punctuation",
|
||||||
'NFP': 'superfluous punctuation',
|
"GW": "additional word in multi-word expression",
|
||||||
'GW': 'additional word in multi-word expression',
|
"XX": "unknown",
|
||||||
'XX': 'unknown',
|
"BES": 'auxiliary "be"',
|
||||||
'BES': 'auxiliary "be"',
|
"HVS": 'forms of "have"',
|
||||||
'HVS': 'forms of "have"',
|
|
||||||
|
|
||||||
|
|
||||||
# POS Tags (German)
|
# POS Tags (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
|
"$(": "other sentence-internal punctuation mark",
|
||||||
'$(': 'other sentence-internal punctuation mark',
|
"$,": "comma",
|
||||||
'$,': 'comma',
|
"$.": "sentence-final punctuation mark",
|
||||||
'$.': 'sentence-final punctuation mark',
|
"ADJA": "adjective, attributive",
|
||||||
'ADJA': 'adjective, attributive',
|
"ADJD": "adjective, adverbial or predicative",
|
||||||
'ADJD': 'adjective, adverbial or predicative',
|
"APPO": "postposition",
|
||||||
'APPO': 'postposition',
|
"APPR": "preposition; circumposition left",
|
||||||
'APPR': 'preposition; circumposition left',
|
"APPRART": "preposition with article",
|
||||||
'APPRART': 'preposition with article',
|
"APZR": "circumposition right",
|
||||||
'APZR': 'circumposition right',
|
"ART": "definite or indefinite article",
|
||||||
'ART': 'definite or indefinite article',
|
"CARD": "cardinal number",
|
||||||
'CARD': 'cardinal number',
|
"FM": "foreign language material",
|
||||||
'FM': 'foreign language material',
|
"ITJ": "interjection",
|
||||||
'ITJ': 'interjection',
|
"KOKOM": "comparative conjunction",
|
||||||
'KOKOM': 'comparative conjunction',
|
"KON": "coordinate conjunction",
|
||||||
'KON': 'coordinate conjunction',
|
"KOUI": 'subordinate conjunction with "zu" and infinitive',
|
||||||
'KOUI': 'subordinate conjunction with "zu" and infinitive',
|
"KOUS": "subordinate conjunction with sentence",
|
||||||
'KOUS': 'subordinate conjunction with sentence',
|
"NE": "proper noun",
|
||||||
'NE': 'proper noun',
|
"NNE": "proper noun",
|
||||||
'NNE': 'proper noun',
|
"PAV": "pronominal adverb",
|
||||||
'PAV': 'pronominal adverb',
|
"PROAV": "pronominal adverb",
|
||||||
'PROAV': 'pronominal adverb',
|
"PDAT": "attributive demonstrative pronoun",
|
||||||
'PDAT': 'attributive demonstrative pronoun',
|
"PDS": "substituting demonstrative pronoun",
|
||||||
'PDS': 'substituting demonstrative pronoun',
|
"PIAT": "attributive indefinite pronoun without determiner",
|
||||||
'PIAT': 'attributive indefinite pronoun without determiner',
|
"PIDAT": "attributive indefinite pronoun with determiner",
|
||||||
'PIDAT': 'attributive indefinite pronoun with determiner',
|
"PIS": "substituting indefinite pronoun",
|
||||||
'PIS': 'substituting indefinite pronoun',
|
"PPER": "non-reflexive personal pronoun",
|
||||||
'PPER': 'non-reflexive personal pronoun',
|
"PPOSAT": "attributive possessive pronoun",
|
||||||
'PPOSAT': 'attributive possessive pronoun',
|
"PPOSS": "substituting possessive pronoun",
|
||||||
'PPOSS': 'substituting possessive pronoun',
|
"PRELAT": "attributive relative pronoun",
|
||||||
'PRELAT': 'attributive relative pronoun',
|
"PRELS": "substituting relative pronoun",
|
||||||
'PRELS': 'substituting relative pronoun',
|
"PRF": "reflexive personal pronoun",
|
||||||
'PRF': 'reflexive personal pronoun',
|
"PTKA": "particle with adjective or adverb",
|
||||||
'PTKA': 'particle with adjective or adverb',
|
"PTKANT": "answer particle",
|
||||||
'PTKANT': 'answer particle',
|
"PTKNEG": "negative particle",
|
||||||
'PTKNEG': 'negative particle',
|
"PTKVZ": "separable verbal particle",
|
||||||
'PTKVZ': 'separable verbal particle',
|
"PTKZU": '"zu" before infinitive',
|
||||||
'PTKZU': '"zu" before infinitive',
|
"PWAT": "attributive interrogative pronoun",
|
||||||
'PWAT': 'attributive interrogative pronoun',
|
"PWAV": "adverbial interrogative or relative pronoun",
|
||||||
'PWAV': 'adverbial interrogative or relative pronoun',
|
"PWS": "substituting interrogative pronoun",
|
||||||
'PWS': 'substituting interrogative pronoun',
|
"TRUNC": "word remnant",
|
||||||
'TRUNC': 'word remnant',
|
"VAFIN": "finite verb, auxiliary",
|
||||||
'VAFIN': 'finite verb, auxiliary',
|
"VAIMP": "imperative, auxiliary",
|
||||||
'VAIMP': 'imperative, auxiliary',
|
"VAINF": "infinitive, auxiliary",
|
||||||
'VAINF': 'infinitive, auxiliary',
|
"VAPP": "perfect participle, auxiliary",
|
||||||
'VAPP': 'perfect participle, auxiliary',
|
"VMFIN": "finite verb, modal",
|
||||||
'VMFIN': 'finite verb, modal',
|
"VMINF": "infinitive, modal",
|
||||||
'VMINF': 'infinitive, modal',
|
"VMPP": "perfect participle, modal",
|
||||||
'VMPP': 'perfect participle, modal',
|
"VVFIN": "finite verb, full",
|
||||||
'VVFIN': 'finite verb, full',
|
"VVIMP": "imperative, full",
|
||||||
'VVIMP': 'imperative, full',
|
"VVINF": "infinitive, full",
|
||||||
'VVINF': 'infinitive, full',
|
"VVIZU": 'infinitive with "zu", full',
|
||||||
'VVIZU': 'infinitive with "zu", full',
|
"VVPP": "perfect participle, full",
|
||||||
'VVPP': 'perfect participle, full',
|
"XY": "non-word containing non-letter",
|
||||||
'XY': 'non-word containing non-letter',
|
|
||||||
|
|
||||||
|
|
||||||
# Noun chunks
|
# Noun chunks
|
||||||
|
"NP": "noun phrase",
|
||||||
'NP': 'noun phrase',
|
"PP": "prepositional phrase",
|
||||||
'PP': 'prepositional phrase',
|
"VP": "verb phrase",
|
||||||
'VP': 'verb phrase',
|
"ADVP": "adverb phrase",
|
||||||
'ADVP': 'adverb phrase',
|
"ADJP": "adjective phrase",
|
||||||
'ADJP': 'adjective phrase',
|
"SBAR": "subordinating conjunction",
|
||||||
'SBAR': 'subordinating conjunction',
|
"PRT": "particle",
|
||||||
'PRT': 'particle',
|
"PNP": "prepositional noun phrase",
|
||||||
'PNP': 'prepositional noun phrase',
|
|
||||||
|
|
||||||
|
|
||||||
# Dependency Labels (English)
|
# Dependency Labels (English)
|
||||||
# ClearNLP / Universal Dependencies
|
# ClearNLP / Universal Dependencies
|
||||||
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
||||||
|
"acomp": "adjectival complement",
|
||||||
'acomp': 'adjectival complement',
|
"advcl": "adverbial clause modifier",
|
||||||
'advcl': 'adverbial clause modifier',
|
"advmod": "adverbial modifier",
|
||||||
'advmod': 'adverbial modifier',
|
"agent": "agent",
|
||||||
'agent': 'agent',
|
"amod": "adjectival modifier",
|
||||||
'amod': 'adjectival modifier',
|
"appos": "appositional modifier",
|
||||||
'appos': 'appositional modifier',
|
"attr": "attribute",
|
||||||
'attr': 'attribute',
|
"aux": "auxiliary",
|
||||||
'aux': 'auxiliary',
|
"auxpass": "auxiliary (passive)",
|
||||||
'auxpass': 'auxiliary (passive)',
|
"cc": "coordinating conjunction",
|
||||||
'cc': 'coordinating conjunction',
|
"ccomp": "clausal complement",
|
||||||
'ccomp': 'clausal complement',
|
"complm": "complementizer",
|
||||||
'complm': 'complementizer',
|
"conj": "conjunct",
|
||||||
'conj': 'conjunct',
|
"cop": "copula",
|
||||||
'cop': 'copula',
|
"csubj": "clausal subject",
|
||||||
'csubj': 'clausal subject',
|
"csubjpass": "clausal subject (passive)",
|
||||||
'csubjpass': 'clausal subject (passive)',
|
"dep": "unclassified dependent",
|
||||||
'dep': 'unclassified dependent',
|
"det": "determiner",
|
||||||
'det': 'determiner',
|
"dobj": "direct object",
|
||||||
'dobj': 'direct object',
|
"expl": "expletive",
|
||||||
'expl': 'expletive',
|
"hmod": "modifier in hyphenation",
|
||||||
'hmod': 'modifier in hyphenation',
|
"hyph": "hyphen",
|
||||||
'hyph': 'hyphen',
|
"infmod": "infinitival modifier",
|
||||||
'infmod': 'infinitival modifier',
|
"intj": "interjection",
|
||||||
'intj': 'interjection',
|
"iobj": "indirect object",
|
||||||
'iobj': 'indirect object',
|
"mark": "marker",
|
||||||
'mark': 'marker',
|
"meta": "meta modifier",
|
||||||
'meta': 'meta modifier',
|
"neg": "negation modifier",
|
||||||
'neg': 'negation modifier',
|
"nmod": "modifier of nominal",
|
||||||
'nmod': 'modifier of nominal',
|
"nn": "noun compound modifier",
|
||||||
'nn': 'noun compound modifier',
|
"npadvmod": "noun phrase as adverbial modifier",
|
||||||
'npadvmod': 'noun phrase as adverbial modifier',
|
"nsubj": "nominal subject",
|
||||||
'nsubj': 'nominal subject',
|
"nsubjpass": "nominal subject (passive)",
|
||||||
'nsubjpass': 'nominal subject (passive)',
|
"num": "number modifier",
|
||||||
'num': 'number modifier',
|
"number": "number compound modifier",
|
||||||
'number': 'number compound modifier',
|
"oprd": "object predicate",
|
||||||
'oprd': 'object predicate',
|
"obj": "object",
|
||||||
'obj': 'object',
|
"obl": "oblique nominal",
|
||||||
'obl': 'oblique nominal',
|
"parataxis": "parataxis",
|
||||||
'parataxis': 'parataxis',
|
"partmod": "participal modifier",
|
||||||
'partmod': 'participal modifier',
|
"pcomp": "complement of preposition",
|
||||||
'pcomp': 'complement of preposition',
|
"pobj": "object of preposition",
|
||||||
'pobj': 'object of preposition',
|
"poss": "possession modifier",
|
||||||
'poss': 'possession modifier',
|
"possessive": "possessive modifier",
|
||||||
'possessive': 'possessive modifier',
|
"preconj": "pre-correlative conjunction",
|
||||||
'preconj': 'pre-correlative conjunction',
|
"prep": "prepositional modifier",
|
||||||
'prep': 'prepositional modifier',
|
"prt": "particle",
|
||||||
'prt': 'particle',
|
"punct": "punctuation",
|
||||||
'punct': 'punctuation',
|
"quantmod": "modifier of quantifier",
|
||||||
'quantmod': 'modifier of quantifier',
|
"rcmod": "relative clause modifier",
|
||||||
'rcmod': 'relative clause modifier',
|
"root": "root",
|
||||||
'root': 'root',
|
"xcomp": "open clausal complement",
|
||||||
'xcomp': 'open clausal complement',
|
|
||||||
|
|
||||||
|
|
||||||
# Dependency labels (German)
|
# Dependency labels (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
# currently missing: 'cc' (comparative complement) because of conflict
|
# currently missing: 'cc' (comparative complement) because of conflict
|
||||||
# with English labels
|
# with English labels
|
||||||
|
"ac": "adpositional case marker",
|
||||||
'ac': 'adpositional case marker',
|
"adc": "adjective component",
|
||||||
'adc': 'adjective component',
|
"ag": "genitive attribute",
|
||||||
'ag': 'genitive attribute',
|
"ams": "measure argument of adjective",
|
||||||
'ams': 'measure argument of adjective',
|
"app": "apposition",
|
||||||
'app': 'apposition',
|
"avc": "adverbial phrase component",
|
||||||
'avc': 'adverbial phrase component',
|
"cd": "coordinating conjunction",
|
||||||
'cd': 'coordinating conjunction',
|
"cj": "conjunct",
|
||||||
'cj': 'conjunct',
|
"cm": "comparative conjunction",
|
||||||
'cm': 'comparative conjunction',
|
"cp": "complementizer",
|
||||||
'cp': 'complementizer',
|
"cvc": "collocational verb construction",
|
||||||
'cvc': 'collocational verb construction',
|
"da": "dative",
|
||||||
'da': 'dative',
|
"dh": "discourse-level head",
|
||||||
'dh': 'discourse-level head',
|
"dm": "discourse marker",
|
||||||
'dm': 'discourse marker',
|
"ep": "expletive es",
|
||||||
'ep': 'expletive es',
|
"hd": "head",
|
||||||
'hd': 'head',
|
"ju": "junctor",
|
||||||
'ju': 'junctor',
|
"mnr": "postnominal modifier",
|
||||||
'mnr': 'postnominal modifier',
|
"mo": "modifier",
|
||||||
'mo': 'modifier',
|
"ng": "negation",
|
||||||
'ng': 'negation',
|
"nk": "noun kernel element",
|
||||||
'nk': 'noun kernel element',
|
"nmc": "numerical component",
|
||||||
'nmc': 'numerical component',
|
"oa": "accusative object",
|
||||||
'oa': 'accusative object',
|
"oc": "clausal object",
|
||||||
'oc': 'clausal object',
|
"og": "genitive object",
|
||||||
'og': 'genitive object',
|
"op": "prepositional object",
|
||||||
'op': 'prepositional object',
|
"par": "parenthetical element",
|
||||||
'par': 'parenthetical element',
|
"pd": "predicate",
|
||||||
'pd': 'predicate',
|
"pg": "phrasal genitive",
|
||||||
'pg': 'phrasal genitive',
|
"ph": "placeholder",
|
||||||
'ph': 'placeholder',
|
"pm": "morphological particle",
|
||||||
'pm': 'morphological particle',
|
"pnc": "proper noun component",
|
||||||
'pnc': 'proper noun component',
|
"rc": "relative clause",
|
||||||
'rc': 'relative clause',
|
"re": "repeated element",
|
||||||
're': 'repeated element',
|
"rs": "reported speech",
|
||||||
'rs': 'reported speech',
|
"sb": "subject",
|
||||||
'sb': 'subject',
|
|
||||||
|
|
||||||
|
|
||||||
# Named Entity Recognition
|
# Named Entity Recognition
|
||||||
# OntoNotes 5
|
# OntoNotes 5
|
||||||
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
||||||
|
"PERSON": "People, including fictional",
|
||||||
'PERSON': 'People, including fictional',
|
"NORP": "Nationalities or religious or political groups",
|
||||||
'NORP': 'Nationalities or religious or political groups',
|
"FACILITY": "Buildings, airports, highways, bridges, etc.",
|
||||||
'FACILITY': 'Buildings, airports, highways, bridges, etc.',
|
"FAC": "Buildings, airports, highways, bridges, etc.",
|
||||||
'ORG': 'Companies, agencies, institutions, etc.',
|
"ORG": "Companies, agencies, institutions, etc.",
|
||||||
'GPE': 'Countries, cities, states',
|
"GPE": "Countries, cities, states",
|
||||||
'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
|
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
|
||||||
'PRODUCT': 'Objects, vehicles, foods, etc. (not services)',
|
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
|
||||||
'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
|
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
|
||||||
'WORK_OF_ART': 'Titles of books, songs, etc.',
|
"WORK_OF_ART": "Titles of books, songs, etc.",
|
||||||
'LAW': 'Named documents made into laws.',
|
"LAW": "Named documents made into laws.",
|
||||||
'LANGUAGE': 'Any named language',
|
"LANGUAGE": "Any named language",
|
||||||
'DATE': 'Absolute or relative dates or periods',
|
"DATE": "Absolute or relative dates or periods",
|
||||||
'TIME': 'Times smaller than a day',
|
"TIME": "Times smaller than a day",
|
||||||
'PERCENT': 'Percentage, including "%"',
|
"PERCENT": 'Percentage, including "%"',
|
||||||
'MONEY': 'Monetary values, including unit',
|
"MONEY": "Monetary values, including unit",
|
||||||
'QUANTITY': 'Measurements, as of weight or distance',
|
"QUANTITY": "Measurements, as of weight or distance",
|
||||||
'ORDINAL': '"first", "second", etc.',
|
"ORDINAL": '"first", "second", etc.',
|
||||||
'CARDINAL': 'Numerals that do not fall under another type',
|
"CARDINAL": "Numerals that do not fall under another type",
|
||||||
|
|
||||||
|
|
||||||
# Named Entity Recognition
|
# Named Entity Recognition
|
||||||
# Wikipedia
|
# Wikipedia
|
||||||
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
||||||
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
||||||
|
"PER": "Named person or family.",
|
||||||
'PER': 'Named person or family.',
|
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
|
||||||
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
|
|
||||||
'products or works of art'),
|
|
||||||
}
|
}
|
||||||
|
|
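The GLOSSARY above is what backs spacy.explain(), which maps a tag, dependency label or entity type to its description. A short usage sketch:

import spacy

print(spacy.explain("ADJ"))   # adjective
print(spacy.explain("dobj"))  # direct object
print(spacy.explain("GPE"))   # Countries, cities, states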
113
spacy/gold.pyx
|
@ -15,7 +15,7 @@ import json
|
||||||
|
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
from . import _align
|
from . import _align
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
|
@ -172,7 +172,7 @@ class GoldCorpus(object):
|
||||||
def dev_tuples(self):
|
def dev_tuples(self):
|
||||||
locs = (self.tmp_dir / 'dev').iterdir()
|
locs = (self.tmp_dir / 'dev').iterdir()
|
||||||
yield from self.read_tuples(locs, limit=self.limit)
|
yield from self.read_tuples(locs, limit=self.limit)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def train_tuples(self):
|
def train_tuples(self):
|
||||||
locs = (self.tmp_dir / 'train').iterdir()
|
locs = (self.tmp_dir / 'train').iterdir()
|
||||||
|
@ -271,6 +271,53 @@ def _corrupt(c, noise_level):
|
||||||
return c.lower()
|
return c.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def read_json_object(json_corpus_section):
|
||||||
|
"""Take a list of JSON-formatted documents (e.g. from an already loaded
|
||||||
|
training data file) and yield tuples in the GoldParse format.
|
||||||
|
|
||||||
|
json_corpus_section (list): The data.
|
||||||
|
YIELDS (tuple): The reformatted data.
|
||||||
|
"""
|
||||||
|
for json_doc in json_corpus_section:
|
||||||
|
tuple_doc = json_to_tuple(json_doc)
|
||||||
|
for tuple_paragraph in tuple_doc:
|
||||||
|
yield tuple_paragraph
|
||||||
|
|
||||||
|
|
||||||
|
def json_to_tuple(doc):
|
||||||
|
"""Convert an item in the JSON-formatted training data to the tuple format
|
||||||
|
used by GoldParse.
|
||||||
|
|
||||||
|
doc (dict): One entry in the training data.
|
||||||
|
YIELDS (tuple): The reformatted data.
|
||||||
|
"""
|
||||||
|
paragraphs = []
|
||||||
|
for paragraph in doc['paragraphs']:
|
||||||
|
sents = []
|
||||||
|
for sent in paragraph['sentences']:
|
||||||
|
words = []
|
||||||
|
ids = []
|
||||||
|
tags = []
|
||||||
|
heads = []
|
||||||
|
labels = []
|
||||||
|
ner = []
|
||||||
|
for i, token in enumerate(sent['tokens']):
|
||||||
|
words.append(token['orth'])
|
||||||
|
ids.append(i)
|
||||||
|
tags.append(token.get('tag', '-'))
|
||||||
|
heads.append(token.get('head', 0) + i)
|
||||||
|
labels.append(token.get('dep', ''))
|
||||||
|
# Ensure ROOT label is case-insensitive
|
||||||
|
if labels[-1].lower() == 'root':
|
||||||
|
labels[-1] = 'ROOT'
|
||||||
|
ner.append(token.get('ner', '-'))
|
||||||
|
sents.append([
|
||||||
|
[ids, words, tags, heads, labels, ner],
|
||||||
|
sent.get('brackets', [])])
|
||||||
|
if sents:
|
||||||
|
yield [paragraph.get('raw', None), sents]
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc, docs_filter=None, limit=None):
|
def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
loc = util.ensure_path(loc)
|
loc = util.ensure_path(loc)
|
||||||
if loc.is_dir():
|
if loc.is_dir():
|
||||||
|
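A hedged usage sketch for the helpers added above, with a minimal document in spaCy's JSON training format (field names follow json_to_tuple; the import path assumes these stay exposed on spacy.gold):

from spacy.gold import read_json_object

json_doc = {
    "id": 0,
    "paragraphs": [{
        "raw": "I like London.",
        "sentences": [{
            "tokens": [
                {"orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                {"orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                {"orth": "London", "tag": "NNP", "head": -1, "dep": "dobj", "ner": "U-GPE"},
                {"orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"},
            ],
            "brackets": [],
        }],
    }],
}

for raw_text, sents in read_json_object([json_doc]):
    (ids, words, tags, heads, labels, ner), brackets = sents[0]
    print(raw_text)      # "I like London."
    print(words, heads)  # heads have been converted to absolute token indices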
@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
for doc in _json_iterate(loc):
|
for doc in _json_iterate(loc):
|
||||||
if docs_filter is not None and not docs_filter(doc):
|
if docs_filter is not None and not docs_filter(doc):
|
||||||
continue
|
continue
|
||||||
paragraphs = []
|
for json_tuple in json_to_tuple(doc):
|
||||||
for paragraph in doc['paragraphs']:
|
yield json_tuple
|
||||||
sents = []
|
|
||||||
for sent in paragraph['sentences']:
|
|
||||||
words = []
|
|
||||||
ids = []
|
|
||||||
tags = []
|
|
||||||
heads = []
|
|
||||||
labels = []
|
|
||||||
ner = []
|
|
||||||
for i, token in enumerate(sent['tokens']):
|
|
||||||
words.append(token['orth'])
|
|
||||||
ids.append(i)
|
|
||||||
tags.append(token.get('tag', '-'))
|
|
||||||
heads.append(token.get('head', 0) + i)
|
|
||||||
labels.append(token.get('dep', ''))
|
|
||||||
# Ensure ROOT label is case-insensitive
|
|
||||||
if labels[-1].lower() == 'root':
|
|
||||||
labels[-1] = 'ROOT'
|
|
||||||
ner.append(token.get('ner', '-'))
|
|
||||||
sents.append([
|
|
||||||
[ids, words, tags, heads, labels, ner],
|
|
||||||
sent.get('brackets', [])])
|
|
||||||
if sents:
|
|
||||||
yield [paragraph.get('raw', None), sents]
|
|
||||||
|
|
||||||
|
|
||||||
def _json_iterate(loc):
|
def _json_iterate(loc):
|
||||||
|
@ -573,32 +597,19 @@ cdef class GoldParse:
|
||||||
self.c.sent_start[i] = 0
|
self.c.sent_start[i] = 0
|
||||||
|
|
||||||
|
|
||||||
def docs_to_json(id, docs):
|
def docs_to_json(docs, underscore=None):
|
||||||
'''Convert a list of Doc objects into the JSON-serializable format used by
|
"""Convert a list of Doc objects into the JSON-serializable format used by
|
||||||
the spacy train command. Each Doc in the list will be interpreted as a
|
the spacy train command.
|
||||||
paragraph.
|
|
||||||
'''
|
docs (iterable / Doc): The Doc object(s) to convert.
|
||||||
|
underscore (list): Optional list of string names of custom doc._.
|
||||||
|
attributes. Attribute values need to be JSON-serializable. Values will
|
||||||
|
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
|
||||||
|
RETURNS (list): The data in spaCy's JSON format.
|
||||||
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
json_doc = {'id': id, 'paragraphs': []}
|
return [doc.to_json(underscore=underscore) for doc in docs]
|
||||||
for i, doc in enumerate(docs):
|
|
||||||
json_para = {'raw': doc.text, 'sentences': []}
|
|
||||||
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
|
||||||
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
|
|
||||||
for j, sent in enumerate(doc.sents):
|
|
||||||
json_sent = {'tokens': [], 'brackets': []}
|
|
||||||
for token in sent:
|
|
||||||
json_token = {"id": token.i, "orth": token.text}
|
|
||||||
if doc.is_tagged:
|
|
||||||
json_token['tag'] = token.tag_
|
|
||||||
if doc.is_parsed:
|
|
||||||
json_token['head'] = token.head.i-token.i
|
|
||||||
json_token['dep'] = token.dep_
|
|
||||||
json_token['ner'] = biluo_tags[token.i]
|
|
||||||
json_sent['tokens'].append(json_token)
|
|
||||||
json_para['sentences'].append(json_sent)
|
|
||||||
json_doc['paragraphs'].append(json_para)
|
|
||||||
return json_doc
|
|
||||||
|
|
||||||
|
|
||||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||||
|
|
|
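A hedged usage sketch for the new docs_to_json() signature above (the model name is an assumption; any pipeline with a tagger and parser would do):

import spacy
from spacy.gold import docs_to_json

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")

json_data = docs_to_json(doc)  # a single Doc or an iterable of Docs is accepted
print(len(json_data))          # one JSON-serializable dict per Doc, via Doc.to_json()
# docs_to_json(doc, underscore=["my_attr"]) would also serialize a registered doc._.my_attr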
@ -16,16 +16,18 @@ from ...util import update_exc, add_lookups
|
||||||
class ArabicDefaults(Language.Defaults):
|
class ArabicDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: 'ar'
|
lex_attr_getters[LANG] = lambda text: "ar"
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
class Arabic(Language):
|
class Arabic(Language):
|
||||||
lang = 'ar'
|
lang = "ar"
|
||||||
Defaults = ArabicDefaults
|
Defaults = ArabicDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Arabic']
|
__all__ = ["Arabic"]
|
||||||
|
|
|
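The Language subclass above is what spacy.blank("ar") instantiates, pulling in the defaults (stop words, suffixes, tokenizer exceptions). A short sketch using one of the example sentences from this commit:

import spacy

nlp = spacy.blank("ar")  # equivalent to instantiating Arabic()
doc = nlp("هل بالإمكان أن نلتقي غدا؟")
print([token.text for token in doc])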
@ -10,11 +10,11 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
|
"نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
|
||||||
"أين تقع دمشق ؟"
|
"أين تقع دمشق ؟",
|
||||||
"كيف حالك ؟",
|
"كيف حالك ؟",
|
||||||
"هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
|
"هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
|
||||||
"ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
|
"ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
|
||||||
"هل بالإمكان أن نلتقي غدا؟",
|
"هل بالإمكان أن نلتقي غدا؟",
|
||||||
"هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
|
"هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
|
||||||
"كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم"
|
"كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم",
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = set("""
|
_num_words = set(
|
||||||
|
"""
|
||||||
صفر
|
صفر
|
||||||
واحد
|
واحد
|
||||||
إثنان
|
إثنان
|
||||||
|
@ -52,9 +53,11 @@ _num_words = set("""
|
||||||
مليون
|
مليون
|
||||||
مليار
|
مليار
|
||||||
مليارات
|
مليارات
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
_ordinal_words = set("""
|
_ordinal_words = set(
|
||||||
|
"""
|
||||||
اول
|
اول
|
||||||
أول
|
أول
|
||||||
حاد
|
حاد
|
||||||
|
@ -69,20 +72,21 @@ _ordinal_words = set("""
|
||||||
ثامن
|
ثامن
|
||||||
تاسع
|
تاسع
|
||||||
عاشر
|
عاشر
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
"""
|
"""
|
||||||
check if text resembles a number
|
Check if text resembles a number
|
||||||
"""
|
"""
|
||||||
if text.startswith(('+', '-', '±', '~')):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
|
@ -92,6 +96,4 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
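Expected behaviour of the LIKE_NUM getter above, based on the logic in this hunk (the import path is assumed from the file being edited):

from spacy.lang.ar.lex_attrs import like_num

print(like_num("10,000"))  # True - separators are stripped before isdigit()
print(like_num("3/4"))     # True - simple fractions are recognised
print(like_num("مليون"))   # True - listed Arabic number words count
print(like_num("كتاب"))    # False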
@ -1,15 +1,20 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
_suffixes = (
|
||||||
[r'(?<=[0-9])\+',
|
LIST_PUNCT
|
||||||
# Arabic is written from Right-To-Left
|
+ LIST_ELLIPSES
|
||||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
+ LIST_QUOTES
|
||||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
+ [
|
||||||
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)])
|
r"(?<=[0-9])\+",
|
||||||
|
# Arabic is written from Right-To-Left
|
||||||
|
r"(?<=[0-9])(?:{})".format(CURRENCY),
|
||||||
|
r"(?<=[0-9])(?:{})".format(UNITS),
|
||||||
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
|
|
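A hedged sketch of how a suffix list like the one above is turned into the regex the tokenizer applies, using spaCy's standard helper (the import path for the Arabic rules is assumed from the file being edited):

from spacy.util import compile_suffix_regex
from spacy.lang.ar.punctuation import TOKENIZER_SUFFIXES

suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
match = suffix_search("123+")
print(match.group())  # "+" is split off after a digit, per the r"(?<=[0-9])\+" rule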
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
من
|
من
|
||||||
نحو
|
نحو
|
||||||
لعل
|
لعل
|
||||||
|
@ -388,4 +389,5 @@ STOP_WORDS = set("""
|
||||||
وإن
|
وإن
|
||||||
ولو
|
ولو
|
||||||
يا
|
يا
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,21 +1,23 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
import re
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
# time
|
|
||||||
|
# Time
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "قبل الميلاد", ORTH: "ق.م"},
|
{LEMMA: "قبل الميلاد", ORTH: "ق.م"},
|
||||||
{LEMMA: "بعد الميلاد", ORTH: "ب. م"},
|
{LEMMA: "بعد الميلاد", ORTH: "ب. م"},
|
||||||
{LEMMA: "ميلادي", ORTH: ".م"},
|
{LEMMA: "ميلادي", ORTH: ".م"},
|
||||||
{LEMMA: "هجري", ORTH: ".هـ"},
|
{LEMMA: "هجري", ORTH: ".هـ"},
|
||||||
{LEMMA: "توفي", ORTH: ".ت"}]:
|
{LEMMA: "توفي", ORTH: ".ت"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# scientific abv.
|
# Scientific abv.
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
|
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
|
||||||
{LEMMA: "الشارح", ORTH: "الشـ"},
|
{LEMMA: "الشارح", ORTH: "الشـ"},
|
||||||
|
@ -28,20 +30,20 @@ for exc_data in [
|
||||||
{LEMMA: "أنبأنا", ORTH: "أنا"},
|
{LEMMA: "أنبأنا", ORTH: "أنا"},
|
||||||
{LEMMA: "أخبرنا", ORTH: "نا"},
|
{LEMMA: "أخبرنا", ORTH: "نا"},
|
||||||
{LEMMA: "مصدر سابق", ORTH: "م. س"},
|
{LEMMA: "مصدر سابق", ORTH: "م. س"},
|
||||||
{LEMMA: "مصدر نفسه", ORTH: "م. ن"}]:
|
{LEMMA: "مصدر نفسه", ORTH: "م. ن"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# other abv.
|
# Other abv.
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "دكتور", ORTH: "د."},
|
{LEMMA: "دكتور", ORTH: "د."},
|
||||||
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
|
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
|
||||||
{LEMMA: "أستاذ", ORTH: "أ."},
|
{LEMMA: "أستاذ", ORTH: "أ."},
|
||||||
{LEMMA: "بروفيسور", ORTH: "ب."}]:
|
{LEMMA: "بروفيسور", ORTH: "ب."},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
||||||
{LEMMA: "تلفون", ORTH: "ت."},
|
|
||||||
{LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -15,7 +15,7 @@ from ...util import update_exc
|
||||||
|
|
||||||
class BengaliDefaults(Language.Defaults):
|
class BengaliDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
lex_attr_getters[LANG] = lambda text: "bn"
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
@ -26,8 +26,8 @@ class BengaliDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class Bengali(Language):
|
class Bengali(Language):
|
||||||
lang = 'bn'
|
lang = "bn"
|
||||||
Defaults = BengaliDefaults
|
Defaults = BengaliDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Bengali']
|
__all__ = ["Bengali"]
|
||||||
|
|
|
@ -13,11 +13,9 @@ LEMMA_RULES = {
|
||||||
["গাছা", ""],
|
["গাছা", ""],
|
||||||
["গাছি", ""],
|
["গাছি", ""],
|
||||||
["ছড়া", ""],
|
["ছড়া", ""],
|
||||||
|
|
||||||
["কে", ""],
|
["কে", ""],
|
||||||
["ে", ""],
|
["ে", ""],
|
||||||
["তে", ""],
|
["তে", ""],
|
||||||
|
|
||||||
["র", ""],
|
["র", ""],
|
||||||
["রা", ""],
|
["রা", ""],
|
||||||
["রে", ""],
|
["রে", ""],
|
||||||
|
@ -28,7 +26,6 @@ LEMMA_RULES = {
|
||||||
["গুলা", ""],
|
["গুলা", ""],
|
||||||
["গুলো", ""],
|
["গুলো", ""],
|
||||||
["গুলি", ""],
|
["গুলি", ""],
|
||||||
|
|
||||||
["কুল", ""],
|
["কুল", ""],
|
||||||
["গণ", ""],
|
["গণ", ""],
|
||||||
["দল", ""],
|
["দল", ""],
|
||||||
|
@ -45,7 +42,6 @@ LEMMA_RULES = {
|
||||||
["সকল", ""],
|
["সকল", ""],
|
||||||
["মহল", ""],
|
["মহল", ""],
|
||||||
["াবলি", ""], # আবলি
|
["াবলি", ""], # আবলি
|
||||||
|
|
||||||
# Bengali digit representations
|
# Bengali digit representations
|
||||||
["০", "0"],
|
["০", "0"],
|
||||||
["১", "1"],
|
["১", "1"],
|
||||||
|
@ -58,11 +54,5 @@ LEMMA_RULES = {
|
||||||
["৮", "8"],
|
["৮", "8"],
|
||||||
["৯", "9"],
|
["৯", "9"],
|
||||||
],
|
],
|
||||||
|
"punct": [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]],
|
||||||
"punct": [
|
|
||||||
["“", "\""],
|
|
||||||
["”", "\""],
|
|
||||||
["\u2018", "'"],
|
|
||||||
["\u2019", "'"]
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,64 +5,253 @@ from ...symbols import LEMMA, PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
"PRP": {
|
"PRP": {
|
||||||
'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
"ঐ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||||
'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
"আমাকে": {
|
||||||
'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
LEMMA: PRON_LEMMA,
|
||||||
'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
"Person": "One",
|
||||||
'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
"PronType": "Prs",
|
||||||
'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"Case": "Acc",
|
||||||
'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
},
|
||||||
'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
"কি": {
|
||||||
'তুই': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Gender": "Neut",
|
||||||
'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
"PronType": "Int",
|
||||||
'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Case": "Acc",
|
||||||
'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
},
|
||||||
'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"সে": {
|
||||||
'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তোকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"Person": "Three",
|
||||||
'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
"PronType": "Prs",
|
||||||
'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
},
|
||||||
'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
"কিসে": {
|
||||||
'তোরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'তোদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Gender": "Neut",
|
||||||
'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"PronType": "Int",
|
||||||
'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
"Case": "Acc",
|
||||||
'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
},
|
||||||
'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
"তাকে": {
|
||||||
'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"স্বয়ং": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কোনগুলো": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তুমি": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তুই": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"আমরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One ",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"যিনি": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"আমাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"কোন": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"কারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"তোমাকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"খোদ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"যারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"যে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"তোমরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"আপন": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"এ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||||
|
"নিজ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কার": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"যা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Rel",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তারা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"আমি": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"PRP$": {
|
"PRP$": {
|
||||||
|
"আমার": {
|
||||||
'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'মোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Person": "One",
|
||||||
'Case': 'Nom'},
|
"PronType": "Prs",
|
||||||
'মোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Poss": "Yes",
|
||||||
'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
},
|
||||||
'Case': 'Nom'},
|
"মোর": {
|
||||||
'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Person": "One",
|
||||||
'Case': 'Nom'},
|
"PronType": "Prs",
|
||||||
'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Poss": "Yes",
|
||||||
'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'তোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
},
|
||||||
'Case': 'Nom'},
|
"মোদের": {
|
||||||
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Plur",
|
||||||
'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"Person": "One",
|
||||||
'তোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
"PronType": "Prs",
|
||||||
'Case': 'Nom'},
|
"Poss": "Yes",
|
||||||
'যাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"Case": "Nom",
|
||||||
}
|
},
|
||||||
|
"তার": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"আমাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমার": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোর": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"কাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"যাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@@ -2,30 +2,45 @@
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
+from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, QUOTES, UNITS


_currency = r"\$|¢|£|€|¥|฿|৳"
-_quotes = QUOTES.replace("'", '')
-_list_punct = LIST_PUNCT + '। ॥'.strip().split()
+_quotes = QUOTES.replace("'", "")
+_list_punct = LIST_PUNCT + "। ॥".strip().split()


-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
+_prefixes = [r"\+"] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
-             [r'(?<=[0-9])\+',
-              r'(?<=°[FfCcKk])\.',
-              r'(?<=[0-9])(?:{})'.format(_currency),
-              r'(?<=[0-9])(?:{})'.format(UNITS),
-              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
+_suffixes = (
+    _list_punct
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{})".format(_currency),
+        r"(?<=[0-9])(?:{})".format(UNITS),
+        r"(?<=[{}(?:{})])\.".format(
+            "|".join([ALPHA_LOWER, r"%²\-\)\]\+", QUOTES]), _currency
+        ),
+    ]
+)

-_infixes = (LIST_ELLIPSES + LIST_ICONS +
-            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9{zero}-{nine}])[+\-\*^=](?=[0-9{zero}-{nine}-])".format(
+            zero="০", nine="৯"
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[{h}](?={ae})".format(a=ALPHA, h=HYPHENS, ae="এ"),
+        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+        r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
+    ]
+)


TOKENIZER_PREFIXES = _prefixes
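The _prefixes, _suffixes and _infixes lists above are regular-expression fragments that spaCy compiles into the tokenizer's split rules. A minimal sketch of how such lists are typically compiled and queried with spaCy's helpers follows; the tiny rule lists here are simplified stand-ins for the Bengali rules above, not the real ones.

# Sketch: compiling punctuation rule lists with spaCy's regex helpers.
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

prefixes = [r"\+", r"\("]            # stand-in prefix rules
suffixes = [r"\)", r"(?<=[0-9])\+"]  # stand-in suffix rules
infixes = [r"(?<=[a-z]),(?=[a-z])"]  # stand-in infix rule

prefix_search = compile_prefix_regex(prefixes).search
suffix_search = compile_suffix_regex(suffixes).search
infix_finditer = compile_infix_regex(infixes).finditer

print(prefix_search("+100"))                       # leading "+" is split off
print(suffix_search("100+"))                       # trailing "+" after a digit is split off
print([m.group() for m in infix_finditer("a,b")])  # [","]: comma between letters splits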
@@ -2,43 +2,45 @@
from __future__ import unicode_literals


-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
ইত্যাদি ইহা
উচিত উনি উপর উপরে উত্তর
এ এঁদের এঁরা এই এক একই একজন একটা একটি একবার একে এখন এখনও এখানে এখানেই এটা এসো
এটাই এটি এত এতটাই এতে এদের এবং এবার এমন এমনি এমনকি এর এরা এলো এস এসে
ঐ
ও ওঁদের ওঁর ওঁরা ওই ওকে ওখানে ওদের ওর ওরা
কখনও কত কথা কবে কয়েক কয়েকটি করছে করছেন করতে করবে করবেন করলে কয়েক কয়েকটি করিয়ে করিয়া করায়
করলেন করা করাই করায় করার করি করিতে করিয়া করিয়ে করে করেই করেছিলেন করেছে করেছেন করেন কাউকে
কাছ কাছে কাজ কাজে কারও কারণ কি কিংবা কিছু কিছুই কিন্তু কী কে কেউ কেউই কেন কোন কোনও কোনো কেমনে কোটি
ক্ষেত্রে খুব
গিয়ে গিয়েছে গুলি গেছে গেল গেলে গোটা গিয়ে গিয়েছে
চলে চান চায় চেয়ে চায় চেয়ে চার চালু চেষ্টা
ছাড়া ছাড়াও ছিল ছিলেন ছাড়া ছাড়াও
জন জনকে জনের জন্য জন্যে জানতে জানা জানানো জানায় জানিয়ে জানিয়েছে জানায় জাানিয়ে জানিয়েছে
টি
ঠিক
তখন তত তথা তবু তবে তা তাঁকে তাঁদের তাঁর তাঁরা তাঁহারা তাই তাও তাকে তাতে তাদের তার তারপর তারা তারই তাহলে তাহা তাহাতে তাহার তিনই
তিনি তিনিও তুমি তুলে তেমন তো তোমার তুই তোরা তোর তোমাদের তোদের
থাকবে থাকবেন থাকা থাকায় থাকে থাকেন থেকে থেকেই থেকেও থাকায়
দিকে দিতে দিয়ে দিয়েছে দিয়েছেন দিলেন দিয়ে দু দুটি দুটো দেওয়া দেওয়ার দেখতে দেখা দেখে দেন দেয় দেশের
দ্বারা দিয়েছে দিয়েছেন দেয় দেওয়া দেওয়ার দিন দুই
ধরা ধরে
নয় না নাই নাকি নাগাদ নানা নিজে নিজেই নিজেদের নিজের নিতে নিয়ে নিয়ে নেই নেওয়া নেওয়ার নয় নতুন
পক্ষে পর পরে পরেই পরেও পর্যন্ত পাওয়া পারি পারে পারেন পেয়ে প্রতি প্রভৃতি প্রায় পাওয়া পেয়ে প্রায় পাঁচ প্রথম প্রাথমিক
ফলে ফিরে ফের
বছর বদলে বরং বলতে বলল বললেন বলা বলে বলেছেন বলেন বসে বহু বা বাদে বার বিনা বিভিন্ন বিশেষ বিষয়টি বেশ ব্যবহার ব্যাপারে বক্তব্য বন বেশি
ভাবে ভাবেই
মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই মোদের মোর
যখন যত যতটা যথেষ্ট যদি যদিও যা যাঁর যাঁরা যাওয়া যাওয়ার যাকে যাচ্ছে যাতে যাদের যান যাবে যায় যার যারা যায় যিনি যে যেখানে যেতে যেন
যেমন
রকম রয়েছে রাখা রেখে রয়েছে
লক্ষ
শুধু শুরু
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার
হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায়
-""".split())
+""".split()
+)
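The change above only re-wraps the set() call for formatting; the stop-word list itself is untouched. As a quick usage sketch (assuming this hunk belongs to the Bengali module, i.e. spacy.lang.bn.stop_words), the set is consulted through plain membership checks:

# Sketch: consulting the stop-word set directly.
from spacy.lang.bn.stop_words import STOP_WORDS

print("অতএব" in STOP_WORDS)  # True: listed in the block above
print(len(STOP_WORDS))       # size of the whitespace-split set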
@@ -6,72 +6,77 @@ from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM


TAG_MAP = {
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
-    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "৳": {POS: SYM, "Other": {"SymType": "currency"}},
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
-    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
-    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
+    "VBZ": {
+        POS: VERB,
+        "VerbForm": "fin",
+        "Tense": "pres",
+        "Number": "sing",
+        "Person": 3,
+    },
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    "SP": {POS: SPACE},
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "CCONJ": {POS: CCONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB},
    "PART": {POS: PART},
}
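TAG_MAP translates fine-grained, Penn-style tags into a coarse part of speech plus morphological features; the hunk above only reflows the "VBZ" entry and drops the duplicated "SYM" key. A small illustration of the lookup follows, with plain strings standing in for the POS symbol constants and a made-up coarse_pos helper:

# Illustration only; plain strings replace spaCy's POS symbols.
TAG_MAP = {
    "VBZ": {
        "POS": "VERB",
        "VerbForm": "fin",
        "Tense": "pres",
        "Number": "sing",
        "Person": 3,
    },
    "NN": {"POS": "NOUN", "Number": "sing"},
}


def coarse_pos(fine_tag):
    # Map a fine-grained tag to its coarse POS, defaulting to "X" for unknown tags.
    return TAG_MAP.get(fine_tag, {"POS": "X"})["POS"]


print(coarse_pos("VBZ"))  # VERB
print(coarse_pos("XYZ"))  # X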
@@ -19,7 +19,8 @@ for exc_data in [
    {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
    {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
    {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
-    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
+    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
+]:
    _exc[exc_data[ORTH]] = [exc_data]
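Tokenizer exceptions like these keep abbreviations such as "কি.মি" together as a single token with an attached lemma instead of letting the punctuation rules split them at the period. The sketch below mirrors the loop above, with plain string keys standing in for the ORTH and LEMMA attribute constants:

# Sketch of the exception table built by the loop above (string keys for illustration).
_exc = {}
for exc_data in [
    {"ORTH": "কি.মি", "LEMMA": "কিলোমিটার"},
    {"ORTH": "মি.লি.", "LEMMA": "মিলিলিটার"},
]:
    # Each surface form maps to a one-element token list, so the tokenizer
    # emits it as a single token carrying the given lemma.
    _exc[exc_data["ORTH"]] = [exc_data]

print(_exc["কি.মি"])  # [{'ORTH': 'কি.মি', 'LEMMA': 'কিলোমিটার'}]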
spacy/lang/ca/__init__.py (new file, 33 lines)
@@ -0,0 +1,33 @@
# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


class CatalanDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ca"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Catalan(Language):
    lang = "ca"
    Defaults = CatalanDefaults


__all__ = ["Catalan"]
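The new Catalan class follows spaCy's standard language-subclass pattern: CatalanDefaults bundles the rule-based language data, and the Catalan subclass of Language wires it in. A short usage sketch follows; no trained model is involved, so only tokenization and the lexical attributes defined by the defaults apply.

# Sketch: instantiating the blank Catalan pipeline added above.
from spacy.lang.ca import Catalan

nlp = Catalan()
doc = nlp("El gat menja peix")           # one of the example sentences added below
print([token.text for token in doc])     # tokenizer output
print([token.is_stop for token in doc])  # stop-word flags come from STOP_WORDS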
spacy/lang/ca/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ca.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
    "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants",
    "San Francisco analitza prohibir els robots de repartiment",
    "Londres és una gran ciutat del Regne Unit",
    "El gat menja peix",
    "Veig a l'home amb el telescopi",
    "L'Aranya menja mosques",
    "El pingüí incuba en el seu niu",
]
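As the docstring's doctest hints, these sentences are meant to be piped through a loaded pipeline. A minimal sketch using the blank Catalan class from the previous file; with no trained components this only exercises tokenization.

# Sketch: running the Catalan example sentences through a blank pipeline.
from spacy.lang.ca import Catalan
from spacy.lang.ca.examples import sentences

nlp = Catalan()
for doc in nlp.pipe(sentences):
    print(len(doc), doc.text)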
Some files were not shown because too many files have changed in this diff.