mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-22 15:24:11 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
a47e449431
|
@ -1,11 +0,0 @@
|
||||||
steps:
|
|
||||||
-
|
|
||||||
command: "fab env clean make test sdist"
|
|
||||||
label: ":dizzy: :python:"
|
|
||||||
artifact_paths: "dist/*.tar.gz"
|
|
||||||
- wait
|
|
||||||
- trigger: "spacy-sdist-against-models"
|
|
||||||
label: ":dizzy: :hammer:"
|
|
||||||
build:
|
|
||||||
env:
|
|
||||||
SPACY_VERSION: "{$SPACY_VERSION}"
|
|
|
@ -1,11 +0,0 @@
|
||||||
steps:
|
|
||||||
-
|
|
||||||
command: "fab env clean make test wheel"
|
|
||||||
label: ":dizzy: :python:"
|
|
||||||
artifact_paths: "dist/*.whl"
|
|
||||||
- wait
|
|
||||||
- trigger: "spacy-train-from-wheel"
|
|
||||||
label: ":dizzy: :train:"
|
|
||||||
build:
|
|
||||||
env:
|
|
||||||
SPACY_VERSION: "{$SPACY_VERSION}"
|
|
106
.github/contributors/bratao.md
vendored
Normal file
106
.github/contributors/bratao.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Bruno Souza Cabral |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 24/12/2020 |
|
||||||
|
| GitHub username | bratao |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/dhruvrnaik.md
vendored
Normal file
106
.github/contributors/dhruvrnaik.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Dhruv Naik |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 26-01-2021 |
|
||||||
|
| GitHub username | dhruvrnaik |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/jmargeta.md
vendored
Normal file
106
.github/contributors/jmargeta.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Jan Margeta |
|
||||||
|
| Company name (if applicable) | KardioMe |
|
||||||
|
| Title or role (if applicable) | Founder |
|
||||||
|
| Date | 2020-10-16 |
|
||||||
|
| GitHub username | jmargeta |
|
||||||
|
| Website (optional) | kardio.me |
|
106
.github/contributors/keshav.md
vendored
Normal file
106
.github/contributors/keshav.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Keshav Garg |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | Jan 22, 2021 |
|
||||||
|
| GitHub username | KeshavG-lb |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/tiangolo.md
vendored
Normal file
106
.github/contributors/tiangolo.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Sebastián Ramírez |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-07-01 |
|
||||||
|
| GitHub username | tiangolo |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/werew.md
vendored
Normal file
106
.github/contributors/werew.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Luigi Coniglio |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 10/01/2021 |
|
||||||
|
| GitHub username | werew |
|
||||||
|
| Website (optional) | |
|
6
.gitignore
vendored
6
.gitignore
vendored
|
@ -43,12 +43,15 @@ __pycache__/
|
||||||
.env*
|
.env*
|
||||||
.~env/
|
.~env/
|
||||||
.venv
|
.venv
|
||||||
|
env3.6/
|
||||||
venv/
|
venv/
|
||||||
env3.*/
|
env3.*/
|
||||||
.dev
|
.dev
|
||||||
.denv
|
.denv
|
||||||
.pypyenv
|
.pypyenv
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.hypothesis/
|
||||||
|
|
||||||
# Distribution / packaging
|
# Distribution / packaging
|
||||||
env/
|
env/
|
||||||
|
@ -118,3 +121,6 @@ Desktop.ini
|
||||||
|
|
||||||
# Pycharm project files
|
# Pycharm project files
|
||||||
*.idea
|
*.idea
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
.ipynb_checkpoints/
|
||||||
|
|
23
.travis.yml
23
.travis.yml
|
@ -1,23 +0,0 @@
|
||||||
language: python
|
|
||||||
sudo: false
|
|
||||||
cache: pip
|
|
||||||
dist: trusty
|
|
||||||
group: edge
|
|
||||||
python:
|
|
||||||
- "2.7"
|
|
||||||
os:
|
|
||||||
- linux
|
|
||||||
install:
|
|
||||||
- "python -m pip install -U pip setuptools"
|
|
||||||
- "pip install -e . --prefer-binary"
|
|
||||||
script:
|
|
||||||
- "cat /proc/cpuinfo | grep flags | head -n 1"
|
|
||||||
- "pip install -r requirements.txt"
|
|
||||||
- "python -m pytest --tb=native spacy"
|
|
||||||
branches:
|
|
||||||
except:
|
|
||||||
- spacy.io
|
|
||||||
notifications:
|
|
||||||
slack:
|
|
||||||
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
|
|
||||||
email: false
|
|
215
CONTRIBUTING.md
215
CONTRIBUTING.md
|
@ -3,9 +3,11 @@
|
||||||
# Contribute to spaCy
|
# Contribute to spaCy
|
||||||
|
|
||||||
Thanks for your interest in contributing to spaCy 🎉 The project is maintained
|
Thanks for your interest in contributing to spaCy 🎉 The project is maintained
|
||||||
by [@honnibal](https://github.com/honnibal) and [@ines](https://github.com/ines),
|
by **[@honnibal](https://github.com/honnibal)**,
|
||||||
|
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
|
||||||
|
**[@adrianeboyd](https://github.com/adrianeboyd)**,
|
||||||
and we'll do our best to help you get started. This page will give you a quick
|
and we'll do our best to help you get started. This page will give you a quick
|
||||||
overview of how things are organised and most importantly, how to get involved.
|
overview of how things are organized and most importantly, how to get involved.
|
||||||
|
|
||||||
## Table of contents
|
## Table of contents
|
||||||
|
|
||||||
|
@ -43,33 +45,32 @@ can also submit a [regression test](#fixing-bugs) straight away. When you're
|
||||||
opening an issue to report the bug, simply refer to your pull request in the
|
opening an issue to report the bug, simply refer to your pull request in the
|
||||||
issue body. A few more tips:
|
issue body. A few more tips:
|
||||||
|
|
||||||
- **Describing your issue:** Try to provide as many details as possible. What
|
- **Describing your issue:** Try to provide as many details as possible. What
|
||||||
exactly goes wrong? _How_ is it failing? Is there an error?
|
exactly goes wrong? _How_ is it failing? Is there an error?
|
||||||
"XY doesn't work" usually isn't that helpful for tracking down problems. Always
|
"XY doesn't work" usually isn't that helpful for tracking down problems. Always
|
||||||
remember to include the code you ran and if possible, extract only the relevant
|
remember to include the code you ran and if possible, extract only the relevant
|
||||||
parts and don't just dump your entire script. This will make it easier for us to
|
parts and don't just dump your entire script. This will make it easier for us to
|
||||||
reproduce the error.
|
reproduce the error.
|
||||||
|
|
||||||
- **Getting info about your spaCy installation and environment:** If you're
|
- **Getting info about your spaCy installation and environment:** You can use the command line interface to print details and
|
||||||
using spaCy v1.7+, you can use the command line interface to print details and
|
even format them as Markdown to copy-paste into GitHub issues:
|
||||||
even format them as Markdown to copy-paste into GitHub issues:
|
`python -m spacy info --markdown`.
|
||||||
`python -m spacy info --markdown`.
|
|
||||||
|
|
||||||
- **Checking the model compatibility:** If you're having problems with a
|
- **Checking the model compatibility:** If you're having problems with a
|
||||||
[statistical model](https://spacy.io/models), it may be because the
|
[statistical model](https://spacy.io/models), it may be because the
|
||||||
model is incompatible with your spaCy installation. In spaCy v2.0+, you can check
|
model is incompatible with your spaCy installation. In spaCy v2.0+, you can check
|
||||||
this on the command line by running `python -m spacy validate`.
|
this on the command line by running `python -m spacy validate`.
|
||||||
|
|
||||||
- **Sharing a model's output, like dependencies and entities:** spaCy v2.0+
|
- **Sharing a model's output, like dependencies and entities:** spaCy
|
||||||
comes with [built-in visualizers](https://spacy.io/usage/visualizers) that
|
comes with [built-in visualizers](https://spacy.io/usage/visualizers) that
|
||||||
you can run from within your script or a Jupyter notebook. For some issues, it's
|
you can run from within your script or a Jupyter notebook. For some issues, it's
|
||||||
helpful to **include a screenshot** of the visualization. You can simply drag and
|
helpful to **include a screenshot** of the visualization. You can simply drag and
|
||||||
drop the image into GitHub's editor and it will be uploaded and included.
|
drop the image into GitHub's editor and it will be uploaded and included.
|
||||||
|
|
||||||
- **Sharing long blocks of code or logs:** If you need to include long code,
|
- **Sharing long blocks of code or logs:** If you need to include long code,
|
||||||
logs or tracebacks, you can wrap them in `<details>` and `</details>`. This
|
logs or tracebacks, you can wrap them in `<details>` and `</details>`. This
|
||||||
[collapses the content](https://developer.mozilla.org/en/docs/Web/HTML/Element/details)
|
[collapses the content](https://developer.mozilla.org/en/docs/Web/HTML/Element/details)
|
||||||
so it only becomes visible on click, making the issue easier to read and follow.
|
so it only becomes visible on click, making the issue easier to read and follow.
|
||||||
|
|
||||||
### Issue labels
|
### Issue labels
|
||||||
|
|
||||||
|
@ -94,39 +95,39 @@ shipped in the core library, and what could be provided in other packages. Our
|
||||||
philosophy is to prefer a smaller core library. We generally ask the following
|
philosophy is to prefer a smaller core library. We generally ask the following
|
||||||
questions:
|
questions:
|
||||||
|
|
||||||
- **What would this feature look like if implemented in a separate package?**
|
- **What would this feature look like if implemented in a separate package?**
|
||||||
Some features would be very difficult to implement externally – for example,
|
Some features would be very difficult to implement externally – for example,
|
||||||
changes to spaCy's built-in methods. In contrast, a library of word
|
changes to spaCy's built-in methods. In contrast, a library of word
|
||||||
alignment functions could easily live as a separate package that depended on
|
alignment functions could easily live as a separate package that depended on
|
||||||
spaCy — there's little difference between writing `import word_aligner` and
|
spaCy — there's little difference between writing `import word_aligner` and
|
||||||
`import spacy.word_aligner`. spaCy v2.0+ makes it easy to implement
|
`import spacy.word_aligner`. spaCy makes it easy to implement
|
||||||
[custom pipeline components](https://spacy.io/usage/processing-pipelines#custom-components),
|
[custom pipeline components](https://spacy.io/usage/processing-pipelines#custom-components),
|
||||||
and add your own attributes, properties and methods to the `Doc`, `Token` and
|
and add your own attributes, properties and methods to the `Doc`, `Token` and
|
||||||
`Span`. If you're looking to implement a new spaCy feature, starting with a
|
`Span`. If you're looking to implement a new spaCy feature, starting with a
|
||||||
custom component package is usually the best strategy. You won't have to worry
|
custom component package is usually the best strategy. You won't have to worry
|
||||||
about spaCy's internals and you can test your module in an isolated
|
about spaCy's internals and you can test your module in an isolated
|
||||||
environment. And if it works well, we can always integrate it into the core
|
environment. And if it works well, we can always integrate it into the core
|
||||||
library later.
|
library later.
|
||||||
|
|
||||||
- **Would the feature be easier to implement if it relied on "heavy" dependencies spaCy doesn't currently require?**
|
- **Would the feature be easier to implement if it relied on "heavy" dependencies spaCy doesn't currently require?**
|
||||||
Python has a very rich ecosystem. Libraries like scikit-learn, SciPy, Gensim or
|
Python has a very rich ecosystem. Libraries like PyTorch, TensorFlow, scikit-learn, SciPy or Gensim
|
||||||
TensorFlow/Keras do lots of useful things — but we don't want to have them as
|
do lots of useful things — but we don't want to have them as default
|
||||||
dependencies. If the feature requires functionality in one of these libraries,
|
dependencies. If the feature requires functionality in one of these libraries,
|
||||||
it's probably better to break it out into a different package.
|
it's probably better to break it out into a different package.
|
||||||
|
|
||||||
- **Is the feature orthogonal to the current spaCy functionality, or overlapping?**
|
- **Is the feature orthogonal to the current spaCy functionality, or overlapping?**
|
||||||
spaCy strongly prefers to avoid having 6 different ways of doing the same thing.
|
spaCy strongly prefers to avoid having 6 different ways of doing the same thing.
|
||||||
As better techniques are developed, we prefer to drop support for "the old way".
|
As better techniques are developed, we prefer to drop support for "the old way".
|
||||||
However, it's rare that one approach _entirely_ dominates another. It's very
|
However, it's rare that one approach _entirely_ dominates another. It's very
|
||||||
common that there's still a use-case for the "obsolete" approach. For instance,
|
common that there's still a use-case for the "obsolete" approach. For instance,
|
||||||
[WordNet](https://wordnet.princeton.edu/) is still very useful — but word
|
[WordNet](https://wordnet.princeton.edu/) is still very useful — but word
|
||||||
vectors are better for most use-cases, and the two approaches to lexical
|
vectors are better for most use-cases, and the two approaches to lexical
|
||||||
semantics do a lot of the same things. spaCy therefore only supports word
|
semantics do a lot of the same things. spaCy therefore only supports word
|
||||||
vectors, and support for WordNet is currently left for other packages.
|
vectors, and support for WordNet is currently left for other packages.
|
||||||
|
|
||||||
- **Do you need the feature to get basic things done?** We do want spaCy to be
|
- **Do you need the feature to get basic things done?** We do want spaCy to be
|
||||||
at least somewhat self-contained. If we keep needing some feature in our
|
at least somewhat self-contained. If we keep needing some feature in our
|
||||||
recipes, that does provide some argument for bringing it "in house".
|
recipes, that does provide some argument for bringing it "in house".
|
||||||
|
|
||||||
### Getting started
|
### Getting started
|
||||||
|
|
||||||
|
@ -137,19 +138,7 @@ files, a compiler, [pip](https://pip.pypa.io/en/latest/installing/),
|
||||||
[virtualenv](https://virtualenv.pypa.io/en/stable/) and
|
[virtualenv](https://virtualenv.pypa.io/en/stable/) and
|
||||||
[git](https://git-scm.com) installed. The compiler is usually the trickiest part.
|
[git](https://git-scm.com) installed. The compiler is usually the trickiest part.
|
||||||
|
|
||||||
```
|
If you've made changes to `.pyx` files, you need to **recompile spaCy** before you
|
||||||
python -m pip install -U pip
|
|
||||||
git clone https://github.com/explosion/spaCy
|
|
||||||
cd spaCy
|
|
||||||
|
|
||||||
python -m venv .env
|
|
||||||
source .env/bin/activate
|
|
||||||
export PYTHONPATH=`pwd`
|
|
||||||
pip install -r requirements.txt
|
|
||||||
python setup.py build_ext --inplace
|
|
||||||
```
|
|
||||||
|
|
||||||
If you've made changes to `.pyx` files, you need to recompile spaCy before you
|
|
||||||
can test your changes by re-running `python setup.py build_ext --inplace`.
|
can test your changes by re-running `python setup.py build_ext --inplace`.
|
||||||
Changes to `.py` files will be effective immediately.
|
Changes to `.py` files will be effective immediately.
|
||||||
|
|
||||||
|
@ -184,7 +173,7 @@ sure your test passes and reference the issue in your commit message.
|
||||||
## Code conventions
|
## Code conventions
|
||||||
|
|
||||||
Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).
|
Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).
|
||||||
As of `v2.1.0`, spaCy uses [`black`](https://github.com/ambv/black) for code
|
spaCy uses [`black`](https://github.com/ambv/black) for code
|
||||||
formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
||||||
Python modules. If you've built spaCy from source, you'll already have both
|
Python modules. If you've built spaCy from source, you'll already have both
|
||||||
tools installed.
|
tools installed.
|
||||||
|
@ -195,7 +184,7 @@ modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||||
### Code formatting
|
### Code formatting
|
||||||
|
|
||||||
[`black`](https://github.com/ambv/black) is an opinionated Python code
|
[`black`](https://github.com/ambv/black) is an opinionated Python code
|
||||||
formatter, optimised to produce readable code and small diffs. You can run
|
formatter, optimized to produce readable code and small diffs. You can run
|
||||||
`black` from the command-line, or via your code editor. For example, if you're
|
`black` from the command-line, or via your code editor. For example, if you're
|
||||||
using [Visual Studio Code](https://code.visualstudio.com/), you can add the
|
using [Visual Studio Code](https://code.visualstudio.com/), you can add the
|
||||||
following to your `settings.json` to use `black` for formatting and auto-format
|
following to your `settings.json` to use `black` for formatting and auto-format
|
||||||
|
@ -203,10 +192,10 @@ your files on save:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"python.formatting.provider": "black",
|
"python.formatting.provider": "black",
|
||||||
"[python]": {
|
"[python]": {
|
||||||
"editor.formatOnSave": true
|
"editor.formatOnSave": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -216,15 +205,14 @@ list of available editor integrations.
|
||||||
#### Disabling formatting
|
#### Disabling formatting
|
||||||
|
|
||||||
There are a few cases where auto-formatting doesn't improve readability – for
|
There are a few cases where auto-formatting doesn't improve readability – for
|
||||||
example, in some of the the language data files like the `tag_map.py`, or in
|
example, in some of the language data files or in the tests that construct `Doc` objects from lists of words and other labels.
|
||||||
the tests that construct `Doc` objects from lists of words and other labels.
|
|
||||||
Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting
|
Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting
|
||||||
for that particular code. Here's an example:
|
for that particular code. Here's an example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# fmt: off
|
# fmt: off
|
||||||
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
||||||
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
|
heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
|
||||||
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
||||||
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
||||||
"poss", "nsubj", "ccomp", "punct"]
|
"poss", "nsubj", "ccomp", "punct"]
|
||||||
|
@ -280,40 +268,31 @@ except: # noqa: E722
|
||||||
|
|
||||||
### Python conventions
|
### Python conventions
|
||||||
|
|
||||||
All Python code must be written in an **intersection of Python 2 and Python 3**.
|
All Python code must be **compatible with Python 3.6+**.
|
||||||
This is easy in Cython, but somewhat ugly in Python. Logic that deals with
|
|
||||||
Python or platform compatibility should only live in
|
|
||||||
[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
|
|
||||||
functions, replacement functions are suffixed with an underscore, for example
|
|
||||||
`unicode_`. If you need to access the user's version or platform information,
|
|
||||||
for example to show more specific error messages, you can use the `is_config()`
|
|
||||||
helper function.
|
|
||||||
|
|
||||||
```python
|
#### I/O and handling paths
|
||||||
from .compat import unicode_, is_config
|
|
||||||
|
|
||||||
compatible_unicode = unicode_('hello world')
|
|
||||||
if is_config(windows=True, python2=True):
|
|
||||||
print("You are using Python 2 on Windows.")
|
|
||||||
```
|
|
||||||
|
|
||||||
Code that interacts with the file-system should accept objects that follow the
|
Code that interacts with the file-system should accept objects that follow the
|
||||||
`pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
|
`pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
|
||||||
If the function is user-facing and takes a path as an argument, it should check
|
If the function is user-facing and takes a path as an argument, it should check
|
||||||
whether the path is provided as a string. Strings should be converted to
|
whether the path is provided as a string. Strings should be converted to
|
||||||
`pathlib.Path` objects. Serialization and deserialization functions should always
|
`pathlib.Path` objects. Serialization and deserialization functions should always
|
||||||
accept **file-like objects**, as it makes the library io-agnostic. Working on
|
accept **file-like objects**, as it makes the library IO-agnostic. Working on
|
||||||
buffers makes the code more general, easier to test, and compatible with Python
|
buffers makes the code more general, easier to test, and compatible with Python
|
||||||
3's asynchronous IO.
|
3's asynchronous IO.
|
||||||
|
|
||||||
|
#### Composition vs. inheritance
|
||||||
|
|
||||||
Although spaCy uses a lot of classes, **inheritance is viewed with some suspicion**
|
Although spaCy uses a lot of classes, **inheritance is viewed with some suspicion**
|
||||||
— it's seen as a mechanism of last resort. You should discuss plans to extend
|
— it's seen as a mechanism of last resort. You should discuss plans to extend
|
||||||
the class hierarchy before implementing.
|
the class hierarchy before implementing.
|
||||||
|
|
||||||
|
#### Naming conventions
|
||||||
|
|
||||||
We have a number of conventions around variable naming that are still being
|
We have a number of conventions around variable naming that are still being
|
||||||
documented, and aren't 100% strict. A general policy is that instances of the
|
documented, and aren't 100% strict. A general policy is that instances of the
|
||||||
class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`,
|
class `Doc` should by default be called `doc`, `Token` → `token`, `Lexeme` → `lex`,
|
||||||
`Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are
|
`Vocab` → `vocab` and `Language` → `nlp`. You should avoid naming variables that are
|
||||||
of other types with these names. For instance, don't name a text string `doc` — you
|
of other types with these names. For instance, don't name a text string `doc` — you
|
||||||
should usually call this `text`. Two general code style preferences further help
|
should usually call this `text`. Two general code style preferences further help
|
||||||
with naming. First, **lean away from introducing temporary variables**, as these
|
with naming. First, **lean away from introducing temporary variables**, as these
|
||||||
|
@ -400,7 +379,7 @@ of Python and C++, with additional complexity and syntax from numpy. The
|
||||||
many "traps for new players". Working in Cython is very rewarding once you're
|
many "traps for new players". Working in Cython is very rewarding once you're
|
||||||
over the initial learning curve. As with C and C++, the first way you write
|
over the initial learning curve. As with C and C++, the first way you write
|
||||||
something in Cython will often be the performance-optimal approach. In contrast,
|
something in Cython will often be the performance-optimal approach. In contrast,
|
||||||
Python optimisation generally requires a lot of experimentation. Is it faster to
|
Python optimization generally requires a lot of experimentation. Is it faster to
|
||||||
have an `if item in my_dict` check, or to use `.get()`? What about `try`/`except`?
|
have an `if item in my_dict` check, or to use `.get()`? What about `try`/`except`?
|
||||||
Does this numpy operation create a copy? There's no way to guess the answers to
|
Does this numpy operation create a copy? There's no way to guess the answers to
|
||||||
these questions, and you'll usually be dissatisfied with your results — so
|
these questions, and you'll usually be dissatisfied with your results — so
|
||||||
|
@ -413,10 +392,10 @@ Python. If it's not fast enough the first time, just switch to Cython.
|
||||||
|
|
||||||
### Resources to get you started
|
### Resources to get you started
|
||||||
|
|
||||||
- [PEP 8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/) (python.org)
|
- [PEP 8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/) (python.org)
|
||||||
- [Official Cython documentation](http://docs.cython.org/en/latest/) (cython.org)
|
- [Official Cython documentation](http://docs.cython.org/en/latest/) (cython.org)
|
||||||
- [Writing C in Cython](https://explosion.ai/blog/writing-c-in-cython) (explosion.ai)
|
- [Writing C in Cython](https://explosion.ai/blog/writing-c-in-cython) (explosion.ai)
|
||||||
- [Multi-threading spaCy’s parser and named entity recogniser](https://explosion.ai/blog/multithreading-with-cython) (explosion.ai)
|
- [Multi-threading spaCy’s parser and named entity recognizer](https://explosion.ai/blog/multithreading-with-cython) (explosion.ai)
|
||||||
|
|
||||||
## Adding tests
|
## Adding tests
|
||||||
|
|
||||||
|
@ -428,7 +407,7 @@ name. For example, tests for the `Tokenizer` can be found in
|
||||||
all test files and test functions need to be prefixed with `test_`.
|
all test files and test functions need to be prefixed with `test_`.
|
||||||
|
|
||||||
When adding tests, make sure to use descriptive names, keep the code short and
|
When adding tests, make sure to use descriptive names, keep the code short and
|
||||||
concise and only test for one behaviour at a time. Try to `parametrize` test
|
concise and only test for one behavior at a time. Try to `parametrize` test
|
||||||
cases wherever possible, use our pre-defined fixtures for spaCy components and
|
cases wherever possible, use our pre-defined fixtures for spaCy components and
|
||||||
avoid unnecessary imports.
|
avoid unnecessary imports.
|
||||||
|
|
||||||
|
@ -437,7 +416,7 @@ Tests that require the model to be loaded should be marked with
|
||||||
`@pytest.mark.models`. Loading the models is expensive and not necessary if
|
`@pytest.mark.models`. Loading the models is expensive and not necessary if
|
||||||
you're not actually testing the model performance. If all you need is a `Doc`
|
you're not actually testing the model performance. If all you need is a `Doc`
|
||||||
object with annotations like heads, POS tags or the dependency parse, you can
|
object with annotations like heads, POS tags or the dependency parse, you can
|
||||||
use the `get_doc()` utility function to construct it manually.
|
use the `Doc` constructor to construct it manually.
|
||||||
|
|
||||||
📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
|
📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
|
||||||
|
|
||||||
|
@ -456,25 +435,25 @@ simply click on the "Suggest edits" button at the bottom of a page.
|
||||||
We're very excited about all the new possibilities for **community extensions**
|
We're very excited about all the new possibilities for **community extensions**
|
||||||
and plugins in spaCy v2.0, and we can't wait to see what you build with it!
|
and plugins in spaCy v2.0, and we can't wait to see what you build with it!
|
||||||
|
|
||||||
- An extension or plugin should add substantial functionality, be
|
- An extension or plugin should add substantial functionality, be
|
||||||
**well-documented** and **open-source**. It should be available for users to download
|
**well-documented** and **open-source**. It should be available for users to download
|
||||||
and install as a Python package – for example via [PyPi](http://pypi.python.org).
|
and install as a Python package – for example via [PyPi](http://pypi.python.org).
|
||||||
|
|
||||||
- Extensions that write to `Doc`, `Token` or `Span` attributes should be wrapped
|
- Extensions that write to `Doc`, `Token` or `Span` attributes should be wrapped
|
||||||
as [pipeline components](https://spacy.io/usage/processing-pipelines#custom-components)
|
as [pipeline components](https://spacy.io/usage/processing-pipelines#custom-components)
|
||||||
that users can **add to their processing pipeline** using `nlp.add_pipe()`.
|
that users can **add to their processing pipeline** using `nlp.add_pipe()`.
|
||||||
|
|
||||||
- When publishing your extension on GitHub, **tag it** with the topics
|
- When publishing your extension on GitHub, **tag it** with the topics
|
||||||
[`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
|
[`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
|
||||||
[`spacy-extension`](https://github.com/topics/spacy-extension?o=desc&s=stars)
|
[`spacy-extension`](https://github.com/topics/spacy-extension?o=desc&s=stars)
|
||||||
to make it easier to find. Those are also the topics we're linking to from the
|
to make it easier to find. Those are also the topics we're linking to from the
|
||||||
spaCy website. If you're sharing your project on Twitter, feel free to tag
|
spaCy website. If you're sharing your project on Twitter, feel free to tag
|
||||||
[@spacy_io](https://twitter.com/spacy_io) so we can check it out.
|
[@spacy_io](https://twitter.com/spacy_io) so we can check it out.
|
||||||
|
|
||||||
- Once your extension is published, you can open an issue on the
|
- Once your extension is published, you can open an issue on the
|
||||||
[issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
|
[issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
|
||||||
[resources directory](https://spacy.io/usage/resources#extensions) on the
|
[resources directory](https://spacy.io/usage/resources#extensions) on the
|
||||||
website.
|
website.
|
||||||
|
|
||||||
📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
|
📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
|
||||||
|
|
||||||
|
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
||||||
The MIT License (MIT)
|
The MIT License (MIT)
|
||||||
|
|
||||||
Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
|
Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
recursive-include include *.h
|
recursive-include include *.h
|
||||||
recursive-include spacy *.txt *.pyx *.pxd
|
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.md
|
include README.md
|
||||||
include bin/spacy
|
|
||||||
include pyproject.toml
|
include pyproject.toml
|
||||||
recursive-exclude spacy/lang *.json
|
recursive-exclude spacy/lang *.json
|
||||||
recursive-include spacy/lang *.json.gz
|
recursive-include spacy/lang *.json.gz
|
||||||
|
recursive-include spacy/cli *.json *.yml
|
||||||
recursive-include licenses *
|
recursive-include licenses *
|
||||||
|
|
48
Makefile
48
Makefile
|
@ -1,29 +1,55 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
PYVER := 3.6
|
|
||||||
|
ifndef SPACY_EXTRAS
|
||||||
|
override SPACY_EXTRAS = spacy-lookups-data==1.0.0 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef PYVER
|
||||||
|
override PYVER = 3.6
|
||||||
|
endif
|
||||||
|
|
||||||
VENV := ./env$(PYVER)
|
VENV := ./env$(PYVER)
|
||||||
|
|
||||||
version := $(shell "bin/get-version.sh")
|
version := $(shell "bin/get-version.sh")
|
||||||
|
package := $(shell "bin/get-package.sh")
|
||||||
|
|
||||||
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
|
ifndef SPACY_BIN
|
||||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
|
override SPACY_BIN = $(package)-$(version).pex
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef WHEELHOUSE
|
||||||
|
override WHEELHOUSE = "./wheelhouse"
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
dist/$(SPACY_BIN) : $(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp
|
||||||
|
$(VENV)/bin/pex \
|
||||||
|
-f $(WHEELHOUSE) \
|
||||||
|
--no-index \
|
||||||
|
--disable-cache \
|
||||||
|
-o $@ \
|
||||||
|
$(package)==$(version) \
|
||||||
|
$(SPACY_EXTRAS)
|
||||||
chmod a+rx $@
|
chmod a+rx $@
|
||||||
cp $@ dist/spacy.pex
|
cp $@ dist/spacy.pex
|
||||||
|
|
||||||
dist/pytest.pex : wheelhouse/pytest-*.whl
|
dist/pytest.pex : $(WHEELHOUSE)/pytest-*.whl
|
||||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock
|
$(VENV)/bin/pex -f $(WHEELHOUSE) --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock
|
||||||
chmod a+rx $@
|
chmod a+rx $@
|
||||||
|
|
||||||
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
$(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
||||||
$(VENV)/bin/pip wheel . -w ./wheelhouse
|
$(VENV)/bin/pip wheel . -w $(WHEELHOUSE)
|
||||||
$(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse
|
$(VENV)/bin/pip wheel $(SPACY_EXTRAS) -w $(WHEELHOUSE)
|
||||||
|
|
||||||
touch $@
|
touch $@
|
||||||
|
|
||||||
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
|
$(WHEELHOUSE)/pytest-%.whl : $(VENV)/bin/pex
|
||||||
$(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse
|
$(VENV)/bin/pip wheel pytest pytest-timeout mock -w $(WHEELHOUSE)
|
||||||
|
|
||||||
$(VENV)/bin/pex :
|
$(VENV)/bin/pex :
|
||||||
python$(PYVER) -m venv $(VENV)
|
python$(PYVER) -m venv $(VENV)
|
||||||
$(VENV)/bin/pip install -U pip setuptools pex wheel
|
$(VENV)/bin/pip install -U pip setuptools pex wheel
|
||||||
|
$(VENV)/bin/pip install numpy
|
||||||
|
|
||||||
.PHONY : clean test
|
.PHONY : clean test
|
||||||
|
|
||||||
|
@ -33,6 +59,6 @@ test : dist/spacy-$(version).pex dist/pytest.pex
|
||||||
|
|
||||||
clean : setup.py
|
clean : setup.py
|
||||||
rm -rf dist/*
|
rm -rf dist/*
|
||||||
rm -rf ./wheelhouse
|
rm -rf $(WHEELHOUSE)/*
|
||||||
rm -rf $(VENV)
|
rm -rf $(VENV)
|
||||||
python setup.py clean --all
|
python setup.py clean --all
|
||||||
|
|
222
README.md
222
README.md
|
@ -2,61 +2,67 @@
|
||||||
|
|
||||||
# spaCy: Industrial-strength NLP
|
# spaCy: Industrial-strength NLP
|
||||||
|
|
||||||
spaCy is a library for advanced Natural Language Processing in Python and
|
spaCy is a library for **advanced Natural Language Processing** in Python and
|
||||||
Cython. It's built on the very latest research, and was designed from day one to
|
Cython. It's built on the very latest research, and was designed from day one to
|
||||||
be used in real products. spaCy comes with
|
be used in real products.
|
||||||
[pretrained statistical models](https://spacy.io/models) and word vectors, and
|
|
||||||
currently supports tokenization for **60+ languages**. It features
|
|
||||||
state-of-the-art speed, convolutional **neural network models** for tagging,
|
|
||||||
parsing and **named entity recognition** and easy **deep learning** integration.
|
|
||||||
It's commercial open-source software, released under the MIT license.
|
|
||||||
|
|
||||||
💫 **Version 2.3 out now!**
|
spaCy comes with
|
||||||
|
[pretrained pipelines](https://spacy.io/models) and
|
||||||
|
currently supports tokenization and training for **60+ languages**. It features
|
||||||
|
state-of-the-art speed and **neural network models** for tagging,
|
||||||
|
parsing, **named entity recognition**, **text classification** and more,
|
||||||
|
multi-task learning with pretrained **transformers** like BERT, as well as a
|
||||||
|
production-ready [**training system**](https://spacy.io/usage/training) and easy
|
||||||
|
model packaging, deployment and workflow management. spaCy is commercial
|
||||||
|
open-source software, released under the MIT license.
|
||||||
|
|
||||||
|
💫 **Version 3.0 out now!**
|
||||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||||
|
|
||||||
🌙 **Version 3.0 (nightly) out now!**
|
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases/tag/v3.0.0rc1)
|
|
||||||
|
|
||||||
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
|
||||||
[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
|
|
||||||
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
|
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
|
||||||
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
||||||
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
||||||
[![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
|
[![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
|
||||||
|
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
|
||||||
|
<br />
|
||||||
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
||||||
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
||||||
[![Model downloads](https://img.shields.io/github/downloads/explosion/spacy-models/total?style=flat-square&label=model+downloads)](https://github.com/explosion/spacy-models/releases)
|
|
||||||
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
|
|
||||||
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
|
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
|
||||||
|
|
||||||
## 📖 Documentation
|
## 📖 Documentation
|
||||||
|
|
||||||
| Documentation | |
|
| Documentation | |
|
||||||
| --------------- | -------------------------------------------------------------- |
|
| -------------------------- | -------------------------------------------------------------- |
|
||||||
| [spaCy 101] | New to spaCy? Here's everything you need to know! |
|
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
|
||||||
| [Usage Guides] | How to use spaCy and its features. |
|
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
|
||||||
| [New in v2.3] | New features, backwards incompatibilities and migration guide. |
|
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
|
||||||
| [API Reference] | The detailed reference for spaCy's API. |
|
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
|
||||||
| [Models] | Download statistical language models for spaCy. |
|
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
|
||||||
| [Universe] | Libraries, extensions, demos, books and courses. |
|
| 📦 **[Models]** | Download trained pipelines for spaCy. |
|
||||||
| [Changelog] | Changes and version history. |
|
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
|
||||||
| [Contribute] | How to contribute to the spaCy project and code base. |
|
| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
|
||||||
|
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
|
||||||
|
| 🛠 **[Changelog]** | Changes and version history. |
|
||||||
|
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||||
|
|
||||||
[spacy 101]: https://spacy.io/usage/spacy-101
|
[spacy 101]: https://spacy.io/usage/spacy-101
|
||||||
[new in v2.3]: https://spacy.io/usage/v2-3
|
[new in v3.0]: https://spacy.io/usage/v3
|
||||||
[usage guides]: https://spacy.io/usage/
|
[usage guides]: https://spacy.io/usage/
|
||||||
[api reference]: https://spacy.io/api/
|
[api reference]: https://spacy.io/api/
|
||||||
[models]: https://spacy.io/models
|
[models]: https://spacy.io/models
|
||||||
[universe]: https://spacy.io/universe
|
[universe]: https://spacy.io/universe
|
||||||
|
[videos]: https://www.youtube.com/c/ExplosionAI
|
||||||
|
[online course]: https://course.spacy.io
|
||||||
|
[project templates]: https://github.com/explosion/projects
|
||||||
[changelog]: https://spacy.io/usage#changelog
|
[changelog]: https://spacy.io/usage#changelog
|
||||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||||
|
|
||||||
## 💬 Where to ask questions
|
## 💬 Where to ask questions
|
||||||
|
|
||||||
The spaCy project is maintained by [@honnibal](https://github.com/honnibal) and
|
The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
|
||||||
[@ines](https://github.com/ines), along with core contributors
|
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
|
||||||
[@svlandeg](https://github.com/svlandeg) and
|
**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
|
||||||
[@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't
|
|
||||||
be able to provide individual support via email. We also believe that help is
|
be able to provide individual support via email. We also believe that help is
|
||||||
much more valuable if it's shared publicly, so that more people can benefit from
|
much more valuable if it's shared publicly, so that more people can benefit from
|
||||||
it.
|
it.
|
||||||
|
@ -74,33 +80,31 @@ it.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Non-destructive **tokenization**
|
- Support for **60+ languages**
|
||||||
- **Named entity** recognition
|
- **Trained pipelines** for different languages and tasks
|
||||||
- Support for **50+ languages**
|
- Multi-task learning with pretrained **transformers** like BERT
|
||||||
- pretrained [statistical models](https://spacy.io/models) and word vectors
|
- Support for pretrained **word vectors** and embeddings
|
||||||
- State-of-the-art speed
|
- State-of-the-art speed
|
||||||
- Easy **deep learning** integration
|
- Production-ready **training system**
|
||||||
- Part-of-speech tagging
|
- Linguistically-motivated **tokenization**
|
||||||
- Labelled dependency parsing
|
- Components for named **entity recognition**, part-of-speech tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more
|
||||||
- Syntax-driven sentence segmentation
|
- Easily extensible with **custom components** and attributes
|
||||||
|
- Support for custom models in **PyTorch**, **TensorFlow** and other frameworks
|
||||||
- Built-in **visualizers** for syntax and NER
|
- Built-in **visualizers** for syntax and NER
|
||||||
- Convenient string-to-hash mapping
|
- Easy **model packaging**, deployment and workflow management
|
||||||
- Export to numpy data arrays
|
|
||||||
- Efficient binary serialization
|
|
||||||
- Easy **model packaging** and deployment
|
|
||||||
- Robust, rigorously evaluated accuracy
|
- Robust, rigorously evaluated accuracy
|
||||||
|
|
||||||
📖 **For more details, see the
|
📖 **For more details, see the
|
||||||
[facts, figures and benchmarks](https://spacy.io/usage/facts-figures).**
|
[facts, figures and benchmarks](https://spacy.io/usage/facts-figures).**
|
||||||
|
|
||||||
## Install spaCy
|
## ⏳ Install spaCy
|
||||||
|
|
||||||
For detailed installation instructions, see the
|
For detailed installation instructions, see the
|
||||||
[documentation](https://spacy.io/usage).
|
[documentation](https://spacy.io/usage).
|
||||||
|
|
||||||
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
|
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
|
||||||
Studio)
|
Studio)
|
||||||
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
|
- **Python version**: Python 3.6+ (only 64 bit)
|
||||||
- **Package managers**: [pip] · [conda] (via `conda-forge`)
|
- **Package managers**: [pip] · [conda] (via `conda-forge`)
|
||||||
|
|
||||||
[pip]: https://pypi.org/project/spacy/
|
[pip]: https://pypi.org/project/spacy/
|
||||||
|
@ -108,30 +112,21 @@ For detailed installation instructions, see the
|
||||||
|
|
||||||
### pip
|
### pip
|
||||||
|
|
||||||
Using pip, spaCy releases are available as source packages and binary wheels (as
|
Using pip, spaCy releases are available as source packages and binary wheels.
|
||||||
of `v2.0.13`). Before you install spaCy and its dependencies, make sure that
|
Before you install spaCy and its dependencies, make sure that
|
||||||
`pip`, `setuptools` and `wheel` are up to date.
|
your `pip`, `setuptools` and `wheel` are up to date.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -U pip setuptools wheel
|
pip install -U pip setuptools wheel
|
||||||
pip install spacy
|
pip install spacy
|
||||||
```
|
```
|
||||||
|
|
||||||
For installation on python 2.7 or 3.5 where binary wheels are not provided for
|
To install additional data tables for lemmatization and normalization you can
|
||||||
the most recent versions of the dependencies, you can prefer older binary
|
run `pip install spacy[lookups]` or install
|
||||||
wheels over newer source packages with `--prefer-binary`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install spacy --prefer-binary
|
|
||||||
```
|
|
||||||
|
|
||||||
To install additional data tables for lemmatization and normalization in
|
|
||||||
**spaCy v2.2+** you can run `pip install spacy[lookups]` or install
|
|
||||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
|
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
|
||||||
separately. The lookups package is needed to create blank models with
|
separately. The lookups package is needed to create blank models with
|
||||||
lemmatization data for v2.2+ plus normalization data for v2.3+, and to
|
lemmatization data, and to lemmatize in languages that don't yet come with
|
||||||
lemmatize in languages that don't yet come with pretrained models and aren't
|
pretrained models and aren't powered by third-party libraries.
|
||||||
powered by third-party libraries.
|
|
||||||
|
|
||||||
When using pip it is generally recommended to install packages in a virtual
|
When using pip it is generally recommended to install packages in a virtual
|
||||||
environment to avoid modifying system state:
|
environment to avoid modifying system state:
|
||||||
|
@ -145,17 +140,14 @@ pip install spacy
|
||||||
|
|
||||||
### conda
|
### conda
|
||||||
|
|
||||||
Thanks to our great community, we've finally re-added conda support. You can now
|
You can also install spaCy from `conda` via the `conda-forge` channel. For the
|
||||||
install spaCy via `conda-forge`:
|
feedstock including the build recipe and configuration, check out
|
||||||
|
[this repository](https://github.com/conda-forge/spacy-feedstock).
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda install -c conda-forge spacy
|
conda install -c conda-forge spacy
|
||||||
```
|
```
|
||||||
|
|
||||||
For the feedstock including the build recipe and configuration, check out
|
|
||||||
[this repository](https://github.com/conda-forge/spacy-feedstock). Improvements
|
|
||||||
and pull requests to the recipe and setup are always appreciated.
|
|
||||||
|
|
||||||
### Updating spaCy
|
### Updating spaCy
|
||||||
|
|
||||||
Some updates to spaCy may require downloading new statistical models. If you're
|
Some updates to spaCy may require downloading new statistical models. If you're
|
||||||
|
@ -172,37 +164,40 @@ If you've trained your own models, keep in mind that your training and runtime
|
||||||
inputs must match. After updating spaCy, we recommend **retraining your models**
|
inputs must match. After updating spaCy, we recommend **retraining your models**
|
||||||
with the new version.
|
with the new version.
|
||||||
|
|
||||||
📖 **For details on upgrading from spaCy 1.x to spaCy 2.x, see the
|
📖 **For details on upgrading from spaCy 2.x to spaCy 3.x, see the
|
||||||
[migration guide](https://spacy.io/usage/v2#migrating).**
|
[migration guide](https://spacy.io/usage/v3#migrating).**
|
||||||
|
|
||||||
## Download models
|
## 📦 Download model packages
|
||||||
|
|
||||||
As of v1.7.0, models for spaCy can be installed as **Python packages**. This
|
Trained pipelines for spaCy can be installed as **Python packages**. This
|
||||||
means that they're a component of your application, just like any other module.
|
means that they're a component of your application, just like any other module.
|
||||||
Models can be installed using spaCy's `download` command, or manually by
|
Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
|
||||||
pointing pip to a path or URL.
|
command, or manually by pointing pip to a path or URL.
|
||||||
|
|
||||||
| Documentation | |
|
| Documentation | |
|
||||||
| ---------------------- | ------------------------------------------------------------- |
|
| -------------------------- | ---------------------------------------------------------------- |
|
||||||
| [Available Models] | Detailed model descriptions, accuracy figures and benchmarks. |
|
| **[Available Pipelines]** | Detailed pipeline descriptions, accuracy figures and benchmarks. |
|
||||||
| [Models Documentation] | Detailed usage instructions. |
|
| **[Models Documentation]** | Detailed usage and installation instructions. |
|
||||||
|
| **[Training]** | How to train your own pipelines on your data. |
|
||||||
|
|
||||||
[available models]: https://spacy.io/models
|
[available pipelines]: https://spacy.io/models
|
||||||
[models documentation]: https://spacy.io/docs/usage/models
|
[models documentation]: https://spacy.io/usage/models
|
||||||
|
[training]: https://spacy.io/usage/training
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# download best-matching version of specific model for your spaCy installation
|
# Download best-matching version of specific model for your spaCy installation
|
||||||
python -m spacy download en_core_web_sm
|
python -m spacy download en_core_web_sm
|
||||||
|
|
||||||
# pip install .tar.gz archive from path or URL
|
# pip install .tar.gz archive or .whl from path or URL
|
||||||
pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
|
pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
|
||||||
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
|
pip install /Users/you/en_core_web_sm-3.0.0-py3-none-any.whl
|
||||||
|
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
|
||||||
```
|
```
|
||||||
|
|
||||||
### Loading and using models
|
### Loading and using models
|
||||||
|
|
||||||
To load a model, use `spacy.load()` with the model name, a shortcut link or a
|
To load a model, use [`spacy.load()`](https://spacy.io/api/top-level#spacy.load)
|
||||||
path to the model data directory.
|
with the model name or a path to the model data directory.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import spacy
|
import spacy
|
||||||
|
@ -224,7 +219,7 @@ doc = nlp("This is a sentence.")
|
||||||
📖 **For more info and examples, check out the
|
📖 **For more info and examples, check out the
|
||||||
[models documentation](https://spacy.io/usage/models).**
|
[models documentation](https://spacy.io/usage/models).**
|
||||||
|
|
||||||
## Compile from source
|
## ⚒ Compile from source
|
||||||
|
|
||||||
The other way to install spaCy is to clone its
|
The other way to install spaCy is to clone its
|
||||||
[GitHub repository](https://github.com/explosion/spaCy) and build it from
|
[GitHub repository](https://github.com/explosion/spaCy) and build it from
|
||||||
|
@ -234,8 +229,19 @@ Python distribution including header files, a compiler,
|
||||||
[pip](https://pip.pypa.io/en/latest/installing/),
|
[pip](https://pip.pypa.io/en/latest/installing/),
|
||||||
[virtualenv](https://virtualenv.pypa.io/en/latest/) and
|
[virtualenv](https://virtualenv.pypa.io/en/latest/) and
|
||||||
[git](https://git-scm.com) installed. The compiler part is the trickiest. How to
|
[git](https://git-scm.com) installed. The compiler part is the trickiest. How to
|
||||||
do that depends on your system. See notes on Ubuntu, OS X and Windows for
|
do that depends on your system.
|
||||||
details.
|
|
||||||
|
| Platform | |
|
||||||
|
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| **Ubuntu** | Install system-level dependencies via `apt-get`: `sudo apt-get install build-essential python-dev git`. |
|
||||||
|
| **Mac** | Install a recent version of [Xcode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled. |
|
||||||
|
| **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. |
|
||||||
|
|
||||||
|
For more details
|
||||||
|
and instructions, see the documentation on
|
||||||
|
[compiling spaCy from source](https://spacy.io/usage#source) and the
|
||||||
|
[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
|
||||||
|
commands for your platform and Python version.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/explosion/spaCy
|
git clone https://github.com/explosion/spaCy
|
||||||
|
@ -256,57 +262,25 @@ To install with extras:
|
||||||
pip install .[lookups,cuda102]
|
pip install .[lookups,cuda102]
|
||||||
```
|
```
|
||||||
|
|
||||||
To install all dependencies required for development:
|
To install all dependencies required for development, use the [`requirements.txt`](requirements.txt) file. Compared to a regular install via pip, it
|
||||||
|
additionally installs developer dependencies such as Cython.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
Compared to regular install via pip, [requirements.txt](requirements.txt)
|
## 🚦 Run tests
|
||||||
additionally installs developer dependencies such as Cython. For more details
|
|
||||||
and instructions, see the documentation on
|
|
||||||
[compiling spaCy from source](https://spacy.io/usage#source) and the
|
|
||||||
[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
|
|
||||||
commands for your platform and Python version.
|
|
||||||
|
|
||||||
### Ubuntu
|
|
||||||
|
|
||||||
Install system-level dependencies via `apt-get`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sudo apt-get install build-essential python-dev git
|
|
||||||
```
|
|
||||||
|
|
||||||
### macOS / OS X
|
|
||||||
|
|
||||||
Install a recent version of [XCode](https://developer.apple.com/xcode/),
|
|
||||||
including the so-called "Command Line Tools". macOS and OS X ship with Python
|
|
||||||
and git preinstalled.
|
|
||||||
|
|
||||||
### Windows
|
|
||||||
|
|
||||||
Install a version of the
|
|
||||||
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
|
|
||||||
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
|
|
||||||
matches the version that was used to compile your Python interpreter. For
|
|
||||||
official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
|
|
||||||
VS 2015 (Python 3.5).
|
|
||||||
|
|
||||||
## Run tests
|
|
||||||
|
|
||||||
spaCy comes with an [extensive test suite](spacy/tests). In order to run the
|
spaCy comes with an [extensive test suite](spacy/tests). In order to run the
|
||||||
tests, you'll usually want to clone the repository and build spaCy from source.
|
tests, you'll usually want to clone the repository and build spaCy from source.
|
||||||
This will also install the required development dependencies and test utilities
|
This will also install the required development dependencies and test utilities
|
||||||
defined in the `requirements.txt`.
|
defined in the [`requirements.txt`](requirements.txt).
|
||||||
|
|
||||||
Alternatively, you can run `pytest` on the tests from within the installed
|
Alternatively, you can run `pytest` on the tests from within the installed
|
||||||
`spacy` package. Don't forget to also install the test utilities via spaCy's
|
`spacy` package. Don't forget to also install the test utilities via spaCy's
|
||||||
`requirements.txt`:
|
[`requirements.txt`](requirements.txt):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
python -m pytest --pyargs spacy
|
python -m pytest --pyargs spacy
|
||||||
```
|
```
|
||||||
|
|
||||||
See [the documentation](https://spacy.io/usage#tests) for more details and
|
|
||||||
examples.
|
|
||||||
|
|
|
@ -2,132 +2,113 @@ trigger:
|
||||||
batch: true
|
batch: true
|
||||||
branches:
|
branches:
|
||||||
include:
|
include:
|
||||||
- '*'
|
- "*"
|
||||||
exclude:
|
exclude:
|
||||||
- 'spacy.io'
|
- "spacy.io"
|
||||||
|
- "nightly.spacy.io"
|
||||||
|
- "v2.spacy.io"
|
||||||
paths:
|
paths:
|
||||||
exclude:
|
exclude:
|
||||||
- 'website/*'
|
- "website/*"
|
||||||
- '*.md'
|
- "*.md"
|
||||||
pr:
|
pr:
|
||||||
paths:
|
paths:
|
||||||
exclude:
|
exclude:
|
||||||
- 'website/*'
|
- "website/*"
|
||||||
- '*.md'
|
- "*.md"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||||
|
# defined in .flake8 and overwrites the selected codes.
|
||||||
|
- job: "Validate"
|
||||||
|
pool:
|
||||||
|
vmImage: "ubuntu-16.04"
|
||||||
|
steps:
|
||||||
|
- task: UsePythonVersion@0
|
||||||
|
inputs:
|
||||||
|
versionSpec: "3.7"
|
||||||
|
- script: |
|
||||||
|
pip install flake8==3.5.0
|
||||||
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||||
|
displayName: "flake8"
|
||||||
|
|
||||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
- job: "Test"
|
||||||
# defined in .flake8 and overwrites the selected codes.
|
dependsOn: "Validate"
|
||||||
- job: 'Validate'
|
strategy:
|
||||||
pool:
|
matrix:
|
||||||
vmImage: 'ubuntu-16.04'
|
# We're only running one platform per Python version to speed up builds
|
||||||
steps:
|
Python36Linux:
|
||||||
- task: UsePythonVersion@0
|
imageName: "ubuntu-16.04"
|
||||||
inputs:
|
python.version: "3.6"
|
||||||
versionSpec: '3.7'
|
# Python36Windows:
|
||||||
- script: |
|
# imageName: "vs2017-win2016"
|
||||||
pip install flake8
|
# python.version: "3.6"
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
# Python36Mac:
|
||||||
displayName: 'flake8'
|
# imageName: "macos-10.14"
|
||||||
|
# python.version: "3.6"
|
||||||
|
# Python37Linux:
|
||||||
|
# imageName: "ubuntu-16.04"
|
||||||
|
# python.version: "3.7"
|
||||||
|
Python37Windows:
|
||||||
|
imageName: "vs2017-win2016"
|
||||||
|
python.version: "3.7"
|
||||||
|
# Python37Mac:
|
||||||
|
# imageName: "macos-10.14"
|
||||||
|
# python.version: "3.7"
|
||||||
|
# Python38Linux:
|
||||||
|
# imageName: "ubuntu-16.04"
|
||||||
|
# python.version: "3.8"
|
||||||
|
# Python38Windows:
|
||||||
|
# imageName: "vs2017-win2016"
|
||||||
|
# python.version: "3.8"
|
||||||
|
Python38Mac:
|
||||||
|
imageName: "macos-10.14"
|
||||||
|
python.version: "3.8"
|
||||||
|
Python39Linux:
|
||||||
|
imageName: "ubuntu-16.04"
|
||||||
|
python.version: "3.9"
|
||||||
|
Python39Windows:
|
||||||
|
imageName: "vs2017-win2016"
|
||||||
|
python.version: "3.9"
|
||||||
|
Python39Mac:
|
||||||
|
imageName: "macos-10.14"
|
||||||
|
python.version: "3.9"
|
||||||
|
maxParallel: 4
|
||||||
|
pool:
|
||||||
|
vmImage: $(imageName)
|
||||||
|
|
||||||
- job: 'Test'
|
steps:
|
||||||
dependsOn: 'Validate'
|
- task: UsePythonVersion@0
|
||||||
strategy:
|
inputs:
|
||||||
matrix:
|
versionSpec: "$(python.version)"
|
||||||
Python35Linux:
|
architecture: "x64"
|
||||||
imageName: 'ubuntu-16.04'
|
|
||||||
python.version: '3.5'
|
|
||||||
os: linux
|
|
||||||
Python35Windows:
|
|
||||||
imageName: 'vs2017-win2016'
|
|
||||||
python.version: '3.5'
|
|
||||||
# Test on one OS per python 3.6/3.7/3.8 to speed up CI
|
|
||||||
Python36Linux:
|
|
||||||
imageName: 'ubuntu-16.04'
|
|
||||||
python.version: '3.6'
|
|
||||||
# Python36Windows:
|
|
||||||
# imageName: 'vs2017-win2016'
|
|
||||||
# python.version: '3.6'
|
|
||||||
# Python36Mac:
|
|
||||||
# imageName: 'macos-10.14'
|
|
||||||
# python.version: '3.6'
|
|
||||||
# Python37Linux:
|
|
||||||
# imageName: 'ubuntu-16.04'
|
|
||||||
# python.version: '3.7'
|
|
||||||
Python37Windows:
|
|
||||||
imageName: 'vs2017-win2016'
|
|
||||||
python.version: '3.7'
|
|
||||||
# Python37Mac:
|
|
||||||
# imageName: 'macos-10.14'
|
|
||||||
# python.version: '3.7'
|
|
||||||
# Python38Linux:
|
|
||||||
# imageName: 'ubuntu-16.04'
|
|
||||||
# python.version: '3.8'
|
|
||||||
# Python38Windows:
|
|
||||||
# imageName: 'vs2017-win2016'
|
|
||||||
# python.version: '3.8'
|
|
||||||
Python38Mac:
|
|
||||||
imageName: 'macos-10.14'
|
|
||||||
python.version: '3.8'
|
|
||||||
Python39Linux:
|
|
||||||
imageName: 'ubuntu-16.04'
|
|
||||||
python.version: '3.9'
|
|
||||||
Python39Windows:
|
|
||||||
imageName: 'vs2017-win2016'
|
|
||||||
python.version: '3.9'
|
|
||||||
Python39Mac:
|
|
||||||
imageName: 'macos-10.14'
|
|
||||||
python.version: '3.9'
|
|
||||||
maxParallel: 4
|
|
||||||
pool:
|
|
||||||
vmImage: $(imageName)
|
|
||||||
|
|
||||||
steps:
|
- script: |
|
||||||
- task: UsePythonVersion@0
|
python -m pip install -U setuptools
|
||||||
inputs:
|
pip install -r requirements.txt
|
||||||
versionSpec: '$(python.version)'
|
displayName: "Install dependencies"
|
||||||
architecture: 'x64'
|
|
||||||
|
|
||||||
- script: python -m pip install -U pip setuptools
|
- script: |
|
||||||
displayName: 'Update pip'
|
python setup.py build_ext --inplace
|
||||||
|
python setup.py sdist --formats=gztar
|
||||||
|
displayName: "Compile and build sdist"
|
||||||
|
|
||||||
- script: pip install -r requirements.txt --prefer-binary
|
- task: DeleteFiles@1
|
||||||
displayName: 'Install dependencies (python 3.5: prefer binary)'
|
inputs:
|
||||||
condition: eq(variables['python.version'], '3.5')
|
contents: "spacy"
|
||||||
|
displayName: "Delete source directory"
|
||||||
|
|
||||||
- script: pip install -r requirements.txt
|
- script: |
|
||||||
displayName: 'Install dependencies'
|
pip freeze > installed.txt
|
||||||
condition: not(eq(variables['python.version'], '3.5'))
|
pip uninstall -y -r installed.txt
|
||||||
|
displayName: "Uninstall all packages"
|
||||||
|
|
||||||
- script: |
|
- bash: |
|
||||||
python setup.py build_ext --inplace -j 2
|
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||||
python setup.py sdist --formats=gztar
|
pip install dist/$SDIST
|
||||||
displayName: 'Compile and build sdist'
|
displayName: "Install from sdist"
|
||||||
|
|
||||||
- task: DeleteFiles@1
|
- script: |
|
||||||
inputs:
|
pip install -r requirements.txt
|
||||||
contents: 'spacy'
|
python -m pytest --pyargs spacy
|
||||||
displayName: 'Delete source directory'
|
displayName: "Run tests"
|
||||||
|
|
||||||
- script: |
|
|
||||||
pip freeze > installed.txt
|
|
||||||
pip uninstall -y -r installed.txt
|
|
||||||
displayName: 'Uninstall all packages'
|
|
||||||
|
|
||||||
- bash: |
|
|
||||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
|
||||||
pip install dist/$SDIST --prefer-binary
|
|
||||||
displayName: 'Install from sdist (python 3.5: prefer binary)'
|
|
||||||
condition: eq(variables['python.version'], '3.5')
|
|
||||||
|
|
||||||
- bash: |
|
|
||||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
|
||||||
pip install dist/$SDIST
|
|
||||||
displayName: 'Install from sdist'
|
|
||||||
condition: not(eq(variables['python.version'], '3.5'))
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
pip install -r requirements.txt --prefer-binary
|
|
||||||
python -m pytest --pyargs spacy
|
|
||||||
displayName: 'Run tests'
|
|
||||||
|
|
169
bin/cythonize.py
169
bin/cythonize.py
|
@ -1,169 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
""" cythonize.py
|
|
||||||
|
|
||||||
Cythonize pyx files into C++ files as needed.
|
|
||||||
|
|
||||||
Usage: cythonize.py [root]
|
|
||||||
|
|
||||||
Checks pyx files to see if they have been changed relative to their
|
|
||||||
corresponding C++ files. If they have, then runs cython on these files to
|
|
||||||
recreate the C++ files.
|
|
||||||
|
|
||||||
Additionally, checks pxd files and setup.py if they have been changed. If
|
|
||||||
they have, rebuilds everything.
|
|
||||||
|
|
||||||
Change detection based on file hashes stored in JSON format.
|
|
||||||
|
|
||||||
For now, this script should be run by developers when changing Cython files
|
|
||||||
and the resulting C++ files checked in, so that end-users (and Python-only
|
|
||||||
developers) do not get the Cython dependencies.
|
|
||||||
|
|
||||||
Based upon:
|
|
||||||
|
|
||||||
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
|
|
||||||
https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
|
|
||||||
|
|
||||||
Note: this script does not check any of the dependent C++ libraries.
|
|
||||||
"""
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import hashlib
|
|
||||||
import subprocess
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
|
|
||||||
HASH_FILE = "cythonize.json"
|
|
||||||
|
|
||||||
|
|
||||||
def process_pyx(fromfile, tofile, language_level="-2"):
    """Run Cython on a single .pyx file.

    fromfile (str): Path of the .pyx source to compile.
    tofile (str): Path of the generated file; a ".cpp" suffix enables C++ mode.
    language_level (str): Language-level flag handed to Cython verbatim.
    RAISES (Exception): If Cython is older than 0.19 or exits non-zero.
    RAISES (OSError): If Cython cannot be launched at all.
    """
    print("Processing %s" % fromfile)
    # The version check is best-effort: it is skipped when the import fails.
    try:
        from Cython.Compiler.Version import version as cython_version
        from distutils.version import LooseVersion

        if LooseVersion(cython_version) < LooseVersion("0.19"):
            raise Exception("Require Cython >= 0.19")
    except ImportError:
        pass

    cython_flags = ["--fast-fail", language_level]
    if tofile.endswith(".cpp"):
        cython_flags.append("--cplus")
    io_args = ["-o", tofile, fromfile]

    try:
        try:
            # See Issue #791
            status = subprocess.call(
                ["cython"] + cython_flags + io_args, env=os.environ
            )
        except OSError:
            # There are ways of installing Cython that don't result in a cython
            # executable on the path, see gh-2397.
            launcher = [
                sys.executable,
                "-c",
                "import sys; from Cython.Compiler.Main import "
                "setuptools_main as main; sys.exit(main())",
            ]
            status = subprocess.call(launcher + cython_flags + io_args)
        if status != 0:
            raise Exception("Cython failed")
    except OSError:
        raise OSError("Cython needs to be installed")
|
|
||||||
|
|
||||||
|
|
||||||
def preserve_cwd(path, func, *args):
    """Call `func(*args)` with `path` as the working directory.

    The previous working directory is restored afterwards, even if `func`
    raises.

    path (str): Directory to switch into for the duration of the call.
    func (callable): Function to invoke.
    *args: Positional arguments forwarded to `func`.
    RETURNS: Whatever `func` returns (previously the result was dropped).
    """
    orig_cwd = os.getcwd()
    try:
        os.chdir(path)
        return func(*args)
    finally:
        os.chdir(orig_cwd)
|
|
||||||
|
|
||||||
|
|
||||||
def load_hashes(filename):
    """Load the hash database from a JSON file.

    filename (str): Path of the JSON file to read.
    RETURNS (dict): The stored hashes, or an empty dict if the file is
        missing, unreadable or not valid JSON.
    """
    try:
        # Use a context manager so the handle is closed deterministically.
        with open(filename) as file_:
            return json.load(file_)
    except (ValueError, IOError):
        return {}
|
|
||||||
|
|
||||||
|
|
||||||
def save_hashes(hash_db, filename):
    """Write the hash database to `filename` as JSON, replacing any old file.

    hash_db (dict): The hashes to store.
    filename (str): Path of the JSON file to write.
    """
    with open(filename, "w") as out_file:
        serialized = json.dumps(hash_db)
        out_file.write(serialized)
|
|
||||||
|
|
||||||
|
|
||||||
def get_hash(path):
    """Return the hex MD5 digest of a file's contents.

    The digest is only used for change detection, not for security.

    path (str): Path of the file to hash.
    RETURNS (str): Hexadecimal MD5 digest of the file's bytes.
    """
    # Use a context manager so the handle is closed deterministically.
    with open(path, "rb") as file_:
        return hashlib.md5(file_.read()).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def hash_changed(base, path, db):
    """Check whether a file's current hash differs from the one stored in `db`.

    base (str): Directory containing the file.
    path (str): File name relative to `base`.
    db (dict): Hash database keyed by normalized path.
    RETURNS (bool): True if the file is new to `db` or its content changed.
    """
    target = os.path.normpath(os.path.join(base, path))
    current = get_hash(target)
    return current != db.get(target)
|
|
||||||
|
|
||||||
|
|
||||||
def hash_add(base, path, db):
    """Store the current hash of a file in `db` under its normalized path.

    base (str): Directory containing the file.
    path (str): File name relative to `base`.
    db (dict): Hash database to update in place.
    """
    target = os.path.normpath(os.path.join(base, path))
    db[target] = get_hash(target)
|
|
||||||
|
|
||||||
|
|
||||||
def process(base, filename, db):
    """Re-cythonize one file if it changed or its .cpp output is missing.

    Only ".pyx" and ".cpp" files are considered. After a rebuild the hashes
    of both the .pyx source and the .cpp output are recorded in `db`.

    base (str): Directory containing the file.
    filename (str): Name of the file inside `base`.
    db (dict): Hash database, updated in place.
    """
    stem, suffix = os.path.splitext(filename)
    if suffix not in [".pyx", ".cpp"]:
        return
    pyx_name = stem + ".pyx"
    cpp_name = stem + ".cpp"
    needs_build = hash_changed(base, filename, db) or not os.path.isfile(
        os.path.join(base, cpp_name)
    )
    if not needs_build:
        return
    preserve_cwd(base, process_pyx, pyx_name, cpp_name)
    hash_add(base, cpp_name, db)
    hash_add(base, pyx_name, db)
|
|
||||||
|
|
||||||
|
|
||||||
def check_changes(root, db):
    """Detect changes to setup.py or any .pxd file under `root`.

    If anything changed, `db` is reset to hold only the fresh hashes of
    setup.py and the .pxd files, so previously stored entries are dropped.

    root (str): Directory tree to scan for .pxd files.
    db (dict): Hash database, possibly replaced in place.
    RETURNS (bool): True if setup.py or at least one .pxd file changed.
    """
    fresh = {}

    setup_filename = "setup.py"
    hash_add(".", setup_filename, fresh)
    changed = hash_changed(".", setup_filename, db)

    for base, _, files in os.walk(root):
        for filename in files:
            if not filename.endswith(".pxd"):
                continue
            hash_add(base, filename, fresh)
            if hash_changed(base, filename, db):
                changed = True

    if changed:
        db.clear()
        db.update(fresh)
    return changed
|
|
||||||
|
|
||||||
|
|
||||||
def run(root):
    """Cythonize every changed file under `root` and persist the hashes.

    The hash database is written back to HASH_FILE even if processing fails.

    root (str): Directory tree to process.
    """
    hashes = load_hashes(HASH_FILE)
    try:
        check_changes(root, hashes)
        for directory, _, names in os.walk(root):
            for name in names:
                process(directory, name, hashes)
    finally:
        save_hashes(hashes, HASH_FILE)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point: cythonize.py <root>
    cli = argparse.ArgumentParser(
        description="Cythonize pyx files into C++ files as needed"
    )
    cli.add_argument("root", help="root directory")
    run(cli.parse_args().root)
|
|
12
bin/get-package.sh
Executable file
12
bin/get-package.sh
Executable file
|
@ -0,0 +1,12 @@
|
||||||
|
#!/usr/bin/env bash

# Print the package title defined by `__title__ = ...` in spacy/about.py,
# with the surrounding quotes removed.

set -e

# `set -e` aborts the script if grep finds no matching line.
title=$(grep "__title__ = " spacy/about.py)
title=${title/__title__ = }
# Strip up to two single quotes and up to two double quotes.
title=${title/\'/}
title=${title/\'/}
title=${title/\"/}
title=${title/\"/}

# Quote the expansion so the value is not word-split or glob-expanded.
echo "$title"
|
|
@ -1,97 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import bz2
|
|
||||||
import re
|
|
||||||
import srsly
|
|
||||||
import sys
|
|
||||||
import random
|
|
||||||
import datetime
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
_unset = object()
|
|
||||||
|
|
||||||
|
|
||||||
class Reddit(object):
    """Stream cleaned comments from Reddit."""

    # Markdown formatting characters at the very start / very end of a comment.
    pre_format_re = re.compile(r"^[`*~]")
    post_format_re = re.compile(r"[`*~]$")
    url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
    # Markdown links of the form [label](http://...); group 1 is the label.
    link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")

    def __init__(self, file_path, meta_keys=None):
        """
        file_path (unicode / Path): Path to archive or directory of archives.
        meta_keys (dict): Meta data key included in the Reddit corpus, mapped
            to display name in Prodigy meta. Defaults to
            {"subreddit": "section"}.
        RETURNS (Reddit): The Reddit loader.
        """
        # Build the default per instance instead of sharing one mutable dict
        # across all instances through the signature.
        if meta_keys is None:
            meta_keys = {"subreddit": "section"}
        self.meta = meta_keys
        file_path = Path(file_path)
        if not file_path.exists():
            raise IOError("Can't find file path: {}".format(file_path))
        if not file_path.is_dir():
            self.files = [file_path]
        else:
            self.files = list(file_path.iterdir())

    def __iter__(self):
        """Yield one {"text": ...} dict per valid comment in the archives.

        Each file is opened as a bz2 archive with one JSON object per line.
        """
        for file_path in self.iter_files():
            with bz2.open(str(file_path)) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    comment = srsly.json_loads(line)
                    if self.is_valid(comment):
                        text = self.strip_tags(comment["body"])
                        yield {"text": text}

    def get_meta(self, item):
        """Map the configured meta keys of `item` to their display names.

        item (dict): A raw comment record.
        RETURNS (dict): Display name -> value, "n/a" for missing keys.
        """
        return {name: item.get(key, "n/a") for key, name in self.meta.items()}

    def iter_files(self):
        """Yield the paths of all archives to read."""
        for file_path in self.files:
            yield file_path

    def strip_tags(self, text):
        """Clean the markup of a comment body.

        Replaces markdown links by their label, decodes the escaped angle
        brackets, drops one leading and one trailing formatting character and
        collapses all whitespace runs into single spaces.

        text (unicode): The raw comment body.
        RETURNS (unicode): The cleaned text.
        """
        text = self.link_re.sub(r"\1", text)
        # Decode the HTML entities for angle brackets. Replacing ">" by ">"
        # (and "<" by "<") was a no-op that left the entities in the text.
        text = text.replace("&gt;", ">").replace("&lt;", "<")
        text = self.pre_format_re.sub("", text)
        text = self.post_format_re.sub("", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def is_valid(self, comment):
        """Check that a comment has a body that wasn't deleted or removed.

        comment (dict): A raw comment record with a "body" key.
        RETURNS (bool): Whether the comment should be yielded.
        """
        return (
            comment["body"] is not None
            and comment["body"] != "[deleted]"
            and comment["body"] != "[removed]"
        )
|
|
||||||
|
|
||||||
|
|
||||||
def main(path):
    """Print every cleaned comment found at `path` as one JSON line.

    path (unicode / Path): Path to archive or directory of archives.
    """
    for record in Reddit(path):
        print(srsly.json_dumps(record))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    import socket

    # Fall back to socket.error where the BrokenPipeError builtin is missing.
    try:
        BrokenPipeError
    except NameError:
        BrokenPipeError = socket.error
    try:
        plac.call(main)
    except BrokenPipeError:
        import os
        import sys

        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        # Python exits with error code 1 on EPIPE
        sys.exit(1)
|
|
|
@ -1,81 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import print_function, unicode_literals, division
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
from collections import defaultdict
|
|
||||||
from gensim.models import Word2Vec
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class Corpus(object):
    """Iterable over the tokenized paragraphs of all text files in a directory."""

    def __init__(self, directory, nlp):
        """
        directory (unicode / Path): Directory holding the UTF-8 text files.
        nlp (callable): Pipeline applied to each paragraph; its result is
            iterated and the `orth_` of every item is collected.
        """
        self.nlp = nlp
        self.directory = directory

    def __iter__(self):
        """Yield one list of token strings per paragraph."""
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                contents = file_.read()

            # This is to keep the input to the blank model (which doesn't
            # sentencize) from being too long. It works particularly well with
            # the output of [WikiExtractor](https://github.com/attardi/wikiextractor)
            for paragraph in contents.split('\n\n'):
                doc = self.nlp(paragraph)
                yield [token.orth_ for token in doc]
|
|
||||||
|
|
||||||
|
|
||||||
def iter_dir(loc):
    """Yield the files in `loc`, descending exactly one directory level.

    Entries of immediate subdirectories are yielded as-is, so anything nested
    deeper than one level is not expanded.

    loc (unicode / Path): Directory to list.
    YIELDS (Path): Path of each entry found.
    """
    for entry in Path(loc).iterdir():
        if not entry.is_dir():
            yield entry
            continue
        for child in entry.iterdir():
            yield child
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=5,
):
    """Train word2vec vectors on a directory of text files and save the model.

    The texts are tokenized with a blank spaCy pipeline for `lang`.

    lang (unicode): ISO language code of the blank pipeline to use.
    in_dir (unicode): Directory containing the input text files.
    out_loc (unicode): Path the trained model is saved to.
    negative (int): Number of negative samples.
    n_workers (int): Number of worker threads.
    window (int): Context window size.
    size (int): Dimension of the word vectors.
    min_count (int): Minimum frequency for a word to be kept.
    nr_iter (int): Number of training iterations over the corpus.
    """
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir, nlp)
    model = Word2Vec(
        sentences=corpus,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
        # Forward the -i option, which was accepted but silently ignored.
        # `iter` is the keyword of the gensim API generation that also takes
        # `size` (assumed here because the call uses `size`).
        iter=nr_iter,
    )
    model.save(out_loc)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Let plac build the command-line interface from main's annotations.
    plac.call(main)
|
|
|
@ -1,2 +0,0 @@
|
||||||
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
|
|
||||||
from .ud_train import main as ud_train # noqa: F401
|
|
|
@ -1,614 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# flake8: noqa
|
|
||||||
|
|
||||||
# CoNLL 2017 UD Parsing evaluation script.
|
|
||||||
#
|
|
||||||
# Compatible with Python 2.7 and 3.2+, can be used either as a module
|
|
||||||
# or a standalone executable.
|
|
||||||
#
|
|
||||||
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
|
|
||||||
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
|
|
||||||
#
|
|
||||||
# Changelog:
|
|
||||||
# - [02 Jan 2017] Version 0.9: Initial release
|
|
||||||
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
|
|
||||||
# - [10 Mar 2017] Version 1.0: Add documentation and test
|
|
||||||
# Compare HEADs correctly using aligned words
|
|
||||||
# Allow evaluation with erroneous spaces in forms
|
|
||||||
# Compare forms in LCS case insensitively
|
|
||||||
# Detect cycles and multiple root nodes
|
|
||||||
# Compute AlignedAccuracy
|
|
||||||
|
|
||||||
# Command line usage
|
|
||||||
# ------------------
|
|
||||||
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
|
|
||||||
#
|
|
||||||
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metric
|
|
||||||
# is printed
|
|
||||||
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
|
|
||||||
# and in case the metric is computed on aligned words also accuracy on these):
|
|
||||||
# - Tokens: how well do the gold tokens match system tokens
|
|
||||||
# - Sentences: how well do the gold sentences match system sentences
|
|
||||||
# - Words: how well can the gold words be aligned to system words
|
|
||||||
# - UPOS: using aligned words, how well does UPOS match
|
|
||||||
# - XPOS: using aligned words, how well does XPOS match
|
|
||||||
# - Feats: using aligned words, how well does FEATS match
|
|
||||||
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
|
|
||||||
# - Lemmas: using aligned words, how well does LEMMA match
|
|
||||||
# - UAS: using aligned words, how well does HEAD match
|
|
||||||
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
|
|
||||||
# - if weights_file is given (with lines containing deprel-weight pairs),
|
|
||||||
# one more metric is shown:
|
|
||||||
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
|
|
||||||
|
|
||||||
# API usage
|
|
||||||
# ---------
|
|
||||||
# - load_conllu(file)
|
|
||||||
# - loads CoNLL-U file from given file object to an internal representation
|
|
||||||
# - the file object should return str on both Python 2 and Python 3
|
|
||||||
# - raises UDError exception if the given file cannot be loaded
|
|
||||||
# - evaluate(gold_ud, system_ud)
|
|
||||||
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
|
|
||||||
# - raises UDError if the concatenated tokens of gold and system file do not match
|
|
||||||
# - returns a dictionary with the metrics described above, each metric having
|
|
||||||
# four fields: precision, recall, f1 and aligned_accuracy (when using aligned
|
|
||||||
# words, otherwise this is None)
|
|
||||||
|
|
||||||
# Description of token matching
|
|
||||||
# -----------------------------
|
|
||||||
# In order to match tokens of gold file and system file, we consider the text
|
|
||||||
# resulting from concatenation of gold tokens and text resulting from
|
|
||||||
# concatenation of system tokens. These texts should match -- if they do not,
|
|
||||||
# the evaluation fails.
|
|
||||||
#
|
|
||||||
# If the texts do match, every token is represented as a range in this original
|
|
||||||
# text, and tokens are equal only if their range is the same.
|
|
||||||
|
|
||||||
# Description of word matching
|
|
||||||
# ----------------------------
|
|
||||||
# When matching words of gold file and system file, we first match the tokens.
|
|
||||||
# The words which are also tokens are matched as tokens, but words in multi-word
|
|
||||||
# tokens have to be handled differently.
|
|
||||||
#
|
|
||||||
# To handle multi-word tokens, we start by finding "multi-word spans".
|
|
||||||
# Multi-word span is a span in the original text such that
|
|
||||||
# - it contains at least one multi-word token
|
|
||||||
# - all multi-word tokens in the span (considering both gold and system ones)
|
|
||||||
# are completely inside the span (i.e., they do not "stick out")
|
|
||||||
# - the multi-word span is as small as possible
|
|
||||||
#
|
|
||||||
# For every multi-word span, we align the gold and system words completely
|
|
||||||
# inside this span using LCS on their FORMs. The words not intersecting
|
|
||||||
# (even partially) any multi-word span are then aligned as tokens.
|
|
||||||
|
|
||||||
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import io
|
|
||||||
import sys
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
# CoNLL-U column names
|
|
||||||
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
|
|
||||||
|
|
||||||
# UD Error is used when raising exceptions in this module
|
|
||||||
class UDError(Exception):
    """Raised for every CoNLL-U loading or evaluation error in this module."""
|
|
||||||
|
|
||||||
# Load given CoNLL-U file into internal representation
|
|
||||||
def load_conllu(file, check_parse=True):
    """Load a CoNLL-U file into the internal representation used for scoring.

    file: File object whose readline() returns str lines of CoNLL-U data.
    check_parse (bool): If True, require exactly one root word per sentence.
    RETURNS: A UDRepresentation holding `characters`, `tokens`, `words` and
        `sentences`.
    RAISES (UDError): If a line does not have 10 tab-separated columns, an ID
        or HEAD cannot be parsed, a word ID is out of sequence, a FORM is
        empty, a HEAD is negative or points outside its sentence, a sentence
        contains a cycle or (with check_parse) not exactly one root, or the
        file does not end with an empty line.
    """
    # Internal representation classes
    class UDRepresentation:
        def __init__(self):
            # Characters of all the tokens in the whole file.
            # Whitespace between tokens is not included.
            self.characters = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.tokens = []
            # List of UDWord instances.
            self.words = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.sentences = []

    class UDSpan:
        def __init__(self, start, end, characters):
            self.start = start
            # Note that self.end marks the first position **after the end** of span,
            # so we can use characters[start:end] or range(start, end).
            self.end = end
            self.characters = characters

        @property
        def text(self):
            return ''.join(self.characters[self.start:self.end])

        def __str__(self):
            return self.text

        def __repr__(self):
            return self.text

    class UDWord:
        def __init__(self, span, columns, is_multiword):
            # Span of this word (or MWT, see below) within ud_representation.characters.
            self.span = span
            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
            self.columns = columns
            # is_multiword==True means that this word is part of a multi-word token.
            # In that case, self.span marks the span of the whole multi-word token.
            self.is_multiword = is_multiword
            # Reference to the UDWord instance representing the HEAD (or None if root).
            self.parent = None
            # Let's ignore language-specific deprel subtypes.
            self.columns[DEPREL] = columns[DEPREL].split(':')[0]

    ud = UDRepresentation()

    # Load the CoNLL-U file
    index, sentence_start = 0, None
    linenum = 0
    while True:
        line = file.readline()
        linenum += 1
        if not line:
            break
        line = line.rstrip("\r\n")

        # Handle sentence start boundaries
        if sentence_start is None:
            # Skip comments
            if line.startswith("#"):
                continue
            # Start a new sentence; its end is filled in at the closing blank line.
            ud.sentences.append(UDSpan(index, 0, ud.characters))
            sentence_start = len(ud.words)
        if not line:
            # Add parent UDWord links and check there are no cycles
            def process_word(word):
                # "remapping" marks a word whose parent chain is currently
                # being resolved; meeting it again means the chain loops.
                if word.parent == "remapping":
                    raise UDError("There is a cycle in a sentence")
                if word.parent is None:
                    head = int(word.columns[HEAD])
                    if head > len(ud.words) - sentence_start:
                        raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
                            linenum, word.columns[HEAD]))
                    if head:
                        parent = ud.words[sentence_start + head - 1]
                        word.parent = "remapping"
                        process_word(parent)
                        word.parent = parent

            for word in ud.words[sentence_start:]:
                process_word(word)

            # Check there is a single root node
            if check_parse:
                if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
                    raise UDError("There are multiple roots in a sentence")

            # End the sentence
            ud.sentences[-1].end = index
            sentence_start = None
            continue

        # Read next token/word
        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))

        # Skip empty nodes
        if "." in columns[ID]:
            continue

        # Delete spaces from FORM so gold.characters == system.characters
        # even if one of them tokenizes the space.
        columns[FORM] = columns[FORM].replace(" ", "")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)

        # Save token
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
        index += len(columns[FORM])

        # Handle multi-word tokens to save word(s)
        if "-" in columns[ID]:
            try:
                start, end = map(int, columns[ID].split("-"))
            except ValueError:
                # Raised for non-integer parts and for a wrong number of parts.
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))

            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                # Count the lines consumed here too, so that line numbers in
                # later error messages stay accurate.
                linenum += 1
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        # Basic tokens/words
        else:
            try:
                word_id = int(columns[ID])
            except ValueError:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            if word_id != len(ud.words) - sentence_start + 1:
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))

            try:
                head_id = int(columns[HEAD])
            except ValueError:
                raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
            if head_id < 0:
                raise UDError("HEAD cannot be negative")

            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")

    return ud
|
|
||||||
|
|
||||||
# Evaluate the gold and system treebanks (loaded using load_conllu).
|
|
||||||
def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
    """Score a system treebank against a gold treebank.

    Both treebanks are objects exposing ``characters``, ``tokens``,
    ``sentences`` and ``words`` (as produced by ``load_conllu``).

    Args:
        gold_ud: The gold-standard treebank.
        system_ud: The predicted treebank.
        deprel_weights: Optional mapping from dependency relation to weight.
            When given, a "WeightedLAS" entry is added to the result.
        check_parse: When true, the tagging/parsing metrics (UPOS, XPOS,
            AllTags, UAS, LAS) are computed in addition to the basic ones.

    Returns:
        A dict mapping metric name to a ``Score`` object.

    Raises:
        UDError: If the character sequences of the two treebanks differ.
    """
    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None, undersegmented=None, oversegmented=None):
            self.precision = correct / system_total if system_total else 0.0
            self.recall = correct / gold_total if gold_total else 0.0
            self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
            # Stays None (or 0) when no aligned total was supplied.
            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
            self.undersegmented = undersegmented
            self.oversegmented = oversegmented
            self.under_perc = len(undersegmented) / gold_total if gold_total and undersegmented else 0.0
            self.over_perc = len(oversegmented) / gold_total if gold_total and oversegmented else 0.0

    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word
            self.gold_parent = None
            self.system_parent_gold_aligned = None

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

        def fill_parents(self):
            # We represent root parents in both gold and system data by '0'.
            # For gold data, we represent non-root parent by corresponding gold word.
            # For system data, we represent non-root parent by either gold word aligned
            # to parent system nodes, or by None if no gold words is aligned to the parent.
            for words in self.matched_words:
                words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
                words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
                    if words.system_word.parent is not None else 0

    def lower(text):
        # On Python 2 byte strings must be decoded before lower-casing.
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        correct, gi, si = 0, 0, 0
        undersegmented = []
        oversegmented = []
        previous_end_si_earlier = False
        previous_end_gi_earlier = False
        while gi < len(gold_spans) and si < len(system_spans):
            previous_si = system_spans[si-1] if si > 0 else None
            previous_gi = gold_spans[gi-1] if gi > 0 else None
            if system_spans[si].start < gold_spans[gi].start:
                # avoid counting the same mistake twice
                if not previous_end_si_earlier:
                    oversegmented.append(str(previous_gi).strip())
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                # avoid counting the same mistake twice
                if not previous_end_gi_earlier:
                    undersegmented.append(str(previous_si).strip())
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                if gold_spans[gi].end < system_spans[si].end:
                    undersegmented.append(str(system_spans[si]).strip())
                    previous_end_gi_earlier = True
                    previous_end_si_earlier = False
                elif gold_spans[gi].end > system_spans[si].end:
                    oversegmented.append(str(gold_spans[gi]).strip())
                    previous_end_si_earlier = True
                    previous_end_gi_earlier = False
                else:
                    previous_end_gi_earlier = False
                    previous_end_si_earlier = False
                si += 1
                gi += 1

        return Score(len(gold_spans), len(system_spans), correct, None, undersegmented, oversegmented)

    def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
        gold, system, aligned, correct = 0, 0, 0, 0

        for word in alignment.gold_words:
            gold += weight_fn(word)

        for word in alignment.system_words:
            system += weight_fn(word)

        for words in alignment.matched_words:
            aligned += weight_fn(words.gold_word)

        if key_fn is None:
            # Return score for whole aligned words
            return Score(gold, system, aligned)

        for words in alignment.matched_words:
            if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
                correct += weight_fn(words.gold_word)

        return Score(gold, system, correct, aligned)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
                si += 1
        else:  # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
                gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
                not beyond_end(system_words, si, multiword_span_end):
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # Longest-common-subsequence table over the word FORMs in the span,
        # filled from the bottom-right corner.
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)):
            for s in reversed(range(si - ss)):
                if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A: Multi-word tokens => align via LCS within the whole "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        alignment.fill_parents()

        return alignment

    # Check that underlying character sequences do match
    if gold_ud.characters != system_ud.characters:
        index = 0
        # Bound the scan by the shorter sequence: when one sequence is a
        # prefix of the other, an unbounded scan would raise IndexError
        # instead of the intended UDError.
        shortest = min(len(gold_ud.characters), len(system_ud.characters))
        while index < shortest and gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(gold_ud.characters[index:index + 20]),
                "".join(system_ud.characters[index:index + 20])
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores. These metrics are reported regardless of check_parse.
    result = {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment, None),
        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
    }
    if check_parse:
        result.update({
            "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
            "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
            "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
            "UAS": alignment_score(alignment, lambda w, parent: parent),
            "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
        })

    # Add WeightedLAS if weights are given
    if deprel_weights is not None:
        def weighted_las(word):
            return deprel_weights.get(word.columns[DEPREL], 1.0)
        result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)

    return result
|
|
||||||
|
|
||||||
def load_deprel_weights(weights_file):
    """Read dependency-relation weights from an iterable of text lines.

    Each data line must hold exactly two whitespace-separated fields: the
    relation name and its weight. Lines starting with "#" and blank lines
    are ignored.

    Args:
        weights_file: Iterable of lines, or None.

    Returns:
        A dict mapping relation name to float weight, or None when
        ``weights_file`` is None.

    Raises:
        ValueError: If a data line does not have exactly two fields, or the
            weight cannot be converted to float.
    """
    if weights_file is None:
        return None

    weights = {}
    for raw in weights_file:
        is_comment = raw.startswith("#")
        is_blank = not raw.strip()
        if is_comment or is_blank:
            continue

        fields = raw.rstrip("\r\n").split()
        if len(fields) != 2:
            raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(raw))

        relation, weight = fields
        weights[relation] = float(weight)

    return weights
|
|
||||||
|
|
||||||
def load_conllu_file(path):
    """Open the CoNLL-U file at ``path`` and parse it with ``load_conllu``.

    The file is opened in text mode; on Python 3 it is decoded as UTF-8
    (the ``encoding`` argument is only passed on Python 3). The handle is
    closed before returning, including when parsing raises.
    """
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    with open(path, mode="r", **open_kwargs) as _file:
        return load_conllu(_file)
|
|
||||||
|
|
||||||
def evaluate_wrapper(args):
    """Load the inputs named on ``args`` and return their evaluation.

    Reads ``args.gold_file``, ``args.system_file`` and ``args.weights`` and
    passes the loaded data to ``evaluate``.
    """
    # Gold treebank first, then the system one, then the optional weights.
    gold_treebank = load_conllu_file(args.gold_file)
    system_treebank = load_conllu_file(args.system_file)
    weights = load_deprel_weights(args.weights)

    return evaluate(gold_treebank, system_treebank, weights)
|
|
||||||
|
|
||||||
def main():
    """Command-line entry point: evaluate a system file against a gold file.

    Without ``--verbose`` only the LAS F1 score is printed; otherwise a table
    with all metrics is printed. Supplying ``--weights`` turns verbose
    output on and adds the WeightedLAS row.
    """
    # Parse arguments
    cli = argparse.ArgumentParser()
    cli.add_argument("gold_file", type=str,
                     help="Name of the CoNLL-U file with the gold data.")
    cli.add_argument("system_file", type=str,
                     help="Name of the CoNLL-U file with the predicted data.")
    cli.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
                     metavar="deprel_weights_file",
                     help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
    cli.add_argument("--verbose", "-v", default=0, action="count",
                     help="Print all metrics.")
    args = cli.parse_args()

    # Use verbose if weights are supplied
    has_weights = args.weights is not None
    if has_weights and not args.verbose:
        args.verbose = 1

    # Evaluate
    evaluation = evaluate_wrapper(args)

    # Print the evaluation
    if not args.verbose:
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
        return

    metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
    if args.weights is not None:
        metrics.append("WeightedLAS")

    print("Metrics | Precision | Recall | F1 Score | AligndAcc")
    print("-----------+-----------+-----------+-----------+-----------")
    for metric in metrics:
        score = evaluation[metric]
        # The last column is left empty when no aligned accuracy is available.
        if score.aligned_accuracy is not None:
            aligned_column = "{:10.2f}".format(100 * score.aligned_accuracy)
        else:
            aligned_column = ""
        print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
            metric,
            100 * score.precision,
            100 * score.recall,
            100 * score.f1,
            aligned_column
        ))


if __name__ == "__main__":
    main()
|
|
||||||
|
|
||||||
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
|
|
||||||
class TestAlignment(unittest.TestCase):
    """Word-alignment tests driven by small synthetic CoNLL-U inputs."""

    @staticmethod
    def _load_words(words):
        """Build a fake CoNLL-U input from ``words`` and load it.

        An entry without spaces is a plain word. An entry with spaces is a
        multi-word token: the first part is the surface form, the rest are
        its words. HEAD is 0 for the first word and 1 for the others, so
        there is a single root.
        """
        word_row = "{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_"
        range_row = "{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_"

        rows = []
        count = 0
        for entry in words:
            pieces = entry.split(" ")
            if len(pieces) > 1:
                # Multi-word token: emit the ID-range line, then its words.
                rows.append(range_row.format(count + 1, count + len(pieces) - 1, pieces[0]))
                forms = pieces[1:]
            else:
                forms = pieces
            for form in forms:
                count += 1
                rows.append(word_row.format(count, form, int(count > 1)))

        buffer_type = io.StringIO if sys.version_info >= (3, 0) else io.BytesIO
        return load_conllu(buffer_type("\n".join(rows + ["\n"])))

    def _test_exception(self, gold, system):
        """Assert that evaluating ``system`` against ``gold`` raises UDError."""
        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))

    def _test_ok(self, gold, system, correct):
        """Assert the Words precision/recall/F1 given ``correct`` aligned words."""
        def count_words(entries):
            # A multi-word entry contributes its parts minus the surface form.
            return sum((max(1, len(entry.split(" ")) - 1) for entry in entries))

        metrics = evaluate(self._load_words(gold), self._load_words(system))
        gold_words = count_words(gold)
        system_words = count_words(system)
        words = metrics["Words"]
        self.assertEqual((words.precision, words.recall, words.f1),
                         (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))

    def test_exception(self):
        self._test_exception(["a"], ["b"])

    def test_equal(self):
        self._test_ok(["a"], ["a"], 1)
        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)

    def test_equal_with_multiword(self):
        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)

    def test_alignment(self):
        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
|
|
|
@ -1,293 +0,0 @@
|
||||||
import spacy
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
import plac
|
|
||||||
import operator
|
|
||||||
import datetime
|
|
||||||
from pathlib import Path
|
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
|
|
||||||
import conll17_ud_eval
|
|
||||||
from ud_train import write_conllu
|
|
||||||
from spacy.lang.lex_attrs import word_shape
|
|
||||||
from spacy.util import get_lang_class
|
|
||||||
|
|
||||||
# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
                 "ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
                 "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
                 "tr, tt, uk, ur, vi, zh")

# Non-parsing tasks that will be evaluated (works for default models)
EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']

# Tasks that will be evaluated if check_parse=True (does not work for default models)
EVAL_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats', 'UPOS', 'XPOS', 'AllTags', 'UAS', 'LAS']

# Minimum frequency an error should have to be printed
PRINT_FREQ = 20

# Maximum number of errors printed per category
PRINT_TOTAL = 10

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
space_re = re.compile(r"\s+")
|
|
||||||
|
|
||||||
|
|
||||||
def load_model(modelname, add_sentencizer=False):
    """Load a spaCy model by name and time how long loading takes.

    Returns a ``(nlp, loading_time, print_name)`` tuple. When
    ``add_sentencizer`` is set, a 'sentencizer' pipe is added and
    '_sentencizer' is appended to the returned name.
    """
    started = time.time()
    nlp = spacy.load(modelname)
    if add_sentencizer:
        sentencizer = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sentencizer)
    elapsed = time.time() - started

    print_name = modelname + '_sentencizer' if add_sentencizer else modelname
    return nlp, elapsed, print_name
|
|
||||||
|
|
||||||
|
|
||||||
def load_default_model_sentencizer(lang):
    """Create a blank pipeline for ``lang`` with a sentencizer added.

    Returns a ``(nlp, loading_time, print_name)`` tuple, where the name is
    the language code followed by '_default_sentencizer'.
    """
    started = time.time()
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)
    elapsed = time.time() - started
    return nlp, elapsed, lang + "_default_" + 'sentencizer'
|
|
||||||
|
|
||||||
|
|
||||||
def split_text(text):
    """Split ``text`` on blank lines and normalise whitespace per paragraph.

    Each paragraph is stripped and every whitespace run is replaced by a
    single space.
    """
    paragraphs = []
    for par in text.split("\n\n"):
        paragraphs.append(space_re.sub(" ", par.strip()))
    return paragraphs
|
|
||||||
|
|
||||||
|
|
||||||
def get_freq_tuples(my_list, print_total_threshold):
    """Count the items of ``my_list`` and return the most frequent ones.

    Returns a list of ``(item, count)`` tuples sorted by descending count,
    truncated to ``print_total_threshold`` entries. Items with equal counts
    keep their order of first appearance.
    """
    counts = {}
    for item in my_list:
        counts[item] = counts.get(item, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:print_total_threshold]
|
|
||||||
|
|
||||||
|
|
||||||
def _contains_blinded_text(stats_xml):
|
|
||||||
""" Heuristic to determine whether the treebank has blinded texts or not """
|
|
||||||
tree = ET.parse(stats_xml)
|
|
||||||
root = tree.getroot()
|
|
||||||
total_tokens = int(root.find('size/total/tokens').text)
|
|
||||||
unique_forms = int(root.find('forms').get('unique'))
|
|
||||||
|
|
||||||
# assume the corpus is largely blinded when there are less than 1% unique tokens
|
|
||||||
return (unique_forms / total_tokens) < 0.01
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language):
    """Collect the raw-text files of all treebanks for the given languages.

    Walks the sub-directories of ``ud_dir`` for files whose name ends in
    '-ud-<corpus>.txt' and whose language prefix (the part before the first
    '_') is in ``languages``. Treebanks judged blinded by
    ``_contains_blinded_text`` are skipped.

    Returns a dict mapping each language to a list of text paths. With
    ``best_per_language`` set, only the treebank with the most gold tokens
    is kept per language.
    """
    all_treebanks = {lang: [] for lang in languages}
    treebank_size = {lang: 0 for lang in languages}

    for treebank_dir in ud_dir.iterdir():
        if not treebank_dir.is_dir():
            continue

        for txt_path in treebank_dir.iterdir():
            if not txt_path.name.endswith('-ud-' + corpus + '.txt'):
                continue

            file_lang = txt_path.name.split('_')[0]
            if file_lang not in languages:
                continue

            gold_path = treebank_dir / txt_path.name.replace('.txt', '.conllu')
            stats_xml = treebank_dir / "stats.xml"

            # ignore treebanks where the texts are not publicly available
            if _contains_blinded_text(stats_xml):
                continue

            if not best_per_language:
                all_treebanks[file_lang].append(txt_path)
                continue

            # check the tokens in the gold annotation to keep only the biggest treebank per language
            with gold_path.open(mode='r', encoding='utf-8') as gold_file:
                gold_ud = conll17_ud_eval.load_conllu(gold_file)
            gold_tokens = len(gold_ud.tokens)
            if treebank_size[file_lang] < gold_tokens:
                all_treebanks[file_lang] = [txt_path]
                treebank_size[file_lang] = gold_tokens

    return all_treebanks
|
|
||||||
|
|
||||||
|
|
||||||
def run_single_eval(nlp, loading_time, print_name, text_path, gold_ud, tmp_output_path, out_file, print_header,
                    check_parse, print_freq_tasks):
    """Evaluate the model ``nlp`` on one treebank and write one CSV row.

    The text at ``text_path`` is processed with ``nlp``, written as CoNLL-U
    to ``tmp_output_path``, reloaded and scored against ``gold_ud``. The
    temporary file is removed afterwards, including when writing or
    parsing fails.

    Args:
        nlp: The pipeline to evaluate.
        loading_time: Model loading time, reported in the output row.
        print_name: Model name reported in the output row.
        text_path: Path to the raw text of the treebank.
        gold_ud: Gold treebank already loaded with ``load_conllu``.
        tmp_output_path: Path used for the temporary system CoNLL-U file.
        out_file: Writable text file receiving ';'-separated rows.
        print_header: When true, a header row is written before the data row.
        check_parse: Selects the parsing metrics as well as the basic ones.
        print_freq_tasks: Metric names for which frequent segmentation
            errors are added to the row.
    """
    with text_path.open(mode='r', encoding='utf-8') as f:
        flat_text = f.read()

    # STEP 1: tokenize text
    tokenization_start = time.time()
    texts = split_text(flat_text)
    docs = list(nlp.pipe(texts))
    tokenization_end = time.time()
    tokenization_time = tokenization_end - tokenization_start

    # STEP 2: record stats and timings
    tokens_per_s = int(len(gold_ud.tokens) / tokenization_time)

    print_header_1 = ['date', 'text_path', 'gold_tokens', 'model', 'loading_time', 'tokenization_time', 'tokens_per_s']
    print_string_1 = [str(datetime.date.today()), text_path.name, len(gold_ud.tokens),
                      print_name, "%.2f" % loading_time, "%.2f" % tokenization_time, tokens_per_s]

    # STEP 3: evaluate predicted tokens and features
    try:
        with tmp_output_path.open(mode="w", encoding="utf8") as tmp_out_file:
            write_conllu(docs, tmp_out_file)
        with tmp_output_path.open(mode="r", encoding="utf8") as sys_file:
            sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=check_parse)
    finally:
        # Never leave the scratch file behind. The existence check avoids
        # masking the original error if the file was never created.
        if tmp_output_path.exists():
            tmp_output_path.unlink()
    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=check_parse)

    # STEP 4: format the scoring results
    eval_headers = EVAL_PARSE if check_parse else EVAL_NO_PARSE

    for score_name in eval_headers:
        score = scores[score_name]
        print_string_1.extend(["%.2f" % score.precision,
                               "%.2f" % score.recall,
                               "%.2f" % score.f1])
        print_string_1.append("-" if score.aligned_accuracy is None else "%.2f" % score.aligned_accuracy)
        print_string_1.append("-" if score.undersegmented is None else "%.4f" % score.under_perc)
        print_string_1.append("-" if score.oversegmented is None else "%.4f" % score.over_perc)

        print_header_1.extend([score_name + '_p', score_name + '_r', score_name + '_F', score_name + '_acc',
                               score_name + '_under', score_name + '_over'])

        if score_name in print_freq_tasks:
            print_header_1.extend([score_name + '_word_under_ex', score_name + '_shape_under_ex',
                                   score_name + '_word_over_ex', score_name + '_shape_over_ex'])

            # Same order as the four header columns added above.
            freq_tables = [
                get_freq_tuples(score.undersegmented, PRINT_TOTAL),
                get_freq_tuples([word_shape(x) for x in score.undersegmented], PRINT_TOTAL),
                get_freq_tuples(score.oversegmented, PRINT_TOTAL),
                get_freq_tuples([word_shape(x) for x in score.oversegmented], PRINT_TOTAL),
            ]

            # saving to CSV with ; separator so blinding ; in the example output
            for table in freq_tables:
                frequent = {k: v for k, v in table if v > PRINT_FREQ}
                print_string_1.append(str(frequent).replace(";", "*SEMICOLON*"))

    # STEP 5: print the formatted results to CSV
    if print_header:
        out_file.write(';'.join(map(str, print_header_1)) + '\n')
    out_file.write(';'.join(map(str, print_string_1)) + '\n')
|
|
||||||
|
|
||||||
|
|
||||||
def run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks):
    """Evaluate every model of every language on that language's treebanks.

    ``treebanks`` maps a language to its text paths and ``models`` maps a
    language to ``(nlp, loading_time, name)`` tuples. Failures are printed
    and the run continues with the next model or treebank. The CSV header
    is requested only until the first evaluation succeeds.
    """
    header_pending = True

    for tb_lang, treebank_list in treebanks.items():
        print()
        print("Language", tb_lang)
        for text_path in treebank_list:
            print(" Evaluating on", text_path)

            gold_name = text_path.stem + '.conllu'
            gold_path = text_path.parent / gold_name
            print(" Gold data from ", gold_path)

            # nested try blocks to ensure the code can continue with the next iteration after a failure
            try:
                with gold_path.open(mode='r', encoding='utf-8') as gold_file:
                    gold_ud = conll17_ud_eval.load_conllu(gold_file)

                for nlp, nlp_loading_time, nlp_name in models[tb_lang]:
                    try:
                        print(" Benchmarking", nlp_name)
                        tmp_output_path = text_path.parent / str('tmp_' + nlp_name + '.conllu')
                        run_single_eval(nlp, nlp_loading_time, nlp_name, text_path, gold_ud, tmp_output_path,
                                        out_file, header_pending, check_parse, print_freq_tasks)
                        header_pending = False
                    except Exception as model_error:
                        print(" Ran into trouble: ", str(model_error))
            except Exception as treebank_error:
                print(" Ran into trouble: ", str(treebank_error))
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    out_path=("Path to output CSV file", "positional", None, Path),
    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
    check_parse=("Set flag to evaluate parsing performance", "flag", "p", bool),
    langs=("Enumeration of languages to evaluate (default: all)", "option", "l", str),
    exclude_trained_models=("Set flag to exclude trained models", "flag", "t", bool),
    exclude_multi=("Set flag to exclude the multi-language model as default baseline", "flag", "m", bool),
    hide_freq=("Set flag to avoid printing out more detailed high-freq tokenization errors", "flag", "f", bool),
    corpus=("Whether to run on train, dev or test", "option", "c", str),
    best_per_language=("Set flag to only keep the largest treebank for each language", "flag", "b", bool)
)
def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_trained_models=False, exclude_multi=False,
         hide_freq=False, corpus='train', best_per_language=False):
    """
    Assemble all treebanks and models to run evaluations with.
    When setting check_parse to True, the default models will not be evaluated as they don't have parsing functionality
    """
    languages = [lang.strip() for lang in langs.split(",")]

    # Frequent-error details are only reported for the tokenization task.
    print_freq_tasks = [] if hide_freq else ['Tokens']

    # fetching all relevant treebank from the directory
    treebanks = fetch_all_treebanks(ud_dir, languages, corpus, best_per_language)

    print()
    print("Loading all relevant models for", languages)
    models = dict()

    # multi-lang model
    multi = None
    if not exclude_multi and not check_parse:
        multi = load_model('xx_ent_wiki_sm', add_sentencizer=True)

    # initialize all models with the multi-lang model
    for lang in languages:
        models[lang] = [multi] if multi else []
        # add default models if we don't want to evaluate parsing info
        if not check_parse:
            # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
            spacy_code = 'nb' if lang == 'no' else lang
            models[lang].append(load_default_model_sentencizer(spacy_code))

    # language-specific trained models, loaded in this order
    trained_models = [
        ('de', ['de_core_news_sm', 'de_core_news_md']),
        ('el', ['el_core_news_sm', 'el_core_news_md']),
        ('en', ['en_core_web_sm', 'en_core_web_md', 'en_core_web_lg']),
        ('es', ['es_core_news_sm', 'es_core_news_md']),
        ('fr', ['fr_core_news_sm', 'fr_core_news_md']),
        ('it', ['it_core_news_sm']),
        ('nl', ['nl_core_news_sm']),
        ('pt', ['pt_core_news_sm']),
    ]
    if not exclude_trained_models:
        for trained_lang, model_names in trained_models:
            if trained_lang in models:
                for model_name in model_names:
                    models[trained_lang].append(load_model(model_name))

    with out_path.open(mode='w', encoding='utf-8') as out_file:
        run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)


if __name__ == "__main__":
    plac.call(main)
|
|
|
@ -1,335 +0,0 @@
|
||||||
# flake8: noqa
|
|
||||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
import spacy.util
|
|
||||||
from spacy.tokens import Token, Doc
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.util import compounding, minibatch_by_words
|
|
||||||
from spacy.syntax.nonproj import projectivize
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
# from spacy.morphology import Fused_begin, Fused_inside
|
|
||||||
from spacy import displacy
|
|
||||||
from collections import defaultdict, Counter
|
|
||||||
from timeit import default_timer as timer
|
|
||||||
|
|
||||||
Fused_begin = None
|
|
||||||
Fused_inside = None
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
import random
|
|
||||||
import numpy.random
|
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
|
||||||
|
|
||||||
from spacy import lang
|
|
||||||
from spacy.lang import zh
|
|
||||||
from spacy.lang import ja
|
|
||||||
from spacy.lang import ru
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# Data reading #
|
|
||||||
################
|
|
||||||
|
|
||||||
space_re = re.compile(r"\s+")


def split_text(text):
    """Split raw text into paragraphs.

    Paragraphs are separated by a blank line ("\\n\\n"). Each paragraph is
    stripped and every run of whitespace inside it is collapsed to one space.
    Returns the list of normalised paragraph strings.
    """
    paragraphs = []
    for chunk in text.split("\n\n"):
        paragraphs.append(space_re.sub(" ", chunk.strip()))
    return paragraphs
|
|
||||||
|
|
||||||
|
|
||||||
##############
|
|
||||||
# Evaluation #
|
|
||||||
##############
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
    """Parse CoNLL-U formatted lines into nested lists.

    file_: an iterable of text lines (e.g. an open file).

    Returns a list of documents. Each document is a list of sentences and
    each sentence is a list of token rows, where a row is the list of the
    10 tab-separated fields of one line.

    Raises ValueError if a token line does not have exactly 10 fields.
    """
    docs = []
    doc = []
    sent = []
    for line in file_:
        if line.startswith("# newdoc"):
            # Flush a sentence that was not terminated by a blank line so it
            # is not carried over into the next document.
            if sent:
                doc.append(sent)
                sent = []
            if doc:
                docs.append(doc)
            doc = []
        elif line.startswith("#"):
            # Any other comment line carries no token data.
            continue
        elif not line.strip():
            # A blank line terminates the current sentence.
            if sent:
                doc.append(sent)
            sent = []
        else:
            fields = line.strip().split("\t")
            if len(fields) != 10:
                raise ValueError(
                    "Expected 10 tab-separated fields in CoNLL-U line, "
                    "found %d: %r" % (len(fields), line)
                )
            sent.append(fields)
    # Flush whatever is pending at end of input.
    if sent:
        doc.append(sent)
    if doc:
        docs.append(doc)
    return docs
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Parse the input at text_loc, write it to sys_loc and score it.

    If text_loc names a ".conllu" file, one Doc is built per sentence from
    the second column of each row and passed through every pipeline
    component. Otherwise the file is read as raw text, split into paragraphs
    and processed with nlp.pipe. The system output is written in CoNLL-U
    format to sys_loc and compared with gold_loc using conll17_ud_eval.

    limit is accepted but not used. Returns (docs, scores).
    """
    is_conllu = text_loc.parts[-1].endswith(".conllu")
    if not is_conllu:
        with text_loc.open("r", encoding="utf8") as text_file:
            paragraphs = split_text(text_file.read())
            docs = list(nlp.pipe(paragraphs))
    else:
        docs = []
        with text_loc.open(encoding="utf8") as conllu_file:
            for conllu_doc in read_conllu(conllu_file):
                for conllu_sent in conllu_doc:
                    forms = [row[1] for row in conllu_sent]
                    docs.append(Doc(nlp.vocab, words=forms))
        # Pre-tokenized docs bypass the tokenizer, so apply each component.
        for _name, pipe in nlp.pipeline:
            docs = list(pipe.pipe(docs))
    with sys_loc.open("w", encoding="utf8") as out_file:
        write_conllu(docs, out_file)
    with gold_loc.open("r", encoding="utf8") as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
        with sys_loc.open("r", encoding="utf8") as sys_file:
            sys_ud = conll17_ud_eval.load_conllu(sys_file)
    return docs, conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
|
||||||
|
|
||||||
|
|
||||||
def write_conllu(docs, file_):
    """Write parsed docs to file_ in CoNLL-U format.

    docs: sequence of parsed Doc objects; an empty sequence writes nothing.
    file_: writable text file object.

    For parsed docs, runs of tokens labelled "subtok" are merged (together
    with the token following the run) before writing. Each doc is emitted
    with a "# newdoc id" header, and each sentence with "# sent_id" and
    "# text" headers followed by one line per token.

    Raises ValueError if a sentence has no token that is its own head with
    the dependency label "ROOT".
    """
    if not docs:
        # Nothing to write; also avoids indexing docs[0] below.
        return
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = []
        if doc.is_parsed:
            matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        # The "+" pattern can yield overlapping matches for one run of subtok
        # tokens. Only merge spans whose tokens have not been claimed yet.
        seen_tokens = set()
        with doc.retokenize() as retokenizer:
            for span in spans:
                span_tokens = set(range(span.start, span.end))
                if not span_tokens.intersection(seen_tokens):
                    retokenizer.merge(span)
                    seen_tokens.update(span_tokens)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
            file_.write("\n")
            has_root = any(
                word.head.i == word.i and word.dep_ == "ROOT" for word in sent
            )
            if not has_root:
                # Dump the offending sentence to help debugging.
                print("Rootless sentence!")
                print(sent)
                print(i)
                for w in sent:
                    print(w.i, w.text, w.head.text, w.head.i, w.dep_)
                raise ValueError(
                    "Rootless sentence {j} in doc {i}".format(i=i, j=j)
                )
|
|
||||||
|
|
||||||
|
|
||||||
def _get_token_conllu(token, k, sent_len):
    """Return the CoNLL-U line(s) for token.

    token: the token to render.
    k: zero-based position of the token inside its sentence.
    sent_len: number of tokens in the sentence.

    When the token passes check_morph(Fused_begin) and is not the last token
    of the sentence, a range line ("start-end", surface form, eight "_") is
    emitted before the token's own 10-column line. The result is the lines
    joined with a newline.
    """
    begins_fused = token.check_morph(Fused_begin) and (k + 1 < sent_len)
    lines = []
    if begins_fused:
        n = 1
        pieces = [token.text]
        while token.nbor(n).check_morph(Fused_inside):
            pieces.append(token.nbor(n).text)
            n += 1
        range_id = "%d-%d" % (k + 1, (k + n))
        lines.append("\t".join([range_id, "".join(pieces)] + ["_"] * 8))
    # Head is 0 for a root, otherwise a 1-based index within the sentence.
    head = 0 if token.head.i == token.i else k + (token.head.i - token.i) + 1
    fields = [
        str(k + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    if begins_fused:
        norm = token.norm_
        fields[1] = norm[0].upper() + norm[1:] if k == 0 else norm
    elif token.check_morph(Fused_inside):
        fields[1] = token.norm_
    elif token._.split_start is not None:
        # Token is part of a split group: divide the first token's text
        # across the group and take this token's share.
        first = token._.split_start
        last = token._.split_end
        group_size = (last.i - first.i) + 1
        shares = guess_fused_orths(first.text, [""] * group_size)
        fields[1] = shares[token.i - first.i]

    lines.append("\t".join(fields))
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
def guess_fused_orths(word, ud_forms):
    """Divide word into len(ud_forms) pieces that concatenate back to word.

    The UD forms of a fused token do not always spell out the surface word,
    so three strategies are tried in turn: return ud_forms unchanged when
    they join to the word; otherwise, when the total length matches, cut the
    word at the same lengths; otherwise give every piece after the first a
    single character and the first piece everything else.
    """
    if "".join(ud_forms) == word:
        # Perfect split: every character is accounted for.
        return ud_forms
    if sum(len(form) for form in ud_forms) == len(word):
        # Characters differ but lengths agree: reuse the lengths.
        pieces = []
        rest = word
        for form in ud_forms:
            assert len(form) >= 1
            size = len(form)
            pieces.append(rest[:size])
            rest = rest[size:]
        assert len(rest) == 0, (word, ud_forms, rest)
        return pieces
    # E.g. a 6-character word and three forms is split as [4, 1, 1].
    first = word[: len(word) - (len(ud_forms) - 1)]
    pieces = [first]
    rest = word[len(first) :]
    for _ in range(1, len(ud_forms)):
        assert rest
        pieces.append(rest[:1])
        rest = rest[1:]
    assert len(rest) == 0, (word, pieces, rest)
    return pieces
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(name, ud_scores):
    """Print one tab-separated row of scores and return them as a dict.

    name: label printed in the first column.
    ud_scores: mapping with the keys "Words", "Sentences", "XPOS", "UAS" and
        "LAS" whose values expose an f1 attribute, or None to report 0.0 for
        every metric.

    Returns a dict with the keys "words", "sents", "tags", "uas" and "las"
    holding percentages (f1 * 100).
    """
    metrics = (
        ("words", "Words"),
        ("sents", "Sentences"),
        ("tags", "XPOS"),
        ("uas", "UAS"),
        ("las", "LAS"),
    )
    if ud_scores is not None:
        fields = dict((field, ud_scores[key].f1 * 100) for field, key in metrics)
    else:
        fields = dict((field, 0.0) for field, _ in metrics)
    # name is passed as a format argument rather than spliced into the
    # template, so a label containing "{" or "}" cannot break formatting.
    tpl = "\t".join(
        ("{name}", "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
    )
    print(tpl.format(name=name, **fields))
    return fields
|
|
||||||
|
|
||||||
|
|
||||||
def get_token_split_start(token):
    """Return the first token of the split group token belongs to, or None.

    A token with empty text belongs to the group started by the nearest
    preceding token with non-empty text. A token with non-empty text starts
    a group when the token after it has empty text. Any other token is not
    in a group and yields None.
    """
    if token.text != "":
        has_next = (token.i + 1) < len(token.doc)
        if has_next and token.nbor(1).text == "":
            return token
        return None
    assert token.i != 0
    # Walk left until a token with real text is found.
    offset = -1
    while token.nbor(offset).text == "":
        offset -= 1
    return token.nbor(offset)
|
|
||||||
|
|
||||||
|
|
||||||
def get_token_split_end(token):
    """Return the last token of the split group token belongs to, or None.

    The group extends to the right over every following token whose text is
    empty. A token with non-empty text that is not followed by an empty
    token is not in a group and yields None.
    """
    doc_len = len(token.doc)
    if token.i + 1 == doc_len:
        # Last token of the doc: only an empty token can end a group here.
        if token.text == "":
            return token
        return None
    if token.text != "" and token.nbor(1).text != "":
        return None
    offset = 1
    while token.i + offset < doc_len and token.nbor(offset).text == "":
        offset += 1
    return token.nbor(offset - 1)
|
|
||||||
|
|
||||||
|
|
||||||
##################
|
|
||||||
# Initialization #
|
|
||||||
##################
|
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(experiments_dir, corpus):
    """Load the pipeline stored at experiments_dir / corpus / "best-model"."""
    model_dir = experiments_dir / corpus / "best-model"
    return spacy.load(model_dir)
|
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
    """Add a parser component to nlp and return nlp.

    docs, golds, config and device are accepted but not used.
    """
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    test_data_dir=(
        "Path to Universal Dependencies test data",
        "positional",
        None,
        Path,
    ),
    experiment_dir=("Parent directory with output model", "positional", None, Path),
    corpus=(
        "UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
        "positional",
        None,
        str,
    ),
)
def main(test_data_dir, experiment_dir, corpus):
    """Evaluate a trained model on the CoNLL 2017 test and dev sections.

    For each section the model is run on the UDPipe-tokenized input and on
    the raw text. A score row is printed for each, and the parse and the
    scores are written under experiment_dir / corpus.
    """
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)

    treebank_code = nlp.meta["treebank"]
    section_dirs = {
        "dev": "conll17-ud-development-2017-03-19",
        "test": "conll17-ud-test-2017-05-09",
    }
    model_dir = experiment_dir / corpus
    for section in ("test", "dev"):
        section_dir = section_dirs[section]
        input_dir = test_data_dir / "input" / section_dir
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
        inputs = {
            "gold": gold_path,
            "udp": input_dir / (treebank_code + "-udpipe.conllu"),
            "raw": input_dir / (treebank_code + ".txt"),
        }

        print("\t".join([section, "LAS", "UAS", "TAG", "SENT", "WORD"]))
        for input_type in ("udp", "raw"):
            # Both input types share one output path per section, so the
            # later run overwrites the earlier one.
            output_path = model_dir / "{section}.conllu".format(section=section)
            parsed_docs, test_scores = evaluate(
                nlp, inputs[input_type], gold_path, output_path
            )
            accuracy = print_results(input_type, test_scores)
            acc_path = model_dir / "{section}-accuracy.json".format(section=section)
            srsly.write_json(acc_path, accuracy)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,570 +0,0 @@
|
||||||
# flake8: noqa
|
|
||||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
import spacy.util
|
|
||||||
from bin.ud import conll17_ud_eval
|
|
||||||
from spacy.tokens import Token, Doc
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.util import compounding, minibatch, minibatch_by_words
|
|
||||||
from spacy.syntax.nonproj import projectivize
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy import displacy
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
import random
|
|
||||||
|
|
||||||
from spacy import lang
|
|
||||||
from spacy.lang import zh
|
|
||||||
from spacy.lang import ja
|
|
||||||
|
|
||||||
try:
|
|
||||||
import torch
|
|
||||||
except ImportError:
|
|
||||||
torch = None
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# Data reading #
|
|
||||||
################
|
|
||||||
|
|
||||||
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
space_re = re.compile(r"\s+")


def split_text(text):
    """Split raw text into paragraphs.

    Paragraphs are separated by a blank line ("\\n\\n"). Each paragraph is
    stripped and every run of whitespace inside it is collapsed to one space.
    Returns the list of normalised paragraph strings.
    """
    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(
    nlp,
    conllu_file,
    text_file,
    raw_text=True,
    oracle_segments=False,
    max_doc_length=None,
    limit=None,
):
    """Read CoNLL-U data into parallel lists of Doc and GoldParse objects.

    With raw_text=True, sentences are grouped (up to max_doc_length per
    group), re-joined into text, tokenized with nlp.make_doc and paired with
    the gold annotations. With oracle_segments=True, one Doc per sentence is
    built directly from the gold tokens. At least one of the two must be
    True, otherwise ValueError is raised.

    Reading stops early once limit docs have been collected.
    Returns (docs, golds).
    """
    if not (raw_text or oracle_segments):
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs = []
    golds = []
    for _paragraph, conllu_doc in zip(paragraphs, conllu):
        pending = []
        for conllu_sent in conllu_doc:
            annot = defaultdict(list)
            for row in conllu_sent:
                id_, word, lemma, pos, tag, morph, head, dep, _, space_after = row
                # Skip empty nodes ("1.1") and multi-word ranges ("1-2").
                if "." in id_ or "-" in id_:
                    continue
                index = int(id_) - 1
                # A root points at itself; other heads become zero-based.
                head_index = int(head) - 1 if head != "0" else index
                features = _parse_morph_string(morph)
                features.add("POS_%s" % pos)
                annot["words"].append(word)
                annot["tags"].append(tag)
                annot["morphology"].append(features)
                annot["heads"].append(head_index)
                annot["deps"].append("ROOT" if dep == "root" else dep)
                annot["spaces"].append(space_after == "_")
            annot["entities"] = ["-"] * len(annot["words"])
            annot["heads"], annot["deps"] = projectivize(annot["heads"], annot["deps"])
            if oracle_segments:
                sent_doc = Doc(nlp.vocab, words=annot["words"], spaces=annot["spaces"])
                docs.append(sent_doc)
                golds.append(GoldParse(sent_doc, **annot))
                assert golds[-1].morphology is not None

            pending.append(annot)
            if raw_text and max_doc_length and len(pending) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, pending)
                assert gold.morphology is not None
                pending = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        # Flush the sentences left over at the end of the document.
        if raw_text and pending:
            doc, gold = _make_gold(nlp, None, pending)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
|
|
||||||
|
|
||||||
def _parse_morph_string(morph_string):
|
|
||||||
if morph_string == '_':
|
|
||||||
return set()
|
|
||||||
output = []
|
|
||||||
replacements = {'1': 'one', '2': 'two', '3': 'three'}
|
|
||||||
for feature in morph_string.split('|'):
|
|
||||||
key, value = feature.split('=')
|
|
||||||
value = replacements.get(value, value)
|
|
||||||
value = value.split(',')[0]
|
|
||||||
output.append('%s_%s' % (key, value.lower()))
|
|
||||||
return set(output)
|
|
||||||
|
|
||||||
def read_conllu(file_):
    """Parse CoNLL-U formatted lines into nested lists.

    file_: an iterable of text lines (e.g. an open file).

    Returns a list of documents. Each document is a list of sentences and
    each sentence is a list of token rows, where a row is the list of the
    10 tab-separated fields of one line.

    Raises ValueError if a token line does not have exactly 10 fields.
    """
    docs = []
    doc = []
    sent = []
    for line in file_:
        if line.startswith("# newdoc"):
            # Flush a sentence that was not terminated by a blank line so it
            # is not carried over into the next document.
            if sent:
                doc.append(sent)
                sent = []
            if doc:
                docs.append(doc)
            doc = []
        elif line.startswith("#"):
            # Any other comment line carries no token data.
            continue
        elif not line.strip():
            # A blank line terminates the current sentence.
            if sent:
                doc.append(sent)
            sent = []
        else:
            fields = line.strip().split("\t")
            if len(fields) != 10:
                raise ValueError(
                    "Expected 10 tab-separated fields in CoNLL-U line, "
                    "found %d: %r" % (len(fields), line)
                )
            sent.append(fields)
    # Flush whatever is pending at end of input.
    if sent:
        doc.append(sent)
    if doc:
        docs.append(doc)
    return docs
|
|
||||||
|
|
||||||
|
|
||||||
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    """Build a (Doc, GoldParse) pair from per-sentence annotations.

    nlp: pipeline whose make_doc tokenizes the text.
    text: the raw text, or None to rebuild it from the words and spaces.
    sent_annots: list of per-sentence dicts with the keys "words", "tags",
        "heads", "deps", "morphology", "entities" and "spaces".
    drop_deps: each head/label pair is set to None when random.random()
        falls below this value.
    """
    copied_fields = ("words", "tags", "deps", "morphology", "entities", "spaces")
    flat = defaultdict(list)
    sent_starts = []
    for annot in sent_annots:
        # Heads are sentence-relative; shift them by the words seen so far.
        offset = len(flat["words"])
        flat["heads"].extend([offset + head for head in annot["heads"]])
        for field in copied_fields:
            flat[field].extend(annot[field])
        sent_starts.extend([True] + [False] * (len(annot["words"]) - 1))
    # Construct text if necessary
    assert len(flat["words"]) == len(flat["spaces"])
    if text is None:
        chunks = []
        for word, space in zip(flat["words"], flat["spaces"]):
            chunks.append(word + " " * space)
        text = "".join(chunks)
    doc = nlp.make_doc(text)
    flat.pop("spaces")
    gold = GoldParse(doc, **flat)
    gold.sent_starts = sent_starts
    for index in range(len(gold.heads)):
        if random.random() < drop_deps:
            gold.heads[index] = None
            gold.labels[index] = None

    return doc, gold
|
|
||||||
|
|
||||||
|
|
||||||
#############################
|
|
||||||
# Data transforms for spaCy #
|
|
||||||
#############################
|
|
||||||
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
    """Convert Doc/GoldParse pairs into the tuple format of begin_training.

    Each result item is (text, sents), where sents holds a single entry
    ((ids, words, tags, heads, labels, iob), []) built by transposing
    gold.orig_annot.
    """
    result = []
    for doc, gold in zip(docs, golds):
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        annotations = (ids, words, tags, heads, labels, iob)
        result.append((doc.text, [(annotations, [])]))
    return result
|
|
||||||
|
|
||||||
|
|
||||||
##############
|
|
||||||
# Evaluation #
|
|
||||||
##############
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Parse the input at text_loc, write it to sys_loc and score it.

    If text_loc names a ".conllu" file, one Doc is built per sentence from
    the second column of each row and passed through every pipeline
    component. Otherwise the file is read as raw text, split into paragraphs
    and processed with nlp.pipe. The system output is written in CoNLL-U
    format to sys_loc and compared with gold_loc using conll17_ud_eval.

    limit is accepted but not used. Returns (docs, scores).
    """
    if text_loc.parts[-1].endswith(".conllu"):
        with text_loc.open(encoding="utf8") as conllu_file:
            conllu_docs = read_conllu(conllu_file)
        docs = []
        for conllu_doc in conllu_docs:
            for conllu_sent in conllu_doc:
                forms = [row[1] for row in conllu_sent]
                docs.append(Doc(nlp.vocab, words=forms))
        # Pre-tokenized docs bypass the tokenizer, so apply each component.
        for _name, pipe in nlp.pipeline:
            docs = list(pipe.pipe(docs))
    else:
        with text_loc.open("r", encoding="utf8") as text_file:
            raw = text_file.read()
        docs = list(nlp.pipe(split_text(raw)))
    with sys_loc.open("w", encoding="utf8") as out_file:
        write_conllu(docs, out_file)
    with gold_loc.open("r", encoding="utf8") as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
        with sys_loc.open("r", encoding="utf8") as sys_file:
            sys_ud = conll17_ud_eval.load_conllu(sys_file)
    return docs, conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
|
||||||
|
|
||||||
|
|
||||||
def write_conllu(docs, file_):
    """Write parsed docs to file_ in CoNLL-U format.

    Registers the Token extensions used for output if they are missing.
    For parsed docs, runs of "subtok" tokens are merged (with the token
    following the run), skipping spans that overlap one already merged.

    Raises ValueError when a token's head lies outside its sentence, after
    printing the sentence and up to ten tokens on either side.
    """
    extensions = (
        ("get_conllu_lines", {"method": get_token_conllu}),
        ("begins_fused", {"default": False}),
        ("inside_fused", {"default": False}),
    )
    for ext_name, ext_kwargs in extensions:
        if not Token.has_extension(ext_name):
            Token.set_extension(ext_name, **ext_kwargs)

    def dump(words):
        for word in words:
            print(word.i, word.head.i, word.text, word.dep_)

    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc) if doc.is_parsed else []
        spans = [doc[start : end + 1] for _, start, end in matches]
        claimed = set()
        with doc.retokenize() as retokenizer:
            for span in spans:
                indices = set(range(span.start, span.end))
                if indices.intersection(claimed):
                    continue
                retokenizer.merge(span)
                claimed.update(indices)

        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    dump(doc[sent[0].i - 10 : sent[0].i])
                    dump(sent)
                    dump(doc[sent[-1].i : sent[-1].i + 10])
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
    """Print one tab-separated progress row for epoch itn.

    losses: dict of accumulated losses; "parser", "morphologizer" and
        "tagger" are read, defaulting to 0.0.
    ud_scores: mapping whose values expose f1 for the keys "Words",
        "Sentences", "XPOS", "UAS", "LAS" and "Feats".

    The column header is printed before the first row (itn == 0).
    """

    def percent(metric):
        return ud_scores[metric].f1 * 100

    fields = {
        "dep_loss": losses.get("parser", 0.0),
        "morph_loss": losses.get("morphologizer", 0.0),
        "tag_loss": losses.get("tagger", 0.0),
        "words": percent("Words"),
        "sents": percent("Sentences"),
        "tags": percent("XPOS"),
        "uas": percent("UAS"),
        "las": percent("LAS"),
        "morph": percent("Feats"),
    }
    if itn == 0:
        header = ["Epoch", "P.Loss", "M.Loss", "LAS", "UAS", "TAG", "MORPH", "SENT", "WORD"]
        print("\t".join(header))
    columns = [
        "{:d}",
        "{dep_loss:.1f}",
        "{morph_loss:.1f}",
        "{las:.1f}",
        "{uas:.1f}",
        "{tags:.1f}",
        "{morph:.1f}",
        "{sents:.1f}",
        "{words:.1f}",
    ]
    print("\t".join(columns).format(itn, **fields))
|
|
||||||
|
|
||||||
|
|
||||||
# def get_sent_conllu(sent, sent_id):
|
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
|
||||||
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
    """Return the CoNLL-U line(s) for token.

    token: a token carrying the begins_fused / inside_fused extensions.
    i: zero-based position of the token inside its sentence.

    Returns the token's 10-column, tab-separated line. If the token begins
    a fused group, a range line ("start-end", form, eight "_") is emitted on
    the line before it.
    """
    lines = []
    if token._.begins_fused:
        n = 1
        # Count the tokens in the group without running past the doc end.
        while token.i + n < len(token.doc) and token.nbor(n)._.inside_fused:
            n += 1
        # Token ids are 1-based, so the group covers ids i + 1 .. i + n.
        id_ = "%d-%d" % (i + 1, i + n)
        # The range line is a single tab-separated row, like any other.
        lines.append("\t".join([id_, token.text] + ["_"] * 8))
    if token.head.i == token.i:
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    replacements = {"one": "1", "two": "2", "three": "3"}
    feat_str = []
    for feat in token.morph:
        if not feat.startswith("begin") and not feat.startswith("end"):
            key, value = feat.split("_", 1)
            value = replacements.get(value, value)
            feat_str.append("%s=%s" % (key, value.title()))
    feat_str = "|".join(feat_str) if feat_str else "_"
    fields = [
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        feat_str,
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##################
|
|
||||||
# Initialization #
|
|
||||||
##################
|
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config, vectors=None):
    """Create a blank pipeline for the language prefix of corpus.

    The language code is the part of corpus before the first "_". When
    config.vectors is set, vectors must name a directory; if it contains a
    sub-directory named after corpus, the vocab is loaded from its "vocab"
    entry. The corpus name is stored in nlp.meta["treebank"].

    Raises ValueError if config.vectors is set but vectors is not.
    """
    lang_code = corpus.split("_")[0]
    nlp = spacy.blank(lang_code)
    if config.vectors:
        if not vectors:
            raise ValueError(
                "config asks for vectors, but no vectors "
                "directory set on command line (use -v)"
            )
        corpus_vectors = Path(vectors) / corpus
        if corpus_vectors.exists():
            nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
    nlp.meta["treebank"] = corpus
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
    """Add tagger, morphologizer and parser to nlp and begin training.

    Optional multitask objectives are added to the parser according to
    config, every non-None gold tag is registered with the tagger, and
    pretrained tok2vec weights are loaded when config.pretrained_tok2vec is
    set. Returns the optimizer produced by nlp.begin_training.
    """
    tagger = nlp.create_pipe("tagger", config={"set_morphology": False})
    nlp.add_pipe(tagger)
    nlp.add_pipe(nlp.create_pipe("morphologizer"))
    nlp.add_pipe(nlp.create_pipe("parser"))
    if config.multitask_tag:
        nlp.parser.add_multitask_objective("tag")
    if config.multitask_sent:
        nlp.parser.add_multitask_objective("sent_start")
    for gold in golds:
        for tag in gold.tags:
            if tag is None:
                continue
            nlp.tagger.add_label(tag)
    use_gpu = torch is not None and device != -1
    if use_gpu:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")

    def get_gold_tuples():
        return golds_to_gold_tuples(docs, golds)

    optimizer = nlp.begin_training(
        get_gold_tuples,
        device=device,
        subword_features=config.subword_features,
        conv_depth=config.conv_depth,
        bilstm_depth=config.bilstm_depth,
    )
    if config.pretrained_tok2vec:
        _load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
    return optimizer
|
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
|
||||||
"""Load pretrained weights for the 'token-to-vector' part of the component
|
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
|
||||||
"""
|
|
||||||
with Path(loc).open("rb", encoding="utf8") as file_:
|
|
||||||
weights_data = file_.read()
|
|
||||||
loaded = []
|
|
||||||
for name, component in nlp.pipeline:
|
|
||||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
|
||||||
component.tok2vec.from_bytes(weights_data)
|
|
||||||
loaded.append(name)
|
|
||||||
return loaded
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
|
||||||
# Command line helpers #
|
|
||||||
########################
|
|
||||||
|
|
||||||
|
|
||||||
class Config(object):
    """Container for the training settings, exposed as plain attributes."""

    def __init__(
        self,
        vectors=None,
        max_doc_length=10,
        multitask_tag=False,
        multitask_sent=False,
        multitask_dep=False,
        multitask_vectors=None,
        bilstm_depth=0,
        nr_epoch=30,
        min_batch_size=100,
        max_batch_size=1000,
        batch_by_words=True,
        dropout=0.2,
        conv_depth=4,
        subword_features=True,
        vectors_dir=None,
        pretrained_tok2vec=None,
    ):
        """Store every keyword argument as an attribute of the same name.

        When vectors_dir is given, vectors and multitask_vectors default to
        True unless they were set explicitly.
        """
        if vectors_dir is not None:
            if vectors is None:
                vectors = True
            if multitask_vectors is None:
                multitask_vectors = True
        # Assigned explicitly rather than via locals(), which would also
        # store a reference to the instance itself as `self.self`.
        self.vectors = vectors
        self.max_doc_length = max_doc_length
        self.multitask_tag = multitask_tag
        self.multitask_sent = multitask_sent
        self.multitask_dep = multitask_dep
        self.multitask_vectors = multitask_vectors
        self.bilstm_depth = bilstm_depth
        self.nr_epoch = nr_epoch
        self.min_batch_size = min_batch_size
        self.max_batch_size = max_batch_size
        self.batch_by_words = batch_by_words
        self.dropout = dropout
        self.conv_depth = conv_depth
        self.subword_features = subword_features
        self.vectors_dir = vectors_dir
        self.pretrained_tok2vec = pretrained_tok2vec

    @classmethod
    def load(cls, loc, vectors_dir=None):
        """Build a Config from the JSON file at loc.

        A non-None vectors_dir overrides the value from the file.
        """
        with Path(loc).open("r", encoding="utf8") as file_:
            cfg = json.load(file_)
        if vectors_dir is not None:
            cfg["vectors_dir"] = vectors_dir
        return cls(**cfg)
|
|
||||||
|
|
||||||
|
|
||||||
class Dataset(object):
    """Locate the .conllu and .txt files of one treebank section."""

    def __init__(self, path, section):
        """Scan the directory path for files whose name contains section.

        Sets self.conllu and self.text to the matching paths and self.lang
        to the part of the .conllu file name before the first "-" or "_".

        Raises IOError if no .conllu file is found.
        """
        self.path = path
        self.section = section
        self.conllu = None
        self.text = None
        for file_path in self.path.iterdir():
            name = file_path.parts[-1]
            if section not in name:
                continue
            if name.endswith("conllu"):
                self.conllu = file_path
            elif name.endswith("txt"):
                self.text = file_path
        if self.conllu is None:
            msg = "Could not find .conllu file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        # A missing .txt file is tolerated: self.text simply stays None.
        self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
    """Bundle the train and dev datasets of one treebank directory."""

    def __init__(self, ud_path, treebank, **cfg):
        """Locate the "train" and "dev" sections under ud_path / treebank.

        The language is taken from the training set. Extra keyword
        arguments are accepted and ignored.
        """
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, "train")
        self.dev = Dataset(treebank_dir, "dev")
        self.lang = self.train.lang
|
|
||||||
|
|
||||||
|
|
||||||
def _read_training_data(nlp, paths, config, limit, **kwargs):
    """Read the training section, closing both input files afterwards."""
    with paths.train.conllu.open(encoding="utf8") as conllu_file:
        with paths.train.text.open(encoding="utf8") as text_file:
            return read_data(
                nlp,
                conllu_file,
                text_file,
                max_doc_length=config.max_doc_length,
                limit=limit,
                **kwargs
            )


@plac.annotations(
    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
    parses_dir=("Directory to write the development parses", "positional", None, Path),
    corpus=(
        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
        "positional",
        None,
        str,
    ),
    config=("Path to json formatted config file", "option", "C", Path),
    limit=("Size limit", "option", "n", int),
    gpu_device=("Use GPU", "option", "g", int),
    use_oracle_segments=("Use oracle segments", "flag", "G", int),
    vectors_dir=(
        "Path to directory with pretrained vectors, named e.g. en/",
        "option",
        "v",
        Path,
    ),
)
def main(
    ud_dir,
    parses_dir,
    corpus,
    config=None,
    limit=0,
    gpu_device=-1,
    vectors_dir=None,
    use_oracle_segments=False,
):
    """Train a pipeline on a UD treebank and evaluate it after every epoch.

    The development parse of each epoch is written to
    parses_dir / corpus / "epoch-<i>.conllu" and a progress row is printed.
    """
    Token.set_extension("get_conllu_lines", method=get_token_conllu)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)

    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False

    if config is not None:
        config = Config.load(config, vectors_dir=vectors_dir)
    else:
        config = Config(vectors_dir=vectors_dir)
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        # Also create parses_dir itself when it does not exist yet.
        (parses_dir / corpus).mkdir(parents=True)
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

    docs, golds = _read_training_data(nlp, paths, config, limit)

    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
        # The data is re-read on every epoch; the helper closes the files so
        # handles do not accumulate over the run.
        docs, golds = _read_training_data(
            nlp,
            paths,
            config,
            limit,
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
        Xs = list(zip(docs, golds))
        random.shuffle(Xs)
        if config.batch_by_words:
            batches = minibatch_by_words(Xs, size=batch_sizes)
        else:
            batches = minibatch(Xs, size=batch_sizes)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        with nlp.use_params(optimizer.averages):
            # With oracle segments the gold .conllu doubles as the input.
            if use_oracle_segments:
                parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
                                               paths.dev.conllu, out_path)
            else:
                parsed_docs, scores = evaluate(nlp, paths.dev.text,
                                               paths.dev.conllu, out_path)
            print_progress(i, losses, scores)
|
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
    """Write a displaCy dependency rendering to /tmp/parses.html.

    i: batch number, used as the title stored on the first item.
    to_render: indexable sequence of parsed documents; only the first five
        are passed to displacy.render.
    """
    first = to_render[0]
    first.user_data["title"] = "Batch %d" % i
    target = Path("/tmp/parses.html")
    with target.open("w", encoding="utf8") as out_file:
        # Render after opening the file, exactly as before, so that the file
        # is created (and truncated) even if rendering fails.
        out_file.write(displacy.render(to_render[:5], style="dep", page=True))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,19 +0,0 @@
|
||||||
<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
|
|
||||||
|
|
||||||
# spaCy examples
|
|
||||||
|
|
||||||
The examples are Python scripts with well-behaved command line interfaces. For
|
|
||||||
more detailed usage guides, see the [documentation](https://spacy.io/usage/).
|
|
||||||
|
|
||||||
To see the available arguments, you can use the `--help` or `-h` flag:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ python examples/training/train_ner.py --help
|
|
||||||
```
|
|
||||||
|
|
||||||
While we try to keep the examples up to date, they are not currently exercised
|
|
||||||
by the test suite, as some of them require significant data downloads or take
|
|
||||||
time to train. If you find that an example is no longer running,
|
|
||||||
[please tell us](https://github.com/explosion/spaCy/issues)! We know there's
|
|
||||||
nothing worse than trying to figure out what you're doing wrong, and it turns
|
|
||||||
out your code was never the problem.
|
|
|
@ -1,267 +0,0 @@
|
||||||
"""
|
|
||||||
This example shows how to use an LSTM sentiment classification model trained
|
|
||||||
using Keras in spaCy. spaCy splits the document into sentences, and each
|
|
||||||
sentence is classified using the LSTM. The scores for the sentences are then
|
|
||||||
aggregated to give the document score. This kind of hierarchical model is quite
|
|
||||||
difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras
|
|
||||||
example on this dataset performs quite poorly, because it cuts off the documents
|
|
||||||
so that they're a fixed size. This hurts review accuracy a lot, because people
|
|
||||||
often summarise their rating in the final sentence.
|
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
spacy download en_vectors_web_lg
|
|
||||||
pip install keras==2.0.9
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
"""
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
import pathlib
|
|
||||||
import cytoolz
|
|
||||||
import numpy
|
|
||||||
from keras.models import Sequential, model_from_json
|
|
||||||
from keras.layers import LSTM, Dense, Embedding, Bidirectional
|
|
||||||
from keras.layers import TimeDistributed
|
|
||||||
from keras.optimizers import Adam
|
|
||||||
import thinc.extra.datasets
|
|
||||||
from spacy.compat import pickle
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
|
|
||||||
class SentimentAnalyser(object):
    """Pipeline component that stores a Keras model's score on doc.sentiment."""

    @classmethod
    def load(cls, path, nlp, max_length=100):
        """Rebuild a saved model from a directory and wrap it in a component.

        path: directory holding "config.json" (model architecture as JSON)
            and "model" (pickled weights). Must support the `/` operator,
            e.g. a pathlib.Path.
        nlp: loaded pipeline; its vocab supplies the embedding table.
        max_length: number of token positions fed to the model per input.
        RETURNS: a new instance of `cls`.
        """
        with (path / "config.json").open() as file_:
            model = model_from_json(file_.read())
        # NOTE: unpickling can execute arbitrary code. Only load model
        # directories that come from a trusted source.
        with (path / "model").open("rb") as file_:
            lstm_weights = pickle.load(file_)
        # The embedding table is not part of the pickled weights, so it is
        # taken from the vocab and put back in first position.
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + lstm_weights)
        return cls(model, max_length=max_length)

    def __init__(self, model, max_length=100):
        """Store the model and the maximum input length.

        model: object with a `predict` method (a Keras model).
        max_length: number of token positions fed to the model per input.
        """
        self._model = model
        self.max_length = max_length

    def __call__(self, doc):
        """Score a single doc, store the score on it and return the doc.

        The doc is returned so that the component can be used in a pipeline,
        where each component's return value is handed to the next one.
        """
        X = get_features([doc], self.max_length)
        y = self._model.predict(X)
        self.set_sentiment(doc, y)
        return doc

    def pipe(self, docs, batch_size=1000):
        """Score a stream of docs sentence by sentence and yield the docs.

        docs: iterable of docs that expose `.sents`.
        batch_size: number of docs whose sentences are predicted together.
        YIELDS: the same docs, in order, after their sentiment was updated.
        """
        for minibatch in cytoolz.partition_all(batch_size, docs):
            minibatch = list(minibatch)
            sentences = []
            for doc in minibatch:
                sentences.extend(doc.sents)
            Xs = get_features(sentences, self.max_length)
            ys = self._model.predict(Xs)
            # Each sentence shifts its document's score by (prediction - 0.5).
            # NOTE(review): __call__ stores the raw prediction instead, so the
            # two code paths put differently scaled values on doc.sentiment.
            for sent, label in zip(sentences, ys):
                sent.doc.sentiment += label - 0.5
            for doc in minibatch:
                yield doc

    def set_sentiment(self, doc, y):
        """Store the first element of `y`, converted to float, on the doc."""
        doc.sentiment = float(y[0])
        # Sentiment has a native slot for a single float.
        # For arbitrary data storage, there's:
        # doc.user_data['my_data'] = y
|
|
||||||
|
|
||||||
|
|
||||||
def get_labelled_sentences(docs, doc_labels):
    """Split docs into sentences, giving each sentence its document's label.

    docs: iterable of objects exposing an iterable `.sents`.
    doc_labels: iterable of labels, aligned with `docs`.
    RETURNS: (sentences, labels) where `sentences` is a flat list and
        `labels` is an int32 numpy array of the same length.
    """
    flat_sentences = []
    flat_labels = []
    for doc, label in zip(docs, doc_labels):
        doc_sentences = list(doc.sents)
        flat_sentences.extend(doc_sentences)
        # One copy of the document label per sentence.
        flat_labels.extend([label] * len(doc_sentences))
    return flat_sentences, numpy.asarray(flat_labels, dtype="int32")
|
|
||||||
|
|
||||||
|
|
||||||
def get_features(docs, max_length):
    """Turn docs into a matrix of vector-table row ids, one row per doc.

    docs: iterable of token sequences. Each token must expose `orth` and
        `vocab.vectors.find(key=...)`.
    max_length: number of columns; longer inputs are truncated.
    RETURNS: int32 numpy array of shape (number of docs, max_length).
        Positions past the end of a doc, and tokens for which `find` returns
        a negative id, hold 0.
    """
    docs = list(docs)
    Xs = numpy.zeros((len(docs), max_length), dtype="int32")
    for i, doc in enumerate(docs):
        # Pairing tokens with range(max_length) truncates long inputs and,
        # unlike a manual counter checked after the write, never indexes
        # column 0 of a zero-width matrix when max_length is 0.
        for j, token in zip(range(max_length), doc):
            vector_id = token.vocab.vectors.find(key=token.orth)
            if vector_id >= 0:
                Xs[i, j] = vector_id
    return Xs
|
|
||||||
|
|
||||||
|
|
||||||
def train(
    train_texts,
    train_labels,
    dev_texts,
    dev_labels,
    lstm_shape,
    lstm_settings,
    lstm_optimizer,
    batch_size=100,
    nb_epoch=5,
    by_sentence=True,
):
    """Fit the LSTM on the training texts and return the fitted model.

    train_texts / dev_texts: iterables of raw text.
    train_labels / dev_labels: labels aligned with the texts.
    lstm_shape: dict read for "max_length" here and passed to compile_lstm.
    lstm_settings: dict passed to compile_lstm.
    lstm_optimizer: accepted but not used by this function.
    batch_size, nb_epoch: forwarded to model.fit.
    by_sentence: if true, train on sentences that inherit their document's
        label instead of on whole documents.
    """
    print("Loading spaCy")
    nlp = spacy.load("en_vectors_web_lg")
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    model = compile_lstm(get_embeddings(nlp.vocab), lstm_shape, lstm_settings)

    print("Parsing texts...")
    train_docs = list(nlp.pipe(train_texts))
    dev_docs = list(nlp.pipe(dev_texts))
    if by_sentence:
        train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
        dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)

    train_X = get_features(train_docs, lstm_shape["max_length"])
    dev_X = get_features(dev_docs, lstm_shape["max_length"])
    validation = (dev_X, dev_labels)
    model.fit(
        train_X,
        train_labels,
        validation_data=validation,
        epochs=nb_epoch,
        batch_size=batch_size,
    )
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def compile_lstm(embeddings, shape, settings):
    """Build and compile the Keras model used for sentence classification.

    embeddings: 2-d array used as the fixed weights of the embedding layer.
    shape: dict with "max_length", "nr_hidden" and "nr_class".
    settings: dict with "dropout" and "lr".
    RETURNS: the compiled Sequential model.
    """
    model = Sequential()

    # Frozen embedding layer initialised from the given table.
    embed_layer = Embedding(
        embeddings.shape[0],
        embeddings.shape[1],
        input_length=shape["max_length"],
        trainable=False,
        weights=[embeddings],
        mask_zero=True,
    )
    model.add(embed_layer)

    # Per-position projection to the hidden width, without a bias term.
    projection = TimeDistributed(Dense(shape["nr_hidden"], use_bias=False))
    model.add(projection)

    recurrent = LSTM(
        shape["nr_hidden"],
        recurrent_dropout=settings["dropout"],
        dropout=settings["dropout"],
    )
    model.add(Bidirectional(recurrent))

    model.add(Dense(shape["nr_class"], activation="sigmoid"))

    optimizer = Adam(lr=settings["lr"])
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def get_embeddings(vocab):
    """Return the `data` attribute of the vocab's vector table."""
    vector_table = vocab.vectors
    return vector_table.data
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(model_dir, texts, labels, max_length=100):
    """Load the saved model and return its accuracy on the given texts.

    model_dir: directory passed to SentimentAnalyser.load.
    texts: iterable of raw text.
    labels: indexable sequence of labels aligned with `texts`; each label is
        interpreted through bool().
    max_length: forwarded to SentimentAnalyser.load.
    RETURNS (float): fraction of documents whose thresholded sentiment
        matches the label. Raises ZeroDivisionError when `texts` is empty.
    """
    nlp = spacy.load("en_vectors_web_lg")
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    analyser = SentimentAnalyser.load(model_dir, nlp, max_length=max_length)
    nlp.add_pipe(analyser)

    n_correct = 0
    n_seen = 0
    for doc in nlp.pipe(texts, batch_size=1000):
        predicted_positive = bool(doc.sentiment >= 0.5)
        if predicted_positive == bool(labels[n_seen]):
            n_correct += 1
        n_seen += 1
    return float(n_correct) / n_seen
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(data_dir, limit=0):
    """Read labelled texts from the "pos" and "neg" subdirectories.

    data_dir: directory containing "pos" and "neg" subdirectories with one
        text per file. May be a string or a pathlib.Path; strings are
        converted, because the command line hands over plain strings and the
        `/` operator below needs a path object.
    limit: if >= 1, keep at most this many examples after shuffling.
    RETURNS: zip(*examples), which callers unpack into a tuple of texts and
        a tuple of labels (1 for "pos", 0 for "neg").
    """
    data_dir = pathlib.Path(data_dir)
    examples = []
    for subdir, label in (("pos", 1), ("neg", 0)):
        for filename in (data_dir / subdir).iterdir():
            # Decode explicitly instead of relying on the platform default
            # encoding, which cannot read non-ASCII text everywhere.
            with filename.open(encoding="utf8") as file_:
                text = file_.read()
            examples.append((text, label))
    random.shuffle(examples)
    if limit >= 1:
        examples = examples[:limit]
    return zip(*examples)  # Unzips into (texts, labels)
|
|
||||||
|
|
||||||
|
|
||||||
# Command-line entry point: either evaluates a saved model on the
# development data (is_runtime) or trains a model and, when model_dir is
# given, saves its weights (minus the embedding table) and its JSON config.
@plac.annotations(
    train_dir=("Location of training file or directory"),
    dev_dir=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
    nr_hidden=("Number of hidden units", "option", "H", int),
    max_length=("Maximum sentence length", "option", "L", int),
    dropout=("Dropout", "option", "d", float),
    learn_rate=("Learn rate", "option", "e", float),
    nb_epoch=("Number of training epochs", "option", "i", int),
    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
    nr_examples=("Limit to N examples", "option", "n", int),
)
def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    # The command line supplies plain strings, while the readers and writers
    # below build file names with the `/` operator, so convert every
    # directory argument that was given.
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is not None:
        train_dir = pathlib.Path(train_dir)
    if dev_dir is not None:
        dev_dir = pathlib.Path(dev_dir)
    # The bundled IMDB data is only fetched when a directory is missing.
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            # read_data takes (data_dir, limit). The previous call also
            # passed imdb_data positionally, which supplied `limit` twice
            # and raised a TypeError.
            dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype="int32")
        dev_labels = numpy.asarray(dev_labels, dtype="int32")
        lstm = train(
            train_texts,
            train_labels,
            dev_texts,
            dev_labels,
            {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
            {"dropout": dropout, "lr": learn_rate},
            {},
            nb_epoch=nb_epoch,
            batch_size=batch_size,
        )
        weights = lstm.get_weights()
        if model_dir is not None:
            # The first weight array is skipped; SentimentAnalyser.load puts
            # the vocab's embedding table back in that position.
            with (model_dir / "model").open("wb") as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / "config.json").open("w") as file_:
                file_.write(lstm.to_json())
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,82 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""A simple example of extracting relations between phrases and entities using
|
|
||||||
spaCy's named entity recognizer and the dependency parse. Here, we extract
|
|
||||||
money and currency values (entities labelled as MONEY) and then check the
|
|
||||||
dependency tree to find the noun phrase they are referring to – for example:
|
|
||||||
$9.4 million --> Net income.
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.2.1
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
|
|
||||||
TEXTS = [
|
|
||||||
"Net income was $9.4 million compared to the prior year of $2.7 million.",
|
|
||||||
"Revenue exceeded twelve billion dollars, with a loss of $1b.",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Load the model, then print one tab-separated row per relation found in
# each of the example texts.
@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str)
)
def main(model="en_core_web_sm"):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    row_template = "{:<10}\t{}\t{}"
    for text in TEXTS:
        for head, money in extract_currency_relations(nlp(text)):
            print(row_template.format(head.text, money.ent_type_, money.text))
|
|
||||||
|
|
||||||
|
|
||||||
def filter_spans(spans):
    """Filter a sequence of spans so that the result contains no overlaps.

    When spans overlap, the longest one is kept; among spans of equal length
    the one that starts first is kept. For spaCy 2.1.4+ this function is
    also available as spacy.util.filter_spans().

    spans: iterable of objects with integer `start` and `end` attributes,
        where `end` is exclusive.
    RETURNS (list): the kept spans, sorted by start position.
    """

    def get_sort_key(span):
        return (span.end - span.start, -span.start)

    result = []
    seen_tokens = set()
    for span in sorted(spans, key=get_sort_key, reverse=True):
        # `end` is exclusive, so the last token of the span is `end - 1`.
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Only spans that are kept may claim their tokens. Letting a
            # rejected span mark tokens as seen would wrongly discard later
            # spans that overlap nothing in the result.
            seen_tokens.update(range(span.start, span.end))
    return sorted(result, key=lambda span: span.start)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_currency_relations(doc):
    """Pair each MONEY token with the token it refers to.

    Entities and noun chunks are first merged into single tokens (this
    modifies `doc` in place). A MONEY token that is an attribute or direct
    object is paired with the first "nsubj" to the left of its head; a MONEY
    token that is the object of a preposition is paired with the head of
    that preposition.

    RETURNS (list): (related token, money token) tuples in document order.
    """
    merge_candidates = list(doc.ents) + list(doc.noun_chunks)
    with doc.retokenize() as retokenizer:
        for span in filter_spans(merge_candidates):
            retokenizer.merge(span)

    relations = []
    for token in doc:
        if token.ent_type_ != "MONEY":
            continue
        if token.dep_ in ("attr", "dobj"):
            subjects = [w for w in token.head.lefts if w.dep_ == "nsubj"]
            if subjects:
                relations.append((subjects[0], token))
        elif token.dep_ == "pobj" and token.head.dep_ == "prep":
            relations.append((token.head.head, token))
    return relations
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# Net income MONEY $9.4 million
|
|
||||||
# the prior year MONEY $2.7 million
|
|
||||||
# Revenue MONEY twelve billion dollars
|
|
||||||
# a loss MONEY 1b
|
|
|
@ -1,67 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""This example shows how to navigate the parse tree including subtrees
|
|
||||||
attached to a word.
|
|
||||||
|
|
||||||
Based on issue #252:
|
|
||||||
"In the documents and tutorials the main thing I haven't found is
|
|
||||||
examples on how to break sentences down into small sub thoughts/chunks. The
|
|
||||||
noun_chunks is handy, but having examples on using the token.head to find small
|
|
||||||
(near-complete) sentence chunks would be neat. Lets take the example sentence:
|
|
||||||
"displaCy uses CSS and JavaScript to show you how computers understand language"
|
|
||||||
|
|
||||||
This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
|
|
||||||
[displaCy] uses CSS and Javascript [to + show]
|
|
||||||
show you how computers understand [language]
|
|
||||||
|
|
||||||
I'm assuming that we can use the token.head to build these groups."
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
|
|
||||||
# Parse one example sentence and print the subtrees of its clausal
# complements in two different ways.
@plac.annotations(model=("Model to load", "positional", None, str))
def main(model="en_core_web_sm"):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    sentence = (
        "displaCy uses CSS and JavaScript to show you how computers "
        "understand language"
    )
    doc = nlp(sentence)
    clause_heads = [word for word in doc if word.dep_ in ("xcomp", "ccomp")]

    # The most direct route: find the head of the subtree you are after and
    # iterate over it. Besides `.subtree` there are also the `.children`,
    # `.lefts` and `.rights` iterators.
    for head in clause_heads:
        print("".join(w.text_with_ws for w in head.subtree))

    # `.subtree` is a generator over tokens. To get a `Span` instead, slice
    # the doc between the head's `.left_edge` and `.right_edge`. A `Span` is
    # convenient because you can easily get a vector, merge it, etc.
    for head in clause_heads:
        first = head.left_edge.i
        last = head.right_edge.i
        subtree_span = doc[first : last + 1]
        print(subtree_span.text, "|", subtree_span.root.text)

    # Another option is to pick a head, choose start and end tokens by
    # walking along its children, and compute a span from the `.left_edge`
    # and `.right_edge` of those tokens.
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# to show you how computers understand language
|
|
||||||
# how computers understand language
|
|
||||||
# to show you how computers understand language | show
|
|
||||||
# how computers understand language | understand
|
|
|
@ -1,112 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Match a large set of multi-word expressions in O(1) time.
|
|
||||||
|
|
||||||
The idea is to associate each word in the vocabulary with a tag, noting whether
|
|
||||||
they begin, end, or are inside at least one pattern. An additional tag is used
|
|
||||||
for single-word patterns. Complete patterns are also stored in a hash set.
|
|
||||||
When we process a document, we look up the words in the vocabulary, to
|
|
||||||
associate the words with the tags. We then search for tag-sequences that
|
|
||||||
correspond to valid candidates. Finally, we look up the candidates in the hash
|
|
||||||
set.
|
|
||||||
|
|
||||||
For instance, to search for the phrases "Barack Hussein Obama" and "Hilary
|
|
||||||
Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with
|
|
||||||
the I tag, and Obama and Clinton with the L tag.
|
|
||||||
|
|
||||||
The document "Barack Clinton and Hilary Clinton" would have the tag sequence
|
|
||||||
[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second
|
|
||||||
candidate is in the phrase dictionary, so only one is returned as a match.
|
|
||||||
|
|
||||||
The algorithm is O(n) at run-time for document of length n because we're only
|
|
||||||
ever matching over the tag patterns. So no matter how many phrases we're
|
|
||||||
looking for, our pattern set stays very small (exact size depends on the
|
|
||||||
maximum length we're looking for, as the query language currently has no
|
|
||||||
quantifiers).
|
|
||||||
|
|
||||||
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
|
|
||||||
formatted in jsonl as a sequence of entries like this:
|
|
||||||
|
|
||||||
{"text":"Anchorage"}
|
|
||||||
{"text":"Angola"}
|
|
||||||
{"text":"Ann Arbor"}
|
|
||||||
{"text":"Annapolis"}
|
|
||||||
{"text":"Appalachia"}
|
|
||||||
{"text":"Argentina"}
|
|
||||||
|
|
||||||
Reddit comments corpus:
|
|
||||||
* https://files.pushshift.io/reddit/
|
|
||||||
* https://archive.org/details/2015_reddit_comments_corpus
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
"""
|
|
||||||
from __future__ import print_function, unicode_literals, division
|
|
||||||
|
|
||||||
from bz2 import BZ2File
|
|
||||||
import time
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
|
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
|
|
||||||
# Build the phrase list from the gazetteer, run the matcher over the corpus
# texts and report the elapsed time and the number of matches.
@plac.annotations(
    patterns_loc=("Path to gazetteer", "positional", None, str),
    text_loc=("Path to Reddit corpus file", "positional", None, str),
    n=("Number of texts to read", "option", "n", int),
    lang=("Language class to initialise", "option", "l", str),
)
def main(patterns_loc, text_loc, n=10000, lang="en"):
    nlp = spacy.blank(lang)
    nlp.vocab.lex_attr_getters = {}
    tokenizer = nlp.tokenizer
    phrases = read_gazetteer(tokenizer, patterns_loc)
    started = time.time()
    # The matches themselves are not needed here, only how many there are.
    n_matches = sum(
        1 for _ in get_matches(tokenizer, phrases, read_text(text_loc, n=n))
    )
    finished = time.time()
    print("%d docs in %.3f s. %d matches" % (n, (finished - started), n_matches))
|
|
||||||
|
|
||||||
|
|
||||||
def read_gazetteer(tokenizer, loc, n=-1):
    """Yield the tokenized multi-word phrases of a JSONL gazetteer file.

    tokenizer: callable that turns a text into a sequence of tokens with a
        `text` attribute, and that exposes an indexable `vocab`.
    loc: path of a file with one JSON object per line, each with a "text"
        key. Blank lines are skipped.
    n: accepted for backward compatibility; not used.
    YIELDS: the tokenized phrase for every entry with at least two tokens.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed; previously the handle was never closed explicitly.
    with open(loc) as file_:
        for line in file_:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            phrase = tokenizer(data["text"])
            # Look up every word in the vocab; the result is not needed.
            # NOTE(review): presumably this makes sure each word has a vocab
            # entry before matching - confirm.
            for w in phrase:
                _ = tokenizer.vocab[w.text]
            if len(phrase) >= 2:
                yield phrase
|
|
||||||
|
|
||||||
|
|
||||||
def read_text(bz2_loc, n=10000):
    """Yield the "body" field of at most `n` entries of a bz2 JSONL file.

    bz2_loc: path of a bz2-compressed file with one JSON object per line.
    n: maximum number of texts to yield.
    YIELDS: the value stored under "body" for each entry read.
    """
    with BZ2File(bz2_loc) as file_:
        for i, line in enumerate(file_):
            # Check the limit before yielding; checking afterwards produced
            # n + 1 texts.
            if i >= n:
                break
            data = json.loads(line)
            yield data["body"]
|
|
||||||
|
|
||||||
|
|
||||||
def get_matches(tokenizer, phrases, texts):
    """Yield (match id, matched text) for every phrase match in the texts.

    tokenizer: callable that turns a text into a doc and exposes `vocab`.
    phrases: iterable of tokenized phrases, added under the key "Phrase".
    texts: iterable of raw texts to search.
    """
    phrase_matcher = PhraseMatcher(tokenizer.vocab)
    phrase_matcher.add("Phrase", None, *phrases)
    for text in texts:
        doc = tokenizer(text)
        # Look up every word in the doc's vocab; the result is discarded.
        for token in doc:
            _ = doc.vocab[token.text]
        for ent_id, start, end in phrase_matcher(doc):
            matched_text = doc[start:end].text
            yield (ent_id, matched_text)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
if False:
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
|
|
||||||
s = pstats.Stats("Profile.prof")
|
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
|
||||||
else:
|
|
||||||
plac.call(main)
|
|
|
@ -1,114 +0,0 @@
|
||||||
<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
|
|
||||||
|
|
||||||
# A decomposable attention model for Natural Language Inference
|
|
||||||
**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)**
|
|
||||||
**Updated for spaCy 2.0+ and Keras 2.2.2+ by John Stewart, [@free-variation](https://github.com/free-variation)**
|
|
||||||
|
|
||||||
This directory contains an implementation of the entailment prediction model described
|
|
||||||
by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable
|
|
||||||
for its competitive performance with very few parameters.
|
|
||||||
|
|
||||||
The model is implemented using [Keras](https://keras.io/) and [spaCy](https://spacy.io).
|
|
||||||
Keras is used to build and train the network. spaCy is used to load
|
|
||||||
the [GloVe](http://nlp.stanford.edu/projects/glove/) vectors, perform the
|
|
||||||
feature extraction, and help you apply the model at run-time. The following
|
|
||||||
demo code shows how the entailment model can be used at runtime, once the
|
|
||||||
hook is installed to customise the `.similarity()` method of spaCy's `Doc`
|
|
||||||
and `Span` objects:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def demo(shape):
|
|
||||||
nlp = spacy.load('en_vectors_web_lg')
|
|
||||||
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
|
||||||
|
|
||||||
doc1 = nlp(u'The king of France is bald.')
|
|
||||||
doc2 = nlp(u'France has no king.')
|
|
||||||
|
|
||||||
print("Sentence 1:", doc1)
|
|
||||||
print("Sentence 2:", doc2)
|
|
||||||
|
|
||||||
entailment_type, confidence = doc1.similarity(doc2)
|
|
||||||
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
|
||||||
```
|
|
||||||
|
|
||||||
Which gives the output `Entailment type: contradiction (Confidence: 0.60604566)`, showing that
|
|
||||||
the system has definite opinions about Bertrand Russell's [famous conundrum](https://users.drew.edu/jlenz/br-on-denoting.html)!
|
|
||||||
|
|
||||||
I'm working on a blog post to explain Parikh et al.'s model in more detail.
|
|
||||||
A [notebook](https://github.com/free-variation/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb) is available that briefly explains this implementation.
|
|
||||||
I think it is a very interesting example of the attention mechanism, which
|
|
||||||
I didn't understand very well before working through this paper. There are
|
|
||||||
lots of ways to extend the model.
|
|
||||||
|
|
||||||
## What's where
|
|
||||||
|
|
||||||
| File | Description |
|
|
||||||
| --- | --- |
|
|
||||||
| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. |
|
|
||||||
| `spacy_hook.py` | Provides a class `KerasSimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. |
|
|
||||||
| `keras_decomposable_attention.py` | Defines the neural network model. |
|
|
||||||
|
|
||||||
## Setting up
|
|
||||||
|
|
||||||
First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spaCy
|
|
||||||
English models (about 1GB of data):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install keras
|
|
||||||
pip install spacy
|
|
||||||
python -m spacy download en_vectors_web_lg
|
|
||||||
```
|
|
||||||
|
|
||||||
You'll also want to get Keras working on your GPU, and you will need a backend, such as TensorFlow or Theano.
|
|
||||||
This will depend on your set up, so you're mostly on your own for this step. If you're using AWS, try the
|
|
||||||
[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy.
|
|
||||||
|
|
||||||
Once you've installed the dependencies, you can run a small preliminary test of
|
|
||||||
the Keras model:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
py.test keras_parikh_entailment/keras_decomposable_attention.py
|
|
||||||
```
|
|
||||||
|
|
||||||
This compiles the model and fits it with some dummy data. You should see that
|
|
||||||
both tests passed.
|
|
||||||
|
|
||||||
Finally, download the [Stanford Natural Language Inference corpus](http://nlp.stanford.edu/projects/snli/).
|
|
||||||
|
|
||||||
## Running the example
|
|
||||||
|
|
||||||
You can run the `keras_parikh_entailment/` directory as a script, which executes the file
|
|
||||||
[`keras_parikh_entailment/__main__.py`](__main__.py). If you run the script without arguments
|
|
||||||
the usage is shown. Running it with `-h` explains the command line arguments.
|
|
||||||
|
|
||||||
The first thing you'll want to do is train the model:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python keras_parikh_entailment/ train -t <path to SNLI train JSON> -s <path to SNLI dev JSON>
|
|
||||||
```
|
|
||||||
|
|
||||||
Training takes about 300 epochs for full accuracy, and I haven't rerun the full
|
|
||||||
experiment since refactoring things to publish this example — please let me
|
|
||||||
know if I've broken something. You should get to at least 85% on the development data even after 10-15 epochs.
|
|
||||||
|
|
||||||
The other two modes demonstrate run-time usage. I never like relying on the accuracy printed
|
|
||||||
by `.fit()` methods. I never really feel confident until I've run a new process that loads
|
|
||||||
the model and starts making predictions, without access to the gold labels. I've therefore
|
|
||||||
included an `evaluate` mode.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python keras_parikh_entailment/ evaluate -s <path to SNLI train JSON>
|
|
||||||
```
|
|
||||||
|
|
||||||
Finally, there's also a little demo, which mostly exists to show
|
|
||||||
you how run-time usage will eventually look.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python keras_parikh_entailment/ demo
|
|
||||||
```
|
|
||||||
|
|
||||||
## Getting updates
|
|
||||||
|
|
||||||
We should have the blog post explaining the model ready before the end of the week. To get
|
|
||||||
notified when it's published, you can either follow me on [Twitter](https://twitter.com/honnibal)
|
|
||||||
or subscribe to our [mailing list](http://eepurl.com/ckUpQ5).
|
|
|
@ -1,207 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
import json
|
|
||||||
from keras.utils import to_categorical
|
|
||||||
import plac
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from keras_decomposable_attention import build_model
|
|
||||||
from spacy_hook import get_embeddings, KerasSimilarityShim
|
|
||||||
|
|
||||||
try:
|
|
||||||
import cPickle as pickle
|
|
||||||
except ImportError:
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
# workaround for keras/tensorflow bug
|
|
||||||
# see https://github.com/tensorflow/tensorflow/issues/3388
|
|
||||||
import os
|
|
||||||
import importlib
|
|
||||||
from keras import backend as K
|
|
||||||
|
|
||||||
|
|
||||||
def set_keras_backend(backend):
    """Make `backend` the active Keras backend.

    If another backend is active, KERAS_BACKEND is set and the backend
    module is reloaded. For "tensorflow" the current session is then closed
    and replaced by one created with gpu_options.allow_growth enabled.
    """
    backend_differs = K.backend() != backend
    if backend_differs:
        os.environ["KERAS_BACKEND"] = backend
        importlib.reload(K)
        assert K.backend() == backend

    if backend != "tensorflow":
        return

    K.get_session().close()
    session_config = K.tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    K.set_session(K.tf.Session(config=session_config))
    K.clear_session()
|
|
||||||
|
|
||||||
|
|
||||||
set_keras_backend("tensorflow")
|
|
||||||
|
|
||||||
|
|
||||||
def train(train_loc, dev_loc, shape, settings):
    """Train the entailment model and save it inside the model directory.

    train_loc / dev_loc: paths of the SNLI files read with read_snli.
    shape: passed to build_model; shape[0] is also used as the maximum
        length when the datasets are created.
    settings: dict passed to build_model; "nr_epoch" and "batch_size" are
        read here for fitting.

    The weights (without the embedding matrix) and the JSON config are
    written to the "similarity" subdirectory of nlp.path.
    """
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load("en_vectors_web_lg")
    assert nlp.path is not None
    print("Processing texts...")
    max_length = shape[0]
    train_X = create_dataset(nlp, train_texts1, train_texts2, 100, max_length)
    dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, max_length)

    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)

    print(settings)
    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        epochs=settings["nr_epoch"],
        batch_size=settings["batch_size"],
    )

    output_dir = nlp.path / "similarity"
    if not output_dir.exists():
        output_dir.mkdir()
    print("Saving to", output_dir)

    weights = model.get_weights()
    # remove the embedding matrix. We can reconstruct it.
    del weights[1]
    with (output_dir / "model").open("wb") as weights_file:
        pickle.dump(weights, weights_file)
    with (output_dir / "config.json").open("w") as config_file:
        config_file.write(model.to_json())
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(dev_loc, shape):
    """Score the saved similarity model on a labelled data set.

    dev_loc: Path of the evaluation data, handed to read_snli().
    shape (tuple): Model shape; its first item is passed to the shim as the
        maximum sentence length.
    RETURNS (tuple): (correct, total) as floats.
    """
    premises, hypotheses, gold_labels = read_snli(dev_loc)
    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0]))

    correct = 0.0
    total = 0.0
    for premise, hypothesis, gold in zip(premises, hypotheses, gold_labels):
        # The shim's similarity hook returns (entailment type, score).
        predicted, _ = nlp(premise).similarity(nlp(hypothesis))
        expected = KerasSimilarityShim.entailment_types[gold.argmax()]
        if predicted == expected:
            correct += 1
        total += 1
    return correct, total
|
|
||||||
|
|
||||||
|
|
||||||
def demo(shape):
    """Print the predicted entailment type for one fixed sentence pair.

    shape (tuple): Model shape; its first item is passed to the shim as the
        maximum sentence length.
    """
    nlp = spacy.load("en_vectors_web_lg")
    shim = KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0])
    nlp.add_pipe(shim)

    premise = nlp("The king of France is bald.")
    hypothesis = nlp("France has no king.")

    print("Sentence 1:", premise)
    print("Sentence 2:", hypothesis)

    entailment_type, confidence = premise.similarity(hypothesis)
    print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
|
||||||
|
|
||||||
|
|
||||||
LABELS = {"entailment": 0, "contradiction": 1, "neutral": 2}


def read_snli(path):
    """Read sentence pairs and labels from a JSON-lines file.

    Each non-blank line is parsed as a JSON object with the keys
    "sentence1", "sentence2" and "gold_label". Entries labelled "-" are
    dropped.

    path: Location of the .jsonl file.
    RETURNS (tuple): (texts1, texts2, labels), where labels is the result of
        to_categorical() applied to the integer label IDs from LABELS.
    """
    texts1 = []
    texts2 = []
    labels = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf8") as file_:
        for line in file_:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            eg = json.loads(line)
            label = eg["gold_label"]
            if label == "-":  # per Parikh, ignore - SNLI entries
                continue
            texts1.append(eg["sentence1"])
            texts2.append(eg["sentence2"])
            labels.append(LABELS[label])
    return texts1, texts2, to_categorical(np.asarray(labels, dtype="int32"))
|
|
||||||
|
|
||||||
|
|
||||||
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
    """Convert sentence pairs into zero-padded matrices of word IDs.

    Tokens with a vector map to ``rank + num_unk + 1``; tokens without one
    map to one of the ``num_unk`` OOV slots via ``rank % num_unk + 1``.
    ID 0 is left for padding. Tokens that have a vector of norm 0 are
    skipped. Truncation counts the tokens actually kept, so skipped tokens
    do not use up the ``max_length`` budget.

    nlp (callable): Maps a text to an iterable of tokens exposing
        ``has_vector``, ``vector_norm`` and ``rank``.
    texts (list): First sentences of each pair.
    hypotheses (list): Second sentences of each pair.
    num_unk (int): Number of OOV slots.
    max_length (int): Number of IDs per sentence (row width).
    RETURNS (list): Two arrays, one row per sentence: [texts, hypotheses].
    """
    sents = texts + hypotheses
    sents_as_ids = []
    for sent in sents:
        word_id_vec = np.zeros((max_length), dtype="int")
        nr_kept = 0
        for token in nlp(sent):
            if nr_kept >= max_length:
                break
            # skip odd spaces from tokenizer
            if token.has_vector and token.vector_norm == 0:
                continue
            if token.has_vector:
                word_id_vec[nr_kept] = token.rank + num_unk + 1
            else:
                # if we don't have a vector, pick an OOV entry
                word_id_vec[nr_kept] = token.rank % num_unk + 1
            nr_kept += 1
        sents_as_ids.append(word_id_vec)

    return [np.array(sents_as_ids[: len(texts)]), np.array(sents_as_ids[len(texts) :])]
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
    train_loc=("Path to training data", "option", "t", str),
    dev_loc=("Path to development or test data", "option", "s", str),
    max_length=("Length to truncate sentences", "option", "L", int),
    nr_hidden=("Number of hidden units", "option", "H", int),
    dropout=("Dropout level", "option", "d", float),
    learn_rate=("Learning rate", "option", "r", float),
    batch_size=("Batch size for neural network training", "option", "b", int),
    nr_epoch=("Number of training epochs", "option", "e", int),
    entail_dir=(
        "Direction of entailment",
        "option",
        "D",
        str,
        ["both", "left", "right"],
    ),
)
def main(
    mode,
    train_loc,
    dev_loc,
    max_length=50,
    nr_hidden=200,
    dropout=0.2,
    learn_rate=0.001,
    batch_size=1024,
    nr_epoch=10,
    entail_dir="both",
):
    """Command-line entry point: train, evaluate or demo the model.

    mode (str): "train", "evaluate", or anything else to run the demo.
    train_loc: Path to the training data (required for "train").
    dev_loc: Path to the development/test data (required for "train" and
        "evaluate").
    The remaining arguments are packed into the model shape
    ``(max_length, nr_hidden, 3)`` and the settings dict given to train().
    Exits with status 1 when a required path is missing.
    """
    shape = (max_length, nr_hidden, 3)
    settings = {
        "lr": learn_rate,
        "dropout": dropout,
        "batch_size": batch_size,
        "nr_epoch": nr_epoch,
        "entail_dir": entail_dir,
    }

    if mode == "train":
        if train_loc is None or dev_loc is None:
            print("Train mode requires paths to training and development data sets.")
            sys.exit(1)
        train(train_loc, dev_loc, shape, settings)
    elif mode == "evaluate":
        if dev_loc is None:
            print("Evaluate mode requires paths to test data set.")
            sys.exit(1)
        correct, total = evaluate(dev_loc, shape)
        # Avoid a ZeroDivisionError when the data set has no usable examples.
        accuracy = correct / total if total else 0.0
        print(correct, "/", total, accuracy)
    else:
        demo(shape)


if __name__ == "__main__":
    plac.call(main)
|
|
|
@ -1,152 +0,0 @@
|
||||||
# Semantic entailment/similarity with decomposable attention (using spaCy and Keras)
|
|
||||||
# Practical state-of-the-art textual entailment with spaCy and Keras
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from keras import layers, Model, models, optimizers
|
|
||||||
from keras import backend as K
|
|
||||||
|
|
||||||
|
|
||||||
def build_model(vectors, shape, settings):
    """Build and compile the attention-based entailment model.

    The steps follow the in-code labels: embed both inputs with a shared
    embedding, attend, compare, aggregate, then classify with a softmax
    layer.

    vectors: Embedding matrix handed to create_embedding().
    shape (tuple): (max_length, nr_hidden, nr_class).
    settings (dict): Uses "entail_dir" ("both", "left", anything else is
        treated as the remaining direction), "lr" for the Adam optimizer and,
        if present, "dropout" for the feed-forward blocks (default 0.2).
    RETURNS: The compiled Keras model taking [words1, words2] as input.
    """
    max_length, nr_hidden, nr_class = shape
    # Previously the configured dropout was silently ignored and every
    # feed-forward block used create_feedforward's default rate.
    dropout = settings.get("dropout", 0.2)

    # NOTE: keep the order in which layers are created below unchanged;
    # train() removes the embedding matrix from get_weights() by position.
    input1 = layers.Input(shape=(max_length,), dtype="int32", name="words1")
    input2 = layers.Input(shape=(max_length,), dtype="int32", name="words2")

    # embeddings (projected)
    embed = create_embedding(vectors, max_length, nr_hidden)

    a = embed(input1)
    b = embed(input2)

    # step 1: attend
    F = create_feedforward(nr_hidden, dropout_rate=dropout)
    att_weights = layers.dot([F(a), F(b)], axes=-1)

    G = create_feedforward(nr_hidden, dropout_rate=dropout)

    if settings["entail_dir"] == "both":
        norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
        norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
        alpha = layers.dot([norm_weights_a, a], axes=1)
        beta = layers.dot([norm_weights_b, b], axes=1)

        # step 2: compare
        comp1 = layers.concatenate([a, beta])
        comp2 = layers.concatenate([b, alpha])
        v1 = layers.TimeDistributed(G)(comp1)
        v2 = layers.TimeDistributed(G)(comp2)

        # step 3: aggregate
        v1_sum = layers.Lambda(sum_word)(v1)
        v2_sum = layers.Lambda(sum_word)(v2)
        concat = layers.concatenate([v1_sum, v2_sum])

    elif settings["entail_dir"] == "left":
        norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
        alpha = layers.dot([norm_weights_a, a], axes=1)
        comp2 = layers.concatenate([b, alpha])
        v2 = layers.TimeDistributed(G)(comp2)
        v2_sum = layers.Lambda(sum_word)(v2)
        concat = v2_sum

    else:
        norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
        beta = layers.dot([norm_weights_b, b], axes=1)
        comp1 = layers.concatenate([a, beta])
        v1 = layers.TimeDistributed(G)(comp1)
        v1_sum = layers.Lambda(sum_word)(v1)
        concat = v1_sum

    H = create_feedforward(nr_hidden, dropout_rate=dropout)
    out = H(concat)
    out = layers.Dense(nr_class, activation="softmax")(out)

    model = Model([input1, input2], out)

    model.compile(
        optimizer=optimizers.Adam(lr=settings["lr"]),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model
|
|
||||||
|
|
||||||
|
|
||||||
def create_embedding(vectors, max_length, projected_dim):
    """Create the frozen embedding lookup followed by a linear projection.

    vectors: Embedding matrix; its shape gives the table size and width and
        its values initialise the (non-trainable) embedding layer.
    max_length (int): Input sequence length.
    projected_dim (int): Output width of the bias-free, activation-free
        Dense layer applied at every time step.
    RETURNS: A Sequential model containing both layers.
    """
    nr_rows, width = vectors.shape[0], vectors.shape[1]
    lookup = layers.Embedding(
        nr_rows,
        width,
        input_length=max_length,
        weights=[vectors],
        trainable=False,
    )
    projection = layers.TimeDistributed(
        layers.Dense(projected_dim, activation=None, use_bias=False)
    )
    return models.Sequential([lookup, projection])
|
|
||||||
|
|
||||||
|
|
||||||
def create_feedforward(num_units=200, activation="relu", dropout_rate=0.2):
    """Create a block of two Dense layers, each followed by Dropout.

    num_units (int): Width of both Dense layers.
    activation (str): Activation used by both Dense layers.
    dropout_rate (float): Rate used by both Dropout layers.
    RETURNS: A Sequential model with the four layers.
    """
    block = []
    for _ in range(2):
        block.append(layers.Dense(num_units, activation=activation))
        block.append(layers.Dropout(dropout_rate))
    return models.Sequential(block)
|
|
||||||
|
|
||||||
|
|
||||||
def normalizer(axis):
    """Return a function that applies a softmax along ``axis``.

    axis (int): Axis of the attention-weight tensor to normalise over.
    RETURNS (callable): Maps a tensor of attention weights to weights that
        sum to 1 along ``axis``.
    """

    def _normalize(att_weights):
        # Subtract the per-axis maximum before exponentiating. This leaves
        # the result mathematically unchanged but keeps exp() from
        # overflowing on large attention scores.
        shifted = att_weights - K.max(att_weights, axis=axis, keepdims=True)
        exp_weights = K.exp(shifted)
        sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)
        return exp_weights / sum_weights

    return _normalize
|
|
||||||
|
|
||||||
|
|
||||||
def sum_word(x):
    """Sum the tensor ``x`` over axis 1 (used as a Lambda layer function)."""
    summed = K.sum(x, axis=1)
    return summed
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_model():
    """Check that build_model() constructs a model from a small config."""
    # Use seeded random values rather than np.ndarray(), which returns
    # uninitialised memory that may contain NaN or inf.
    vectors = np.random.RandomState(0).normal(size=(100, 8)).astype("float32")
    shape = (10, 16, 3)
    settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"}
    model = build_model(vectors, shape, settings)
    assert model is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_fit_model():
    """Check that a small model can be fitted on synthetic data."""
    # Seeded generator: the data is reproducible and always well-defined,
    # unlike np.ndarray(), which exposes uninitialised memory.
    rng = np.random.RandomState(0)

    def _generate_X(nr_example, length, nr_vector):
        # Word IDs must be valid rows of the embedding table: [0, nr_vector).
        X1 = rng.randint(0, nr_vector, size=(nr_example, length)).astype("int32")
        X2 = rng.randint(0, nr_vector, size=(nr_example, length)).astype("int32")
        return [X1, X2]

    def _generate_Y(nr_example, nr_class):
        # One-hot labels cycling through the classes.
        ys = np.zeros((nr_example, nr_class), dtype="int32")
        for i in range(nr_example):
            ys[i, i % nr_class] = 1
        return ys

    vectors = rng.normal(size=(100, 8)).astype("float32")
    shape = (10, 16, 3)
    settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"}
    model = build_model(vectors, shape, settings)

    train_X = _generate_X(20, shape[0], vectors.shape[0])
    train_Y = _generate_Y(20, shape[2])
    dev_X = _generate_X(15, shape[0], vectors.shape[0])
    dev_Y = _generate_Y(15, shape[2])

    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4)
|
|
||||||
|
|
||||||
|
|
||||||
# __all__ must list names as strings; a function object here makes
# `from <module> import *` raise a TypeError.
__all__ = ["build_model"]
|
|
|
@ -1,77 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
from keras.models import model_from_json
|
|
||||||
|
|
||||||
try:
|
|
||||||
import cPickle as pickle
|
|
||||||
except ImportError:
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
|
|
||||||
class KerasSimilarityShim(object):
    """Pipeline component that routes ``similarity()`` to a Keras model.

    Calling the component on a doc installs ``predict`` as the "similarity"
    hook for the doc and for its spans.
    """

    # Label names, indexed by the position of the highest model score.
    entailment_types = ["entailment", "contradiction", "neutral"]

    @classmethod
    def load(cls, path, nlp, max_length=100, get_features=None):
        """Restore a shim from the files written under ``path``.

        path: Directory holding "config.json" (model architecture) and
            "model" (pickled weights without the embedding matrix).
        nlp: Pipeline whose vocab is used to rebuild the embedding matrix.
        max_length (int): Maximum sentence length passed to get_features.
        get_features (callable): Feature extractor; defaults to get_word_ids.
        RETURNS (KerasSimilarityShim): The loaded component.
        """
        feature_getter = get_word_ids if get_features is None else get_features

        with (path / "config.json").open() as config_file:
            model = model_from_json(config_file.read())
        # NOTE(review): unpickling executes arbitrary code; only load model
        # directories from a trusted source.
        with (path / "model").open("rb") as weights_file:
            weights = pickle.load(weights_file)

        # The embedding matrix is not stored; rebuild it and put it back at
        # index 1 of the weight list.
        weights.insert(1, get_embeddings(nlp.vocab))
        model.set_weights(weights)

        return cls(model, get_features=feature_getter, max_length=max_length)

    def __init__(self, model, get_features=None, max_length=100):
        """Store the model, the feature extractor and the length limit."""
        self.model = model
        self.get_features = get_features
        self.max_length = max_length

    def __call__(self, doc):
        """Install the similarity hooks on ``doc`` and return it."""
        for hooks in (doc.user_hooks, doc.user_span_hooks):
            hooks["similarity"] = self.predict
        return doc

    def predict(self, doc1, doc2):
        """Classify the pair and return (entailment type, highest score)."""
        features1 = self.get_features([doc1], max_length=self.max_length)
        features2 = self.get_features([doc2], max_length=self.max_length)
        scores = self.model.predict([features1, features2])
        best_label = self.entailment_types[scores.argmax()]
        return best_label, scores.max()
|
|
||||||
|
|
||||||
|
|
||||||
def get_embeddings(vocab, nr_unk=100):
    """Build the embedding matrix used by the model.

    Row layout: row 0 is a zero vector for padding, rows 1..nr_unk hold
    random unit-length vectors for OOV tokens, and a lexeme with rank r is
    stored, length-normalised, in row ``nr_unk + r + 1``. Lexemes without a
    vector (or with a zero-norm vector) keep a zero row.

    vocab: Iterable of lexemes exposing ``rank``, ``has_vector``,
        ``vector_norm`` and ``vector``; must also provide ``vectors_length``.
    nr_unk (int): Number of OOV rows.
    RETURNS (numpy.ndarray): float32 matrix of shape
        (max rank + 2 + nr_unk, vocab.vectors_length).
    """
    # the extra +1 is for a zero vector representing sentence-final padding
    num_vectors = max(lex.rank for lex in vocab) + 2

    # create random vectors for OOV tokens
    oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
    # Scale by the L2 norm so OOV rows are unit length like the real
    # vectors. Dividing by the component sum (as before) does not normalise
    # and explodes when the sum is close to zero.
    oov = oov / np.linalg.norm(oov, axis=1, keepdims=True)

    vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32")
    vectors[1 : nr_unk + 1] = oov
    for lex in vocab:
        if lex.has_vector and lex.vector_norm > 0:
            vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm

    return vectors
|
|
||||||
|
|
||||||
|
|
||||||
def get_word_ids(docs, max_length=100, nr_unk=100):
    """Convert docs into a zero-padded matrix of word IDs.

    Tokens with a vector map to ``rank + nr_unk + 1``; all others map to an
    OOV slot via ``rank % nr_unk + 1``. Only the first ``max_length`` tokens
    of each doc are used; unused cells stay 0.

    docs: Sequence of iterables of tokens exposing ``has_vector`` and
        ``rank``.
    max_length (int): Number of columns.
    nr_unk (int): Number of OOV slots.
    RETURNS (numpy.ndarray): int32 matrix of shape (len(docs), max_length).
    """
    word_ids = np.zeros((len(docs), max_length), dtype="int32")

    for row, doc in enumerate(docs):
        for col, token in enumerate(doc):
            if col >= max_length:
                break
            offset = nr_unk if token.has_vector else 0
            rank = token.rank if token.has_vector else token.rank % nr_unk
            word_ids[row, col] = rank + offset + 1
    return word_ids
|
|
|
@ -1,45 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
"""
|
|
||||||
Example of loading previously parsed text using spaCy's DocBin class. The example
|
|
||||||
performs an entity count to show that the annotations are available.
|
|
||||||
For more details, see https://spacy.io/usage/saving-loading#docs
|
|
||||||
Installation:
|
|
||||||
python -m spacy download en_core_web_lg
|
|
||||||
Usage:
|
|
||||||
python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.tokens import DocBin
|
|
||||||
from timeit import default_timer as timer
|
|
||||||
from collections import Counter
|
|
||||||
|
|
||||||
EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
|
|
||||||
|
|
||||||
|
|
||||||
def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    """Load docs from a serialized DocBin and report entity counts and speed.

    model (str): Name of the spaCy model to load; its vocab is used to
        restore the docs.
    docbin_path (str): Path of the file containing the DocBin bytes.
    """
    nlp = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as file_:
        raw_bytes = file_.read()

    word_count = 0
    started = timer()
    entity_counts = Counter()
    doc_bin = DocBin().from_bytes(raw_bytes)
    for doc in doc_bin.get_docs(nlp.vocab):
        word_count += len(doc)
        entity_counts.update((ent.label_, ent.text) for ent in doc.ents)
    finished = timer()

    elapsed = finished - started
    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    print(msg.format(nr_word=word_count, seconds=elapsed, wps=word_count / elapsed))

    print("Most common entities:")
    for (label, entity), freq in entity_counts.most_common(30):
        print(freq, entity, label)


if __name__ == "__main__":
    import plac

    plac.call(main)
|
|
|
@ -1,955 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Natural language inference using spaCy and Keras"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Introduction"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"This notebook details an implementation of the natural language inference model presented in [(Parikh et al, 2016)](https://arxiv.org/abs/1606.01933). The model is notable for the small number of parameters *and hyperparameters* it specifies, while still yielding good performance."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Constructing the dataset"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import spacy\n",
|
|
||||||
"import numpy as np"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"We only need the GloVe vectors from spaCy, not a full NLP pipeline."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"nlp = spacy.load('en_vectors_web_lg')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Function to load the SNLI dataset. The categories are converted to one-hot representation. The function comes from an example in spaCy."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/home/jds/tensorflow-gpu/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
|
|
||||||
" from ._conv import register_converters as _register_converters\n",
|
|
||||||
"Using TensorFlow backend.\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import json\n",
|
|
||||||
"from keras.utils import to_categorical\n",
|
|
||||||
"\n",
|
|
||||||
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
|
|
||||||
"def read_snli(path):\n",
|
|
||||||
" texts1 = []\n",
|
|
||||||
" texts2 = []\n",
|
|
||||||
" labels = []\n",
|
|
||||||
" with open(path, 'r') as file_:\n",
|
|
||||||
" for line in file_:\n",
|
|
||||||
" eg = json.loads(line)\n",
|
|
||||||
" label = eg['gold_label']\n",
|
|
||||||
" if label == '-': # per Parikh, ignore - SNLI entries\n",
|
|
||||||
" continue\n",
|
|
||||||
" texts1.append(eg['sentence1'])\n",
|
|
||||||
" texts2.append(eg['sentence2'])\n",
|
|
||||||
" labels.append(LABELS[label])\n",
|
|
||||||
" return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Because Keras can do the train/test split for us, we'll load *all* SNLI triples from one file."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"texts,hypotheses,labels = read_snli('snli/snli_1.0_train.jsonl')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):\n",
|
|
||||||
" sents = texts + hypotheses\n",
|
|
||||||
" \n",
|
|
||||||
" # the extra +1 is for a zero vector represting NULL for padding\n",
|
|
||||||
" num_vectors = max(lex.rank for lex in nlp.vocab) + 2 \n",
|
|
||||||
" \n",
|
|
||||||
" # create random vectors for OOV tokens\n",
|
|
||||||
" oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))\n",
|
|
||||||
" oov = oov / oov.sum(axis=1, keepdims=True)\n",
|
|
||||||
" \n",
|
|
||||||
" vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')\n",
|
|
||||||
" vectors[num_vectors:, ] = oov\n",
|
|
||||||
" for lex in nlp.vocab:\n",
|
|
||||||
" if lex.has_vector and lex.vector_norm > 0:\n",
|
|
||||||
" vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector\n",
|
|
||||||
" \n",
|
|
||||||
" sents_as_ids = []\n",
|
|
||||||
" for sent in sents:\n",
|
|
||||||
" doc = nlp(sent)\n",
|
|
||||||
" word_ids = []\n",
|
|
||||||
" \n",
|
|
||||||
" for i, token in enumerate(doc):\n",
|
|
||||||
" # skip odd spaces from tokenizer\n",
|
|
||||||
" if token.has_vector and token.vector_norm == 0:\n",
|
|
||||||
" continue\n",
|
|
||||||
" \n",
|
|
||||||
" if i > max_length:\n",
|
|
||||||
" break\n",
|
|
||||||
" \n",
|
|
||||||
" if token.has_vector:\n",
|
|
||||||
" word_ids.append(token.rank + 1)\n",
|
|
||||||
" else:\n",
|
|
||||||
" # if we don't have a vector, pick an OOV entry\n",
|
|
||||||
" word_ids.append(token.rank % num_oov + num_vectors) \n",
|
|
||||||
" \n",
|
|
||||||
" # there must be a simpler way of generating padded arrays from lists...\n",
|
|
||||||
" word_id_vec = np.zeros((max_length), dtype='int')\n",
|
|
||||||
" clipped_len = min(max_length, len(word_ids))\n",
|
|
||||||
" word_id_vec[:clipped_len] = word_ids[:clipped_len]\n",
|
|
||||||
" sents_as_ids.append(word_id_vec)\n",
|
|
||||||
" \n",
|
|
||||||
" \n",
|
|
||||||
" return vectors, np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"sem_vectors, text_vectors, hypothesis_vectors = create_dataset(nlp, texts, hypotheses, 100, 50, True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 11,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"texts_test,hypotheses_test,labels_test = read_snli('snli/snli_1.0_test.jsonl')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"_, text_vectors_test, hypothesis_vectors_test = create_dataset(nlp, texts_test, hypotheses_test, 100, 50, True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"We use spaCy to tokenize the sentences and return, when available, a semantic vector for each token. \n",
|
|
||||||
"\n",
|
|
||||||
"OOV terms (tokens for which no semantic vector is available) are assigned to one of a set of randomly-generated OOV vectors, per (Parikh et al, 2016).\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Note that we will clip sentences to 50 words maximum."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from keras import layers, Model, models\n",
|
|
||||||
"from keras import backend as K"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Building the model"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"The embedding layer copies the 300-dimensional GloVe vectors into GPU memory. Per (Parikh et al, 2016), the vectors, which are not adapted during training, are projected down to lower-dimensional vectors using a trained projection matrix."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 14,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def create_embedding(vectors, max_length, projected_dim):\n",
|
|
||||||
" return models.Sequential([\n",
|
|
||||||
" layers.Embedding(\n",
|
|
||||||
" vectors.shape[0],\n",
|
|
||||||
" vectors.shape[1],\n",
|
|
||||||
" input_length=max_length,\n",
|
|
||||||
" weights=[vectors],\n",
|
|
||||||
" trainable=False),\n",
|
|
||||||
" \n",
|
|
||||||
" layers.TimeDistributed(\n",
|
|
||||||
" layers.Dense(projected_dim,\n",
|
|
||||||
" activation=None,\n",
|
|
||||||
" use_bias=False))\n",
|
|
||||||
" ])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"The Parikh model makes use of three feedforward blocks that construct nonlinear combinations of their input. Each block contains two ReLU layers and two dropout layers."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 15,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):\n",
|
|
||||||
" return models.Sequential([\n",
|
|
||||||
" layers.Dense(num_units, activation=activation),\n",
|
|
||||||
" layers.Dropout(dropout_rate),\n",
|
|
||||||
" layers.Dense(num_units, activation=activation),\n",
|
|
||||||
" layers.Dropout(dropout_rate)\n",
|
|
||||||
" ])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"The basic idea of the (Parikh et al, 2016) model is to:\n",
|
|
||||||
"\n",
|
|
||||||
"1. *Align*: Construct an alignment of subphrases in the text and hypothesis using an attention-like mechanism, called \"decompositional\" because the layer is applied to each of the two sentences individually rather than to their product. The dot product of the nonlinear transformations of the inputs is then normalized vertically and horizontally to yield a pair of \"soft\" alignment structures, from text->hypothesis and hypothesis->text. Concretely, for each word in one sentence, a multinomial distribution is computed over the words of the other sentence, by learning a multinomial logistic with softmax target.\n",
|
|
||||||
"2. *Compare*: Each word is now compared to its aligned phrase using a function modeled as a two-layer feedforward ReLU network. The output is a high-dimensional representation of the strength of association between word and aligned phrase.\n",
|
|
||||||
"3. *Aggregate*: The comparison vectors are summed, separately, for the text and the hypothesis. The result is two vectors: one that describes the degree of association of the text to the hypothesis, and the second, of the hypothesis to the text.\n",
|
|
||||||
"4. Finally, these two vectors are processed by a dense layer followed by a softmax classifier, as usual.\n",
|
|
||||||
"\n",
|
|
||||||
"Note that because in entailment the truth conditions of the consequent must be a subset of those of the antecedent, it is not obvious that we need both vectors in step (3). Entailment is not symmetric. It may be enough to just use the hypothesis->text vector. We will explore this possibility later."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"We need a couple of little functions for Lambda layers to normalize and aggregate weights:"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 16,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def normalizer(axis):\n",
|
|
||||||
" def _normalize(att_weights):\n",
|
|
||||||
" exp_weights = K.exp(att_weights)\n",
|
|
||||||
" sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)\n",
|
|
||||||
" return exp_weights/sum_weights\n",
|
|
||||||
" return _normalize\n",
|
|
||||||
"\n",
|
|
||||||
"def sum_word(x):\n",
|
|
||||||
" return K.sum(x, axis=1)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 17,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def build_model(vectors, max_length, num_hidden, num_classes, projected_dim, entail_dir='both'):\n",
|
|
||||||
" input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')\n",
|
|
||||||
" input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')\n",
|
|
||||||
" \n",
|
|
||||||
" # embeddings (projected)\n",
|
|
||||||
" embed = create_embedding(vectors, max_length, projected_dim)\n",
|
|
||||||
" \n",
|
|
||||||
" a = embed(input1)\n",
|
|
||||||
" b = embed(input2)\n",
|
|
||||||
" \n",
|
|
||||||
" # step 1: attend\n",
|
|
||||||
" F = create_feedforward(num_hidden)\n",
|
|
||||||
" att_weights = layers.dot([F(a), F(b)], axes=-1)\n",
|
|
||||||
" \n",
|
|
||||||
" G = create_feedforward(num_hidden)\n",
|
|
||||||
" \n",
|
|
||||||
" if entail_dir == 'both':\n",
|
|
||||||
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
|
||||||
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
|
||||||
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
|
||||||
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
|
||||||
"\n",
|
|
||||||
" # step 2: compare\n",
|
|
||||||
" comp1 = layers.concatenate([a, beta])\n",
|
|
||||||
" comp2 = layers.concatenate([b, alpha])\n",
|
|
||||||
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
|
||||||
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
|
||||||
"\n",
|
|
||||||
" # step 3: aggregate\n",
|
|
||||||
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
|
||||||
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
|
||||||
" concat = layers.concatenate([v1_sum, v2_sum])\n",
|
|
||||||
" elif entail_dir == 'left':\n",
|
|
||||||
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
|
||||||
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
|
||||||
" comp2 = layers.concatenate([b, alpha])\n",
|
|
||||||
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
|
||||||
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
|
||||||
" concat = v2_sum\n",
|
|
||||||
" else:\n",
|
|
||||||
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
|
||||||
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
|
||||||
" comp1 = layers.concatenate([a, beta])\n",
|
|
||||||
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
|
||||||
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
|
||||||
" concat = v1_sum\n",
|
|
||||||
" \n",
|
|
||||||
" H = create_feedforward(num_hidden)\n",
|
|
||||||
" out = H(concat)\n",
|
|
||||||
" out = layers.Dense(num_classes, activation='softmax')(out)\n",
|
|
||||||
" \n",
|
|
||||||
" model = Model([input1, input2], out)\n",
|
|
||||||
" \n",
|
|
||||||
" model.compile(optimizer='adam',\n",
|
|
||||||
" loss='categorical_crossentropy',\n",
|
|
||||||
" metrics=['accuracy'])\n",
|
|
||||||
" return model\n",
|
|
||||||
" \n",
|
|
||||||
" \n",
|
|
||||||
" "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 18,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"Layer (type) Output Shape Param # Connected to \n",
|
|
||||||
"==================================================================================================\n",
|
|
||||||
"words1 (InputLayer) (None, 50) 0 \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"words2 (InputLayer) (None, 50) 0 \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_1 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
|
||||||
" words2[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_2 (Sequential) (None, 50, 200) 80400 sequential_1[1][0] \n",
|
|
||||||
" sequential_1[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_1 (Dot) (None, 50, 50) 0 sequential_2[1][0] \n",
|
|
||||||
" sequential_2[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_2 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_1 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_3 (Dot) (None, 50, 200) 0 lambda_2[0][0] \n",
|
|
||||||
" sequential_1[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_2 (Dot) (None, 50, 200) 0 lambda_1[0][0] \n",
|
|
||||||
" sequential_1[1][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"concatenate_1 (Concatenate) (None, 50, 400) 0 sequential_1[1][0] \n",
|
|
||||||
" dot_3[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"concatenate_2 (Concatenate) (None, 50, 400) 0 sequential_1[2][0] \n",
|
|
||||||
" dot_2[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"time_distributed_2 (TimeDistrib (None, 50, 200) 120400 concatenate_1[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"time_distributed_3 (TimeDistrib (None, 50, 200) 120400 concatenate_2[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_3 (Lambda) (None, 200) 0 time_distributed_2[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_4 (Lambda) (None, 200) 0 time_distributed_3[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"concatenate_3 (Concatenate) (None, 400) 0 lambda_3[0][0] \n",
|
|
||||||
" lambda_4[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_4 (Sequential) (None, 200) 120400 concatenate_3[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dense_8 (Dense) (None, 3) 603 sequential_4[1][0] \n",
|
|
||||||
"==================================================================================================\n",
|
|
||||||
"Total params: 321,703,403\n",
|
|
||||||
"Trainable params: 381,803\n",
|
|
||||||
"Non-trainable params: 321,321,600\n",
|
|
||||||
"__________________________________________________________________________________________________\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"K.clear_session()\n",
|
|
||||||
"m = build_model(sem_vectors, 50, 200, 3, 200)\n",
|
|
||||||
"m.summary()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"The number of trainable parameters, ~381k, is the number given by Parikh et al, so we're on the right track."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Training the model"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Parikh et al use tiny batches of 4, training for 50MM batches, which amounts to around 500 epochs. Here we'll use large batches to better use the GPU, and train for fewer epochs -- for purposes of this experiment."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 19,
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Train on 549367 samples, validate on 9824 samples\n",
|
|
||||||
"Epoch 1/50\n",
|
|
||||||
"549367/549367 [==============================] - 34s 62us/step - loss: 0.7599 - acc: 0.6617 - val_loss: 0.5396 - val_acc: 0.7861\n",
|
|
||||||
"Epoch 2/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5611 - acc: 0.7763 - val_loss: 0.4892 - val_acc: 0.8085\n",
|
|
||||||
"Epoch 3/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5212 - acc: 0.7948 - val_loss: 0.4574 - val_acc: 0.8261\n",
|
|
||||||
"Epoch 4/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4986 - acc: 0.8045 - val_loss: 0.4410 - val_acc: 0.8274\n",
|
|
||||||
"Epoch 5/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4819 - acc: 0.8114 - val_loss: 0.4224 - val_acc: 0.8383\n",
|
|
||||||
"Epoch 6/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4714 - acc: 0.8166 - val_loss: 0.4200 - val_acc: 0.8379\n",
|
|
||||||
"Epoch 7/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4633 - acc: 0.8203 - val_loss: 0.4098 - val_acc: 0.8457\n",
|
|
||||||
"Epoch 8/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4558 - acc: 0.8232 - val_loss: 0.4114 - val_acc: 0.8415\n",
|
|
||||||
"Epoch 9/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4508 - acc: 0.8250 - val_loss: 0.4062 - val_acc: 0.8477\n",
|
|
||||||
"Epoch 10/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4433 - acc: 0.8286 - val_loss: 0.3982 - val_acc: 0.8486\n",
|
|
||||||
"Epoch 11/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4388 - acc: 0.8307 - val_loss: 0.3953 - val_acc: 0.8497\n",
|
|
||||||
"Epoch 12/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4351 - acc: 0.8321 - val_loss: 0.3973 - val_acc: 0.8522\n",
|
|
||||||
"Epoch 13/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4309 - acc: 0.8342 - val_loss: 0.3939 - val_acc: 0.8539\n",
|
|
||||||
"Epoch 14/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4269 - acc: 0.8355 - val_loss: 0.3932 - val_acc: 0.8517\n",
|
|
||||||
"Epoch 15/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4247 - acc: 0.8369 - val_loss: 0.3938 - val_acc: 0.8515\n",
|
|
||||||
"Epoch 16/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4208 - acc: 0.8379 - val_loss: 0.3936 - val_acc: 0.8504\n",
|
|
||||||
"Epoch 17/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4194 - acc: 0.8390 - val_loss: 0.3885 - val_acc: 0.8560\n",
|
|
||||||
"Epoch 18/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4162 - acc: 0.8402 - val_loss: 0.3874 - val_acc: 0.8561\n",
|
|
||||||
"Epoch 19/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4140 - acc: 0.8409 - val_loss: 0.3889 - val_acc: 0.8545\n",
|
|
||||||
"Epoch 20/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4114 - acc: 0.8426 - val_loss: 0.3864 - val_acc: 0.8583\n",
|
|
||||||
"Epoch 21/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4092 - acc: 0.8430 - val_loss: 0.3870 - val_acc: 0.8561\n",
|
|
||||||
"Epoch 22/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4062 - acc: 0.8442 - val_loss: 0.3852 - val_acc: 0.8577\n",
|
|
||||||
"Epoch 23/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4050 - acc: 0.8450 - val_loss: 0.3850 - val_acc: 0.8578\n",
|
|
||||||
"Epoch 24/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4035 - acc: 0.8455 - val_loss: 0.3825 - val_acc: 0.8555\n",
|
|
||||||
"Epoch 25/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4018 - acc: 0.8460 - val_loss: 0.3837 - val_acc: 0.8573\n",
|
|
||||||
"Epoch 26/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3989 - acc: 0.8476 - val_loss: 0.3843 - val_acc: 0.8599\n",
|
|
||||||
"Epoch 27/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3979 - acc: 0.8481 - val_loss: 0.3841 - val_acc: 0.8589\n",
|
|
||||||
"Epoch 28/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3967 - acc: 0.8484 - val_loss: 0.3811 - val_acc: 0.8575\n",
|
|
||||||
"Epoch 29/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3956 - acc: 0.8492 - val_loss: 0.3829 - val_acc: 0.8589\n",
|
|
||||||
"Epoch 30/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3938 - acc: 0.8499 - val_loss: 0.3859 - val_acc: 0.8562\n",
|
|
||||||
"Epoch 31/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3925 - acc: 0.8500 - val_loss: 0.3798 - val_acc: 0.8587\n",
|
|
||||||
"Epoch 32/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3906 - acc: 0.8509 - val_loss: 0.3834 - val_acc: 0.8569\n",
|
|
||||||
"Epoch 33/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3893 - acc: 0.8511 - val_loss: 0.3806 - val_acc: 0.8588\n",
|
|
||||||
"Epoch 34/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3885 - acc: 0.8515 - val_loss: 0.3828 - val_acc: 0.8603\n",
|
|
||||||
"Epoch 35/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3879 - acc: 0.8520 - val_loss: 0.3800 - val_acc: 0.8594\n",
|
|
||||||
"Epoch 36/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3860 - acc: 0.8530 - val_loss: 0.3796 - val_acc: 0.8577\n",
|
|
||||||
"Epoch 37/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3856 - acc: 0.8532 - val_loss: 0.3857 - val_acc: 0.8591\n",
|
|
||||||
"Epoch 38/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3838 - acc: 0.8535 - val_loss: 0.3835 - val_acc: 0.8603\n",
|
|
||||||
"Epoch 39/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3830 - acc: 0.8543 - val_loss: 0.3830 - val_acc: 0.8599\n",
|
|
||||||
"Epoch 40/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3818 - acc: 0.8548 - val_loss: 0.3832 - val_acc: 0.8559\n",
|
|
||||||
"Epoch 41/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3806 - acc: 0.8551 - val_loss: 0.3845 - val_acc: 0.8553\n",
|
|
||||||
"Epoch 42/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3803 - acc: 0.8550 - val_loss: 0.3789 - val_acc: 0.8617\n",
|
|
||||||
"Epoch 43/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3791 - acc: 0.8556 - val_loss: 0.3835 - val_acc: 0.8580\n",
|
|
||||||
"Epoch 44/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3778 - acc: 0.8565 - val_loss: 0.3799 - val_acc: 0.8580\n",
|
|
||||||
"Epoch 45/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3766 - acc: 0.8571 - val_loss: 0.3790 - val_acc: 0.8625\n",
|
|
||||||
"Epoch 46/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3770 - acc: 0.8569 - val_loss: 0.3820 - val_acc: 0.8590\n",
|
|
||||||
"Epoch 47/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3761 - acc: 0.8573 - val_loss: 0.3831 - val_acc: 0.8581\n",
|
|
||||||
"Epoch 48/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3739 - acc: 0.8579 - val_loss: 0.3828 - val_acc: 0.8599\n",
|
|
||||||
"Epoch 49/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3738 - acc: 0.8577 - val_loss: 0.3785 - val_acc: 0.8590\n",
|
|
||||||
"Epoch 50/50\n",
|
|
||||||
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3726 - acc: 0.8580 - val_loss: 0.3820 - val_acc: 0.8585\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<keras.callbacks.History at 0x7f5c9f49c438>"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 19,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"m.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"The result is broadly in the region reported by Parikh et al: ~86 vs 86.3%. The small difference might be accounted for by differences in `max_length` (here set at 50), by differences in the training regime, and by the fact that here we use Keras' built-in validation splitting rather than the SNLI test set."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Experiment: the asymmetric model"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"It was suggested earlier that, based on the semantics of entailment, the vector representing the strength of association from the hypothesis to the text is all that is needed for classifying the entailment.\n",
|
|
||||||
"\n",
|
|
||||||
"The following model removes consideration of the complementary vector (text to hypothesis) from the computation. This will decrease the parameter count slightly, because the final dense layers will be smaller, and speed up the forward pass when predicting, because fewer calculations will be needed."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 20,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"Layer (type) Output Shape Param # Connected to \n",
|
|
||||||
"==================================================================================================\n",
|
|
||||||
"words2 (InputLayer) (None, 50) 0 \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"words1 (InputLayer) (None, 50) 0 \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_5 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
|
||||||
" words2[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_6 (Sequential) (None, 50, 200) 80400 sequential_5[1][0] \n",
|
|
||||||
" sequential_5[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_4 (Dot) (None, 50, 50) 0 sequential_6[1][0] \n",
|
|
||||||
" sequential_6[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_5 (Lambda) (None, 50, 50) 0 dot_4[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_5 (Dot) (None, 50, 200) 0 lambda_5[0][0] \n",
|
|
||||||
" sequential_5[1][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"concatenate_4 (Concatenate) (None, 50, 400) 0 sequential_5[2][0] \n",
|
|
||||||
" dot_5[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"time_distributed_5 (TimeDistrib (None, 50, 200) 120400 concatenate_4[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_6 (Lambda) (None, 200) 0 time_distributed_5[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_8 (Sequential) (None, 200) 80400 lambda_6[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dense_16 (Dense) (None, 3) 603 sequential_8[1][0] \n",
|
|
||||||
"==================================================================================================\n",
|
|
||||||
"Total params: 321,663,403\n",
|
|
||||||
"Trainable params: 341,803\n",
|
|
||||||
"Non-trainable params: 321,321,600\n",
|
|
||||||
"__________________________________________________________________________________________________\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"m1 = build_model(sem_vectors, 50, 200, 3, 200, 'left')\n",
|
|
||||||
"m1.summary()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"The parameter count has indeed decreased by 40,000, corresponding to the 200x200 smaller H function."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 21,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Train on 549367 samples, validate on 9824 samples\n",
|
|
||||||
"Epoch 1/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 46us/step - loss: 0.7331 - acc: 0.6770 - val_loss: 0.5257 - val_acc: 0.7936\n",
|
|
||||||
"Epoch 2/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5518 - acc: 0.7799 - val_loss: 0.4717 - val_acc: 0.8159\n",
|
|
||||||
"Epoch 3/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5147 - acc: 0.7967 - val_loss: 0.4449 - val_acc: 0.8278\n",
|
|
||||||
"Epoch 4/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4948 - acc: 0.8060 - val_loss: 0.4326 - val_acc: 0.8344\n",
|
|
||||||
"Epoch 5/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4814 - acc: 0.8122 - val_loss: 0.4247 - val_acc: 0.8359\n",
|
|
||||||
"Epoch 6/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4712 - acc: 0.8162 - val_loss: 0.4143 - val_acc: 0.8430\n",
|
|
||||||
"Epoch 7/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4635 - acc: 0.8205 - val_loss: 0.4172 - val_acc: 0.8401\n",
|
|
||||||
"Epoch 8/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4570 - acc: 0.8223 - val_loss: 0.4106 - val_acc: 0.8422\n",
|
|
||||||
"Epoch 9/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4505 - acc: 0.8259 - val_loss: 0.4043 - val_acc: 0.8451\n",
|
|
||||||
"Epoch 10/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4459 - acc: 0.8280 - val_loss: 0.4050 - val_acc: 0.8467\n",
|
|
||||||
"Epoch 11/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4405 - acc: 0.8300 - val_loss: 0.3975 - val_acc: 0.8481\n",
|
|
||||||
"Epoch 12/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4360 - acc: 0.8324 - val_loss: 0.4026 - val_acc: 0.8496\n",
|
|
||||||
"Epoch 13/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4327 - acc: 0.8334 - val_loss: 0.4024 - val_acc: 0.8471\n",
|
|
||||||
"Epoch 14/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4293 - acc: 0.8350 - val_loss: 0.3955 - val_acc: 0.8496\n",
|
|
||||||
"Epoch 15/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4263 - acc: 0.8369 - val_loss: 0.3980 - val_acc: 0.8490\n",
|
|
||||||
"Epoch 16/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4236 - acc: 0.8377 - val_loss: 0.3958 - val_acc: 0.8496\n",
|
|
||||||
"Epoch 17/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4213 - acc: 0.8384 - val_loss: 0.3954 - val_acc: 0.8496\n",
|
|
||||||
"Epoch 18/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4187 - acc: 0.8394 - val_loss: 0.3929 - val_acc: 0.8514\n",
|
|
||||||
"Epoch 19/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4157 - acc: 0.8409 - val_loss: 0.3939 - val_acc: 0.8507\n",
|
|
||||||
"Epoch 20/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4135 - acc: 0.8417 - val_loss: 0.3953 - val_acc: 0.8522\n",
|
|
||||||
"Epoch 21/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4122 - acc: 0.8424 - val_loss: 0.3974 - val_acc: 0.8506\n",
|
|
||||||
"Epoch 22/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4099 - acc: 0.8435 - val_loss: 0.3918 - val_acc: 0.8522\n",
|
|
||||||
"Epoch 23/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4075 - acc: 0.8443 - val_loss: 0.3901 - val_acc: 0.8513\n",
|
|
||||||
"Epoch 24/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 44us/step - loss: 0.4067 - acc: 0.8447 - val_loss: 0.3885 - val_acc: 0.8543\n",
|
|
||||||
"Epoch 25/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4047 - acc: 0.8454 - val_loss: 0.3846 - val_acc: 0.8531\n",
|
|
||||||
"Epoch 26/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4031 - acc: 0.8461 - val_loss: 0.3864 - val_acc: 0.8562\n",
|
|
||||||
"Epoch 27/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4020 - acc: 0.8467 - val_loss: 0.3874 - val_acc: 0.8546\n",
|
|
||||||
"Epoch 28/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4001 - acc: 0.8473 - val_loss: 0.3848 - val_acc: 0.8534\n",
|
|
||||||
"Epoch 29/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3991 - acc: 0.8479 - val_loss: 0.3865 - val_acc: 0.8562\n",
|
|
||||||
"Epoch 30/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3976 - acc: 0.8484 - val_loss: 0.3833 - val_acc: 0.8574\n",
|
|
||||||
"Epoch 31/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3961 - acc: 0.8487 - val_loss: 0.3846 - val_acc: 0.8585\n",
|
|
||||||
"Epoch 32/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3942 - acc: 0.8498 - val_loss: 0.3805 - val_acc: 0.8573\n",
|
|
||||||
"Epoch 33/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 44us/step - loss: 0.3935 - acc: 0.8503 - val_loss: 0.3856 - val_acc: 0.8579\n",
|
|
||||||
"Epoch 34/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3923 - acc: 0.8507 - val_loss: 0.3829 - val_acc: 0.8560\n",
|
|
||||||
"Epoch 35/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3920 - acc: 0.8508 - val_loss: 0.3864 - val_acc: 0.8575\n",
|
|
||||||
"Epoch 36/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3907 - acc: 0.8516 - val_loss: 0.3873 - val_acc: 0.8563\n",
|
|
||||||
"Epoch 37/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3891 - acc: 0.8519 - val_loss: 0.3850 - val_acc: 0.8570\n",
|
|
||||||
"Epoch 38/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3872 - acc: 0.8522 - val_loss: 0.3815 - val_acc: 0.8591\n",
|
|
||||||
"Epoch 39/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3887 - acc: 0.8520 - val_loss: 0.3829 - val_acc: 0.8590\n",
|
|
||||||
"Epoch 40/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3868 - acc: 0.8531 - val_loss: 0.3807 - val_acc: 0.8600\n",
|
|
||||||
"Epoch 41/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3859 - acc: 0.8537 - val_loss: 0.3832 - val_acc: 0.8574\n",
|
|
||||||
"Epoch 42/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3849 - acc: 0.8537 - val_loss: 0.3850 - val_acc: 0.8576\n",
|
|
||||||
"Epoch 43/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3834 - acc: 0.8541 - val_loss: 0.3825 - val_acc: 0.8563\n",
|
|
||||||
"Epoch 44/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3829 - acc: 0.8548 - val_loss: 0.3844 - val_acc: 0.8540\n",
|
|
||||||
"Epoch 45/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8552 - val_loss: 0.3841 - val_acc: 0.8559\n",
|
|
||||||
"Epoch 46/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8549 - val_loss: 0.3880 - val_acc: 0.8567\n",
|
|
||||||
"Epoch 47/50\n",
|
|
||||||
"549367/549367 [==============================] - 24s 45us/step - loss: 0.3799 - acc: 0.8559 - val_loss: 0.3767 - val_acc: 0.8635\n",
|
|
||||||
"Epoch 48/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3800 - acc: 0.8560 - val_loss: 0.3786 - val_acc: 0.8563\n",
|
|
||||||
"Epoch 49/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3781 - acc: 0.8563 - val_loss: 0.3812 - val_acc: 0.8596\n",
|
|
||||||
"Epoch 50/50\n",
|
|
||||||
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3788 - acc: 0.8560 - val_loss: 0.3782 - val_acc: 0.8601\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<keras.callbacks.History at 0x7f5ca1bf3e48>"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 21,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"m1.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"This model performs the same as the slightly more complex model that evaluates alignments in both directions. Note also that processing time is improved, from about 60 down to about 45 microseconds per step. \n",
|
|
||||||
"\n",
|
|
||||||
"Let's now look at an asymmetric model that evaluates text to hypothesis comparisons. The prediction is that such a model will correctly classify a decent proportion of the exemplars, but not as accurately as the previous two.\n",
|
|
||||||
"\n",
|
|
||||||
"We'll just use 10 epochs for expediency."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 96,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"Layer (type) Output Shape Param # Connected to \n",
|
|
||||||
"==================================================================================================\n",
|
|
||||||
"words1 (InputLayer) (None, 50) 0 \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"words2 (InputLayer) (None, 50) 0 \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_13 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
|
||||||
" words2[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_14 (Sequential) (None, 50, 200) 80400 sequential_13[1][0] \n",
|
|
||||||
" sequential_13[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_8 (Dot) (None, 50, 50) 0 sequential_14[1][0] \n",
|
|
||||||
" sequential_14[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_9 (Lambda) (None, 50, 50) 0 dot_8[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dot_9 (Dot) (None, 50, 200) 0 lambda_9[0][0] \n",
|
|
||||||
" sequential_13[2][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"concatenate_6 (Concatenate) (None, 50, 400) 0 sequential_13[1][0] \n",
|
|
||||||
" dot_9[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"time_distributed_9 (TimeDistrib (None, 50, 200) 120400 concatenate_6[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"lambda_10 (Lambda) (None, 200) 0 time_distributed_9[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"sequential_16 (Sequential) (None, 200) 80400 lambda_10[0][0] \n",
|
|
||||||
"__________________________________________________________________________________________________\n",
|
|
||||||
"dense_32 (Dense) (None, 3) 603 sequential_16[1][0] \n",
|
|
||||||
"==================================================================================================\n",
|
|
||||||
"Total params: 321,663,403\n",
|
|
||||||
"Trainable params: 341,803\n",
|
|
||||||
"Non-trainable params: 321,321,600\n",
|
|
||||||
"__________________________________________________________________________________________________\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"m2 = build_model(sem_vectors, 50, 200, 3, 200, 'right')\n",
|
|
||||||
"m2.summary()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 97,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Train on 455226 samples, validate on 113807 samples\n",
|
|
||||||
"Epoch 1/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 49us/step - loss: 0.8920 - acc: 0.5771 - val_loss: 0.8001 - val_acc: 0.6435\n",
|
|
||||||
"Epoch 2/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7808 - acc: 0.6553 - val_loss: 0.7267 - val_acc: 0.6855\n",
|
|
||||||
"Epoch 3/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7329 - acc: 0.6825 - val_loss: 0.6966 - val_acc: 0.7006\n",
|
|
||||||
"Epoch 4/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7055 - acc: 0.6978 - val_loss: 0.6713 - val_acc: 0.7150\n",
|
|
||||||
"Epoch 5/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6862 - acc: 0.7081 - val_loss: 0.6533 - val_acc: 0.7253\n",
|
|
||||||
"Epoch 6/10\n",
|
|
||||||
"455226/455226 [==============================] - 21s 47us/step - loss: 0.6694 - acc: 0.7179 - val_loss: 0.6472 - val_acc: 0.7277\n",
|
|
||||||
"Epoch 7/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6555 - acc: 0.7252 - val_loss: 0.6338 - val_acc: 0.7347\n",
|
|
||||||
"Epoch 8/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 48us/step - loss: 0.6434 - acc: 0.7310 - val_loss: 0.6246 - val_acc: 0.7385\n",
|
|
||||||
"Epoch 9/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6325 - acc: 0.7367 - val_loss: 0.6164 - val_acc: 0.7424\n",
|
|
||||||
"Epoch 10/10\n",
|
|
||||||
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6216 - acc: 0.7426 - val_loss: 0.6082 - val_acc: 0.7478\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<keras.callbacks.History at 0x7fa6850cf080>"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 97,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"m2.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=10,validation_split=.2)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Comparing this fit to the validation accuracy of the previous two models after 10 epochs, we observe that its accuracy is roughly 10 percentage points lower.\n",
|
|
||||||
"\n",
|
|
||||||
"It is reassuring that the neural modeling here reproduces what we know from the semantics of natural language!"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.5.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
|
@ -1,78 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
"""This example contains several snippets of methods that can be set via custom
|
|
||||||
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
|
||||||
they're "bound" to the object and are partially applied – i.e. the object
|
|
||||||
they're called on is passed in as the first argument.
|
|
||||||
|
|
||||||
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Doc, Span
|
|
||||||
from spacy import displacy
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    output_dir=("Output directory for saved HTML", "positional", None, Path)
)
def main(output_dir=None):
    """Demo two custom ``Doc`` method extensions on a blank English pipeline.

    output_dir: directory the displaCy HTML is written to; when None the
        markup is printed instead.
    """
    # Register both method extensions before any Doc is created.
    Doc.set_extension("overlap", method=overlap_tokens)
    Doc.set_extension("to_html", method=to_html)

    nlp = English()  # blank English class, no statistical model needed

    first = nlp("Peach emoji is where it has always been.")
    second = nlp("Peach is the superior emoji.")
    print("Text 1:", first.text)
    print("Text 2:", second.text)
    print("Overlapping tokens:", first._.overlap(second))

    apple_doc = nlp("This is a sentence about Apple.")
    # The entity is added by hand so the demo works without a trained model.
    org_label = nlp.vocab.strings["ORG"]
    apple_doc.ents = [Span(apple_doc, 5, 6, label=org_label)]
    print("Text:", apple_doc.text)
    apple_doc._.to_html(output=output_dir, style="ent")
|
|
||||||
|
|
||||||
|
|
||||||
def to_html(doc, output="/tmp", style="dep"):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc: the Doc the extension is bound to (passed in automatically).
    output: directory to write the HTML file to. If None, the markup is
        printed instead of being saved.
    style: displaCy visualisation style, passed through to displacy.render.
    """
    # generate filename from first six non-punct tokens
    file_name = "-".join(w.text for w in doc[:6] if not w.is_punct) + ".html"
    html = displacy.render(doc, style=style, page=True)  # render markup
    if output is None:
        print(html)
        return
    output_path = Path(output)
    if not output_path.exists():
        # parents=True so a nested output directory does not fail
        output_path.mkdir(parents=True)
    output_file = output_path / file_name
    # Context manager guarantees the handle is flushed and closed.
    with output_file.open("w", encoding="utf-8") as file_:
        file_.write(html)  # save to file
    print("Saved HTML to {}".format(output_file))
|
|
||||||
|
|
||||||
|
|
||||||
def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.

    doc: iterable of tokens; the returned tokens come from here, in order.
    other_doc: iterable of tokens whose ``.text`` values are compared against.
    RETURNS: list of tokens from ``doc`` whose text also occurs in ``other_doc``.
    """
    # A set gives constant-time membership checks instead of scanning a list
    # once per token.
    other_texts = {token.text for token in other_doc}
    return [token for token in doc if token.text in other_texts]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# Text 1: Peach emoji is where it has always been.
|
|
||||||
# Text 2: Peach is the superior emoji.
|
|
||||||
# Overlapping tokens: [Peach, emoji, is, .]
|
|
|
@ -1,130 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Example of a spaCy v2.0 pipeline component that requests all countries via
|
|
||||||
the REST Countries API, merges country names into one token, assigns entity
|
|
||||||
labels and sets attributes on country tokens, e.g. the capital and lat/lng
|
|
||||||
coordinates. Can be extended with more details from the API.
|
|
||||||
|
|
||||||
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
|
|
||||||
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
Prerequisites: pip install requests
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import plac
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc, Span, Token
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the REST Countries component on a sample sentence and print the
    pipeline names, the Doc-level flag, per-country data and the entities."""
    # Only the blank English Language class is used: no model and no
    # pre-defined pipeline.
    nlp = English()
    nlp.add_pipe(RESTCountriesComponent(nlp))  # initialise and add component
    doc = nlp("Some text about Colombia and the Czech Republic")
    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
    print("Doc has countries", doc._.has_country)  # Doc contains countries
    for country in (token for token in doc if token._.is_country):
        ext = country._
        # country data
        print(country.text, ext.country_capital, ext.country_latlng, ext.country_flag)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", entities)
|
|
||||||
|
|
||||||
|
|
||||||
class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component backed by the REST Countries API.

    Country names are fetched once, matched in the text, labelled as
    entities, merged into single tokens and annotated with custom attributes.
    """

    name = "rest_countries"  # component name, will show up in the pipeline

    def __init__(self, nlp, label="GPE"):
        """Fetch the country data and set up matcher and extensions.

        nlp: shared Language instance; supplies the vocab for the matcher and
            the label ID, and builds the Doc objects used as patterns.
        label: entity label assigned to matched country names.
        """
        # One request on initialisation; the result is kept on the instance.
        response = requests.get("https://restcountries.eu/rest/v2/all")
        response.raise_for_status()  # raise if the request failed

        # Key the API response by country name for easy lookup. Alternative
        # and foreign-language names from the API could be added here too.
        self.countries = {}
        for country in response.json():
            self.countries[country["name"]] = country
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # PhraseMatcher with one Doc pattern per country name
        country_docs = [nlp(country_name) for country_name in self.countries]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add("COUNTRIES", None, *country_docs)

        # Token attributes are overwritten per match, so each one only gets a
        # default value here rather than a getter.
        for attr_name in (
            "is_country",
            "country_capital",
            "country_latlng",
            "country_flag",
        ):
            Token.set_extension(attr_name, default=False)

        # Doc and Span use a getter that checks the contained tokens.
        for container in (Doc, Span):
            container.set_extension("has_country", getter=self.has_country)

    def __call__(self, doc):
        """Annotate country matches in ``doc`` and return it, so the next
        pipeline component can process it.
        """
        found = []  # spans are collected first and merged at the end
        for _, start, end in self.matcher(doc):
            # Span representing the entity, with the label set
            entity = Span(doc, start, end, label=self.label)
            found.append(entity)
            # Custom attributes on each token of the entity. More API fields
            # (currencies, country code, calling code, ...) could be added.
            for token in entity:
                ext = token._
                ext.set("is_country", True)
                details = self.countries[entity.text]
                ext.set("country_capital", details["capital"])
                ext.set("country_latlng", details["latlng"])
                ext.set("country_flag", details["flag"])
            # Extend doc.ents rather than replacing the existing entities.
            doc.ents = list(doc.ents) + [entity]
        # Merging happens only after all entities are set - merging earlier
        # would cause mismatched indices.
        for span in found:
            span.merge()
        return doc

    def has_country(self, tokens):
        """Getter for the Doc and Span attribute: True if any token is a country.

        The getter is evaluated on access and reads each token's
        ``is_country`` extension, which ``__call__`` sets on matched tokens.
        """
        return any(token._.get("is_country") for token in tokens)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# Pipeline ['rest_countries']
|
|
||||||
# Doc has countries True
|
|
||||||
# Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
|
|
||||||
# Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
|
|
||||||
# Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
|
|
|
@ -1,115 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
|
||||||
based on list of single or multiple-word company names. Companies are
|
|
||||||
labelled as ORG and their spans are merged into one token. Additionally,
|
|
||||||
._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
|
|
||||||
respectively.
|
|
||||||
|
|
||||||
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc, Span, Token
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    text=("Text to process", "positional", None, str),
    companies=("Names of technology companies", "positional", None, str),
)
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    """Run the tech-company recognizer over ``text`` and print what it found.

    text: the text to process.
    companies: company names to recognise; a default list is used when empty.
    """
    # Only the blank English Language class is used: no model and no
    # pre-defined pipeline.
    nlp = English()
    # Fall back to a default list if no names came in via the arguments.
    org_names = companies if companies else ["Alphabet Inc.", "Google", "Netflix", "Apple"]
    recognizer = TechCompanyRecognizer(nlp, org_names)  # initialise component
    nlp.add_pipe(recognizer, last=True)  # add last to the pipeline

    doc = nlp(text)
    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
    token_texts = [token.text for token in doc]
    print("Tokens", token_texts)  # company names from the list are merged
    print("Doc has_tech_org", doc._.has_tech_org)  # Doc contains tech orgs
    print("Token 0 is_tech_org", doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print("Token 1 is_tech_org", doc[1]._.is_tech_org)  # "is" is not
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", entities)  # all orgs are entities
|
|
||||||
|
|
||||||
|
|
||||||
class TechCompanyRecognizer(object):
    """spaCy v2.0 pipeline component that marks listed company names.

    Single- or multi-word names from the list are labelled as entities (ORG
    by default) and merged into one token. ``._.is_tech_org`` is set on the
    tokens, and ``._.has_tech_org`` is available on Doc and Span.
    """

    name = "tech_companies"  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label="ORG"):
        """Set up the matcher and register the custom attributes.

        nlp: shared Language instance; supplies the vocab for the matcher and
            the label ID, and builds the Doc objects used as patterns.
        companies: iterable of company names to match.
        label: entity label assigned to matched names.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # The PhraseMatcher takes Doc objects as patterns, one per name.
        self.matcher = PhraseMatcher(nlp.vocab)
        company_docs = [nlp(org) for org in companies]
        self.matcher.add("TECH_ORGS", None, *company_docs)

        # The token attribute is overwritten per match, so it only gets a
        # default value here rather than a getter.
        Token.set_extension("is_tech_org", default=False)

        # Doc and Span use a getter that checks the contained tokens.
        for container in (Doc, Span):
            container.set_extension("has_tech_org", getter=self.has_tech_org)

    def __call__(self, doc):
        """Annotate company matches in ``doc`` and return it, so the next
        pipeline component can process it.
        """
        found = []  # spans are collected first and merged at the end
        for _, start, end in self.matcher(doc):
            # Span representing the entity, with the label set
            entity = Span(doc, start, end, label=self.label)
            found.append(entity)
            # Flag every token that is part of the entity
            for token in entity:
                token._.set("is_tech_org", True)
            # Extend doc.ents rather than replacing the existing entities.
            doc.ents = list(doc.ents) + [entity]
        # Merging happens only after all entities are set - merging earlier
        # would cause mismatched indices.
        for span in found:
            span.merge()
        return doc

    def has_tech_org(self, tokens):
        """Getter for the Doc and Span attribute: True if any token is a tech org.

        The getter is evaluated on access and reads each token's
        ``is_tech_org`` extension, which ``__call__`` sets on matched tokens.
        """
        return any(token._.get("is_tech_org") for token in tokens)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# Pipeline ['tech_companies']
|
|
||||||
# Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
|
|
||||||
# Doc has_tech_org True
|
|
||||||
# Token 0 is_tech_org True
|
|
||||||
# Token 1 is_tech_org False
|
|
||||||
# Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]
|
|
|
@ -1,61 +0,0 @@
|
||||||
"""Example of adding a pipeline component to prohibit sentence boundaries
|
|
||||||
before certain tokens.
|
|
||||||
|
|
||||||
What we do is write to the token.is_sent_start attribute, which
|
|
||||||
takes values in {True, False, None}. The default value None allows the parser
|
|
||||||
to predict sentence segments. The value False prohibits the parser from inserting
|
|
||||||
a sentence boundary before that token. Note that fixing the sentence segmentation
|
|
||||||
should also improve the parse quality.
|
|
||||||
|
|
||||||
The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
|
|
||||||
Other versions of the model may not make the original mistake, so the specific
|
|
||||||
example might not be apt for future versions.
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
|
|
||||||
def prevent_sentence_boundaries(doc):
    """Pipeline component: set ``is_sent_start = False`` on every token that
    ``can_be_sentence_start`` rejects, then return the Doc."""
    # Lazy filter, so tokens are checked and updated one at a time.
    blocked = (token for token in doc if not can_be_sentence_start(token))
    for token in blocked:
        token.is_sent_start = False
    return doc
|
|
||||||
|
|
||||||
|
|
||||||
def can_be_sentence_start(token):
    """Return True if ``token`` may begin a sentence.

    That is the case for the first token of the Doc, and for any token whose
    preceding token is punctuation or whitespace.
    """
    if token.i == 0:
        return True
    # Title-casing is deliberately not treated as a signal, so arbitrary
    # titlecased tokens within sentences are ignored.
    if token.nbor(-1).is_punct:
        return True
    return bool(token.nbor(-1).is_space)
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    text=("The raw text to process", "positional", None, str),
    spacy_model=("spaCy model to use (with a parser)", "option", "m", str),
)
def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"):
    """Print the sentence split of ``text`` before and after adding the
    ``prevent_sentence_boundaries`` component ahead of the parser.

    text: the raw text to process.
    spacy_model: name of the spaCy model to load (needs a parser).
    """
    print("Using spaCy model '{}'".format(spacy_model))
    print("Processing text '{}'".format(text))
    nlp = spacy.load(spacy_model)

    def split_sentences():
        # Re-runs the current pipeline and returns the stripped sentence texts.
        return [sent.text.strip() for sent in nlp(text).sents]

    sentences = split_sentences()
    print("Before:", sentences)
    nlp.add_pipe(prevent_sentence_boundaries, before="parser")
    sentences = split_sentences()
    print("After:", sentences)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,37 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Demonstrate adding a rule-based component that forces some tokens to not
|
|
||||||
be entities, before the NER tagger is applied. This is used to hotfix the issue
|
|
||||||
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.attrs import ENT_IOB
|
|
||||||
|
|
||||||
|
|
||||||
def fix_space_tags(doc):
    """Pipeline component: give every whitespace token the ENT_IOB value 2
    and return the Doc.

    Per the original note, 2 is the 'O' tag (0 is None, 1 is I, 2 is O).
    """
    iob_values = doc.to_array([ENT_IOB])
    space_positions = [i for i, token in enumerate(doc) if token.is_space]
    for position in space_positions:
        iob_values[position] = 2
    # from_array is handed one column per attribute.
    doc.from_array([ENT_IOB], iob_values.reshape((len(doc), 1)))
    return doc
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Print the entities of a sample sentence before and after adding
    ``fix_space_tags`` ahead of the NER component."""
    nlp = spacy.load("en_core_web_sm")
    text = "This is some crazy test where I dont need an Apple Watch to make things bug"
    entities_before = nlp(text).ents
    print("Before", entities_before)
    nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
    entities_after = nlp(text).ents
    print("After", entities_after)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1,84 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Example of multi-processing with Joblib. Here, we're exporting
|
|
||||||
part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with
|
|
||||||
each "sentence" on a newline, and spaces between tokens. Data is loaded from
|
|
||||||
the IMDB movie reviews dataset and will be loaded automatically via Thinc's
|
|
||||||
built-in dataset loader.
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
Prerequisites: pip install joblib
|
|
||||||
"""
|
|
||||||
from __future__ import print_function, unicode_literals
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from joblib import Parallel, delayed
|
|
||||||
from functools import partial
|
|
||||||
import thinc.extra.datasets
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    output_dir=("Output directory", "positional", None, Path),
    model=("Model name (needs tagger)", "positional", None, str),
    n_jobs=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int),
    limit=("Limit of entries from the dataset", "option", "l", int),
)
def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
    """Tag IMDB reviews in parallel and write one output file per batch.

    output_dir: directory for the per-batch text files (created if missing).
    model: name of the spaCy model to load (needs a tagger).
    n_jobs: number of Joblib worker processes.
    batch_size: number of texts handed to each job.
    limit: only the last ``limit`` entries of the dataset are processed.
    """
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    if not output_dir.exists():
        output_dir.mkdir()
    # load and pre-process the IMDB dataset
    print("Loading IMDB data...")
    train_data, _ = thinc.extra.datasets.imdb()
    texts, _ = zip(*train_data[-limit:])
    print("Processing texts...")
    # One delayed transform_texts call per batch, with nlp pre-bound.
    worker = delayed(partial(transform_texts, nlp))
    batches = minibatch(texts, size=batch_size)
    jobs = (worker(idx, chunk, output_dir) for idx, chunk in enumerate(batches))
    run_parallel = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    run_parallel(jobs)
|
|
||||||
|
|
||||||
|
|
||||||
def transform_texts(nlp, batch_id, texts, output_dir):
    """Process one batch of texts and write it to ``<output_dir>/<batch_id>.txt``.

    Each Doc becomes one line of space-separated ``represent_word`` outputs,
    with whitespace tokens left out. Returns None without doing any work if
    the output file already exists.
    """
    print(nlp.pipe_names)
    target = Path(output_dir) / ("%d.txt" % batch_id)
    if target.exists():  # same batch was called again - nothing to do
        return None
    print("Processing batch", batch_id)
    with target.open("w", encoding="utf8") as handle:
        for doc in nlp.pipe(texts):
            words = (represent_word(w) for w in doc if not w.is_space)
            handle.write(" ".join(words))
            handle.write("\n")
    print("Saved {} texts to {}.txt".format(len(texts), batch_id))
|
|
||||||
|
|
||||||
|
|
||||||
def represent_word(word):
    """Return ``"<text>|<tag>"`` for a token, true-casing the text first.

    A title-cased token in sentence-initial position is lower-cased, but only
    when the lower-cased form has the higher ``prob`` in the vocab.
    """
    text = word.text
    # True-casing: try to normalize sentence-initial capitals.
    if text.istitle() and is_sent_begin(word):
        lowered = text.lower()
        if word.prob < word.doc.vocab[lowered].prob:
            text = lowered
    return text + "|" + word.tag_
|
|
||||||
|
|
||||||
|
|
||||||
def is_sent_begin(word):
    """Rough check for whether ``word`` starts a sentence.

    True for the first token of the Doc, or for a token at index 2 or later
    whose preceding token's text is ".", "!", "?" or "...".
    """
    if word.i == 0:
        return True
    closers = (".", "!", "?", "...")
    return word.i >= 2 and word.nbor(-1).text in closers
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,153 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
"""
|
|
||||||
Example of a Streamlit app for an interactive spaCy model visualizer. You can
|
|
||||||
either download the script, or point streamlit run to the raw URL of this
|
|
||||||
file. For more details, see https://streamlit.io.
|
|
||||||
|
|
||||||
Installation:
|
|
||||||
pip install streamlit
|
|
||||||
python -m spacy download en_core_web_sm
|
|
||||||
python -m spacy download en_core_web_md
|
|
||||||
python -m spacy download de_core_news_sm
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
streamlit run streamlit_spacy.py
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import streamlit as st
|
|
||||||
import spacy
|
|
||||||
from spacy import displacy
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
|
|
||||||
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
|
|
||||||
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
|
|
||||||
|
|
||||||
|
|
||||||
@st.cache(allow_output_mutation=True)
def load_model(name):
    """Load the spaCy model called ``name``; wrapped in ``st.cache``."""
    model = spacy.load(name)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@st.cache(allow_output_mutation=True)
def process_text(model_name, text):
    """Run ``text`` through the model ``model_name`` and return the Doc;
    wrapped in ``st.cache``."""
    return load_model(model_name)(text)
|
|
||||||
|
|
||||||
|
|
||||||
# --- Sidebar intro, model selection and text input ---------------------------
st.sidebar.title("Interactive spaCy visualizer")
st.sidebar.markdown(
    """
Process text with [spaCy](https://spacy.io) models and visualize named entities,
dependencies and more. Uses spaCy's built-in
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
"""
)

spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
# Show a temporary notice while the model loads, then clear it.
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()

text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)

# --- Dependency parse (only if the pipeline has a parser) --------------------
if "parser" in nlp.pipe_names:
    st.header("Dependency Parse & Part-of-speech tags")
    st.sidebar.header("Dependency Parse")
    split_sents = st.sidebar.checkbox("Split sentences", value=True)
    collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
    collapse_phrases = st.sidebar.checkbox("Collapse phrases")
    compact = st.sidebar.checkbox("Compact mode")
    options = {
        "collapse_punct": collapse_punct,
        "collapse_phrases": collapse_phrases,
        "compact": compact,
    }
    if split_sents:
        docs = [span.as_doc() for span in doc.sents]
    else:
        docs = [doc]
    show_sentence_text = split_sents and len(docs) > 1
    for sent in docs:
        # Double newlines seem to mess with the rendering
        markup = displacy.render(sent, options=options).replace("\n\n", "\n")
        if show_sentence_text:
            st.markdown(f"> {sent.text}")
        st.write(HTML_WRAPPER.format(markup), unsafe_allow_html=True)

# --- Named entities (only if the pipeline has an NER component) --------------
if "ner" in nlp.pipe_names:
    st.header("Named Entities")
    st.sidebar.header("Named Entities")
    label_set = nlp.get_pipe("ner").labels
    labels = st.sidebar.multiselect(
        "Entity labels", options=label_set, default=list(label_set)
    )
    # Newlines seem to mess with the rendering
    markup = displacy.render(doc, style="ent", options={"ents": labels}).replace(
        "\n", " "
    )
    st.write(HTML_WRAPPER.format(markup), unsafe_allow_html=True)
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    if "entity_linker" in nlp.pipe_names:
        attrs.append("kb_id_")
    # One table row per entity whose label is currently selected.
    selected_ents = [ent for ent in doc.ents if ent.label_ in labels]
    rows = [[str(getattr(ent, attr)) for attr in attrs] for ent in selected_ents]
    st.dataframe(pd.DataFrame(rows, columns=attrs))

# --- Text classification (only if the pipeline has a textcat) ----------------
if "textcat" in nlp.pipe_names:
    st.header("Text Classification")
    st.markdown(f"> {text}")
    scores = pd.DataFrame(doc.cats.items(), columns=("Label", "Score"))
    st.dataframe(scores)

# --- Vectors & similarity (only if the model meta reports a vector width) ----
vector_size = nlp.meta.get("vectors", {}).get("width", 0)
if vector_size:
    st.header("Vectors & Similarity")
    st.code(nlp.meta["vectors"])
    text1 = st.text_input("Text or word 1", "apple")
    text2 = st.text_input("Text or word 2", "orange")
    doc1 = process_text(spacy_model, text1)
    doc2 = process_text(spacy_model, text2)
    similarity = doc1.similarity(doc2)
    # Scores above 0.5 are shown as success, everything else as error.
    report = st.success if similarity > 0.5 else st.error
    report(similarity)

# --- Token attribute table ---------------------------------------------------
st.header("Token attributes")

if st.button("Show token attributes"):
    attrs = [
        "idx",
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "head",
        "ent_type_",
        "ent_iob_",
        "shape_",
        "is_alpha",
        "is_ascii",
        "is_digit",
        "is_punct",
        "like_num",
    ]
    rows = []
    for token in doc:
        rows.append([str(getattr(token, attr)) for attr in attrs])
    st.dataframe(pd.DataFrame(rows, columns=attrs))

# --- Raw JSON views ----------------------------------------------------------
st.header("JSON Doc")
if st.button("Show JSON Doc"):
    st.json(doc.to_json())

st.header("JSON model meta")
if st.button("Show JSON model meta"):
    st.json(nlp.meta)
|
|
|
@ -1 +0,0 @@
|
||||||
{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
|
|
|
@ -1,434 +0,0 @@
|
||||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import plac
|
|
||||||
import attr
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
import spacy.util
|
|
||||||
from spacy.tokens import Token, Doc
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.syntax.nonproj import projectivize
|
|
||||||
from collections import defaultdict
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
import random
|
|
||||||
import numpy.random
|
|
||||||
|
|
||||||
from bin.ud import conll17_ud_eval
|
|
||||||
|
|
||||||
import spacy.lang.zh
|
|
||||||
import spacy.lang.ja
|
|
||||||
|
|
||||||
spacy.lang.zh.Chinese.Defaults.use_jieba = False
|
|
||||||
spacy.lang.ja.Japanese.Defaults.use_janome = False
|
|
||||||
|
|
||||||
random.seed(0)
|
|
||||||
numpy.random.seed(0)
|
|
||||||
|
|
||||||
|
|
||||||
def minibatch_by_words(items, size=5000):
    """Yield batches of (doc, gold) pairs, sized by word count.

    items: list of (doc, gold) tuples. It is shuffled IN PLACE first.
    size: word budget per batch, either an int (reused for every batch) or an
        iterable of ints giving one budget per batch.
    YIELDS: non-empty lists of (doc, gold) tuples. Items are added to a batch
        until the budget drops below zero, so a batch may exceed it.

    Batching stops when the items run out, when a batch comes out empty
    (negative budget), or when a finite ``size`` iterable is exhausted.
    """
    random.shuffle(items)
    if isinstance(size, int):
        budgets = itertools.repeat(size)
    else:
        # iter() accepts any iterable, not just iterators
        budgets = iter(size)
    stream = iter(items)
    # A for-loop ends cleanly when ``budgets`` runs out. A bare next() would
    # leak StopIteration, which PEP 479 turns into RuntimeError on 3.7+.
    for budget in budgets:
        batch = []
        while budget >= 0:
            try:
                doc, gold = next(stream)
            except StopIteration:
                if batch:
                    yield batch
                return
            budget -= len(doc)
            batch.append((doc, gold))
        if not batch:
            return
        yield batch
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# Data reading #
|
|
||||||
################
|
|
||||||
|
|
||||||
# Raw string: "\s" in a normal literal is an invalid escape sequence and
# triggers a warning on newer Python versions.
space_re = re.compile(r"\s+")


def split_text(text):
    """Split ``text`` into paragraphs on blank lines.

    Each paragraph is stripped and every run of whitespace inside it is
    collapsed to a single space.

    text: the raw text.
    RETURNS: list of normalised paragraph strings, one per "\\n\\n"-separated
        chunk (an empty chunk yields an empty string).
    """
    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(
    nlp,
    conllu_file,
    text_file,
    raw_text=True,
    oracle_segments=False,
    max_doc_length=None,
    limit=None,
):
    """Read the CONLLU format into parallel lists of Doc and GoldParse objects.

    If raw_text=True, include Doc objects created using nlp.make_doc and then
    aligned against the gold-standard sequences. If oracle_segments=True,
    include Doc objects created from the gold-standard segments. At least one
    must be True.

    nlp: pipeline object; its vocab and make_doc are used to build the Docs.
    conllu_file: open, iterable text file holding the CoNLL-U annotations.
    text_file: open text file with the raw text; it is read in full.
    max_doc_length: when truthy, the number of sentences grouped into one
        raw-text Doc before a new one is started.
    limit: when truthy, return as soon as at least this many Docs exist.
    RETURNS: a pair (docs, golds) of equally long lists.
    RAISES: ValueError if both raw_text and oracle_segments are false.
    """
    if not raw_text and not oracle_segments:
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    docs = []
    golds = []
    # NOTE: the paragraph text is only used to pair up with the CoNLL-U
    # documents; the Doc text itself is rebuilt from the tokens in _make_gold.
    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
                # Skip rows whose id is not a plain integer (ids with "." or
                # "-"; presumably empty nodes and multiword-token ranges).
                if "." in id_:
                    continue
                if "-" in id_:
                    continue
                # Convert to 0-based indices; a head of "0" (the root) is
                # stored as pointing at the token itself.
                id_ = int(id_) - 1
                head = int(head) - 1 if head != "0" else id_
                sent["words"].append(word)
                sent["tags"].append(tag)
                sent["heads"].append(head)
                sent["deps"].append("ROOT" if dep == "root" else dep)
                # The last column is "_" when nothing is noted there, which
                # is treated as "followed by a space".
                sent["spaces"].append(space_after == "_")
            sent["entities"] = ["-"] * len(sent["words"])
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
            if oracle_segments:
                # One Doc per gold-standard sentence.
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                # Enough sentences collected: emit them as one raw-text Doc.
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            # Emit the sentences left over at the end of the document.
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
    """Parse CoNLL-U lines into nested lists: documents > sentences > tokens.

    file_: an iterable of lines.
    RETURNS: a list of documents. Each document is a list of sentences and
        each sentence is a list of tokens, where a token is the list of its
        10 tab-separated fields.
    RAISES: ValueError if a token line does not have exactly 10 fields (the
        offending line is printed first).
    """
    documents = []
    current_doc = []
    current_sent = []
    for raw in file_:
        if raw.startswith("# newdoc"):
            # A new document starts: store the previous one if it has content.
            if current_doc:
                documents.append(current_doc)
            current_doc = []
            continue
        if raw.startswith("#"):
            # Any other comment line carries nothing we need.
            continue
        stripped = raw.strip()
        if not stripped:
            # A blank line terminates the current sentence.
            if current_sent:
                current_doc.append(current_sent)
            current_sent = []
            continue
        columns = list(stripped.split("\t"))
        current_sent.append(columns)
        if len(columns) != 10:
            print(repr(raw))
            raise ValueError
    # Flush a trailing sentence/document that was not followed by a separator.
    if current_sent:
        current_doc.append(current_sent)
    if current_doc:
        documents.append(current_doc)
    return documents
|
|
||||||
|
|
||||||
|
|
||||||
def _make_gold(nlp, text, sent_annots):
    """Merge per-sentence annotations into one Doc and its GoldParse.

    nlp: pipeline object providing make_doc.
    text: the text to tokenize, or None to rebuild it from the words and
        their trailing-space flags.
    sent_annots: list of dicts with "words", "tags", "heads", "deps",
        "entities" and "spaces" lists, one dict per sentence.
    RETURNS: (doc, gold).
    """
    merged = defaultdict(list)
    for annots in sent_annots:
        # Shift each head by the number of words merged so far, so the
        # indices stay valid in the flattened sequence.
        merged_heads = merged["heads"]
        shift = len(merged["words"])
        merged_heads.extend(shift + head for head in annots["heads"])
        for key in ("words", "tags", "deps", "entities", "spaces"):
            merged[key].extend(annots[key])
    assert len(merged["words"]) == len(merged["spaces"])
    if text is None:
        # Rebuild the text: every word followed by a space when flagged.
        pieces = [
            word + " " * space
            for word, space in zip(merged["words"], merged["spaces"])
        ]
        text = "".join(pieces)
    doc = nlp.make_doc(text)
    # "spaces" was only needed for the text; it is not passed to GoldParse.
    merged.pop("spaces")
    return doc, GoldParse(doc, **merged)
|
|
||||||
|
|
||||||
|
|
||||||
#############################
|
|
||||||
# Data transforms for spaCy #
|
|
||||||
#############################
|
|
||||||
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
    """Convert Doc/GoldParse pairs into the nested tuple format that is
    handed to begin_training.

    RETURNS: a list with one (text, sents) entry per pair, where sents is a
        single-element list [((ids, words, tags, heads, labels, iob), [])]
        built by transposing gold.orig_annot.
    """
    converted = []
    for doc, gold in zip(docs, golds):
        text = doc.text
        # Transpose the per-token rows into one tuple per column.
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        columns = (ids, words, tags, heads, labels, iob)
        converted.append((text, [(columns, [])]))
    return converted
|
|
||||||
|
|
||||||
|
|
||||||
##############
|
|
||||||
# Evaluation #
|
|
||||||
##############
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Parse the raw text, write the parses as CoNLL-U and score them.

    nlp: the pipeline to evaluate.
    text_loc: path of the raw text file.
    gold_loc: path of the gold-standard CoNLL-U file.
    sys_loc: path the system output is written to (and read back from).
    limit: accepted but not used.
    RETURNS: the result of conll17_ud_eval.evaluate(gold, system).
    """
    with text_loc.open("r", encoding="utf8") as handle:
        paragraphs = split_text(handle.read())
    parsed = list(nlp.pipe(paragraphs))
    # The system output has to be on disk before it can be loaded for scoring.
    with sys_loc.open("w", encoding="utf8") as handle:
        write_conllu(parsed, handle)
    with gold_loc.open("r", encoding="utf8") as handle:
        gold_ud = conll17_ud_eval.load_conllu(handle)
    with sys_loc.open("r", encoding="utf8") as handle:
        sys_ud = conll17_ud_eval.load_conllu(handle)
    return conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
|
||||||
|
|
||||||
|
|
||||||
def write_conllu(docs, file_):
    """Write the parsed docs to file_ in CoNLL-U format.

    Runs of tokens whose dependency label is "subtok" are merged (by
    character offsets) before writing. Each doc becomes a "# newdoc" section
    and each sentence gets "# sent_id" and "# text" comment lines, one line
    per token and a terminating blank line.

    docs: non-empty list of parsed docs; they are modified by the merging.
    file_: writable text file object.
    """
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for doc_idx, doc in enumerate(docs):
        # Record character offsets first; merging changes token indices.
        char_ranges = []
        for _, start, end in merger(doc):
            span = doc[start : end + 1]
            char_ranges.append((span.start_char, span.end_char))
        for start_char, end_char in char_ranges:
            doc.merge(start_char, end_char)
        file_.write("# newdoc id = {i}\n".format(i=doc_idx))
        for sent_idx, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=doc_idx, j=sent_idx))
            file_.write("# text = {text}\n".format(text=sent.text))
            for tok_idx, token in enumerate(sent):
                file_.write(token._.get_conllu_lines(tok_idx) + "\n")
            file_.write("\n")
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
    """Print one tab-separated row of training progress.

    itn: epoch number; the header row is printed first when it is 0.
    losses: dict of losses; the "parser" entry is reported (0.0 if missing).
    ud_scores: mapping with "Words", "Sentences", "XPOS", "UAS" and "LAS"
        entries, each exposing an f1 attribute that is shown as a percentage.
    """

    def as_percent(metric):
        return ud_scores[metric].f1 * 100

    fields = dict(
        dep_loss=losses.get("parser", 0.0),
        tag_loss=losses.get("tagger", 0.0),
        words=as_percent("Words"),
        sents=as_percent("Sentences"),
        tags=as_percent("XPOS"),
        uas=as_percent("UAS"),
        las=as_percent("LAS"),
    )
    if itn == 0:
        print("\t".join(["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]))
    columns = [
        "{:d}",
        "{dep_loss:.1f}",
        "{las:.1f}",
        "{uas:.1f}",
        "{tags:.1f}",
        "{sents:.1f}",
        "{words:.1f}",
    ]
    row_template = "\t".join(columns)
    print(row_template.format(itn, **fields))
|
|
||||||
|
|
||||||
|
|
||||||
# def get_sent_conllu(sent, sent_id):
|
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
|
||||||
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
    """Format one token as CoNLL-U text.

    token: the token to format; it is registered as a Token extension method,
        so the token itself is the first argument.
    i: 0-based position of the token within its sentence.
    RETURNS: the tab-separated 10-column row for the token. If the token is
        flagged as beginning a fused token, a range row precedes it on its
        own line.
    """
    lines = []
    if token._.begins_fused:
        # Count how many following tokens belong to the same fused token.
        n = 1
        while token.nbor(n)._.inside_fused:
            n += 1
        # NOTE(review): token ids below are written 1-based (i + 1), so this
        # range probably should start at i + 1 as well -- confirm before
        # changing.
        id_ = "%d-%d" % (i, i + n)
        range_fields = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
        # The range row is a single tab-separated line. Previously its fields
        # were stored as separate list items and ended up on ten lines.
        lines.append("\t".join(range_fields))
    if token.head.i == token.i:
        # A token that is its own head is the root, written as head 0.
        head = 0
    else:
        # Translate the document-level head offset into a 1-based sentence id.
        head = i + (token.head.i - token.i) + 1
    fields = [
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
##################
|
|
||||||
# Initialization #
|
|
||||||
##################
|
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config):
    """Create a blank pipeline for the language named by the corpus prefix.

    corpus: string whose part before the first "_" is the language code.
    config: object with a vectors attribute; when truthy, the vocab is loaded
        from its "vocab" sub-path.
    RETURNS: the pipeline object.
    """
    lang = corpus.partition("_")[0]
    nlp = spacy.blank(lang)
    if not config.vectors:
        return nlp
    nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config):
    """Add the parser and tagger to the pipeline and begin training.

    nlp: the pipeline to extend.
    docs, golds: the training Docs and their GoldParse objects; the gold
        labels may be rewritten in place (see below).
    config: object with multitask_tag and multitask_sent flags.
    RETURNS: whatever nlp.begin_training returns (used as the optimizer by
        the caller).
    """
    nlp.add_pipe(nlp.create_pipe("parser"))
    if config.multitask_tag:
        nlp.parser.add_multitask_objective("tag")
    if config.multitask_sent:
        nlp.parser.add_multitask_objective("sent_start")
    # Register the "subtok" label that write_conllu later merges on.
    # NOTE(review): the meaning of action id 2 is not visible here -- confirm.
    nlp.parser.moves.add_action(2, "subtok")
    nlp.add_pipe(nlp.create_pipe("tagger"))
    # Make sure the tagger knows every tag that occurs in the gold data.
    for gold in golds:
        for tag in gold.tags:
            if tag is not None:
                nlp.tagger.add_label(tag)
    # Replace labels that didn't make the frequency cutoff
    actions = set(nlp.parser.labels)
    label_set = set([act.split("-")[1] for act in actions if "-" in act])
    for gold in golds:
        for i, label in enumerate(gold.labels):
            if label is not None and label not in label_set:
                # Keep only the part before "||" (presumably the plain label
                # of a decorated non-projective label -- confirm).
                gold.labels[i] = label.split("||")[0]
    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
|
||||||
# Command line helpers #
|
|
||||||
########################
|
|
||||||
|
|
||||||
|
|
||||||
@attr.s
class Config(object):
    """Training settings, loadable from a JSON file via Config.load."""

    vectors = attr.ib(default=None)
    max_doc_length = attr.ib(default=10)
    multitask_tag = attr.ib(default=True)
    multitask_sent = attr.ib(default=True)
    nr_epoch = attr.ib(default=30)
    batch_size = attr.ib(default=1000)
    dropout = attr.ib(default=0.2)

    @classmethod
    def load(cls, loc):
        """Build a Config from the JSON object stored at loc; its keys are
        passed to the constructor as keyword arguments."""
        config_path = Path(loc)
        with config_path.open("r", encoding="utf8") as handle:
            settings = json.load(handle)
        return cls(**settings)
|
|
||||||
|
|
||||||
|
|
||||||
class Dataset(object):
    """Locate the .conllu and .txt files of one section of a treebank.

    path: directory (pathlib.Path) containing the treebank files.
    section: substring identifying the section in the file names, e.g.
        "train" or "dev".

    After construction, conllu and text hold the matching file paths and lang
    holds the language code taken from the start of the .conllu file name (the
    part before the first "-", then before the first "_").

    RAISES: IOError if either the .conllu or the .txt file is missing.
    """

    def __init__(self, path, section):
        self.path = path
        self.section = section
        self.conllu = None
        self.text = None
        for file_path in self.path.iterdir():
            name = file_path.parts[-1]
            if section in name and name.endswith("conllu"):
                self.conllu = file_path
            elif section in name and name.endswith("txt"):
                self.text = file_path
        if self.conllu is None:
            # The message used to name the wrong extension (".txt").
            msg = "Could not find .conllu file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        if self.text is None:
            # This branch used to build the message without raising it, which
            # deferred the failure to the first use of self.text.
            msg = "Could not find .txt file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
    """Bundle the train and dev Dataset objects of one treebank.

    ud_path: root directory of the corpus collection.
    treebank: name of the treebank sub-directory.
    cfg: extra keyword arguments are accepted and ignored.

    The lang attribute is taken from the training section.
    """

    def __init__(self, ud_path, treebank, **cfg):
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, "train")
        self.dev = Dataset(treebank_dir, "dev")
        self.lang = self.train.lang
|
|
||||||
|
|
||||||
|
|
||||||
# Command-line entry point: train a parser and tagger on one treebank and,
# after every epoch, write the development parses to
# <parses_dir>/<corpus>/epoch-<i>.conllu and print their scores.
# (Documented with comments rather than a docstring so the command-line help
# text stays as it was.)
@plac.annotations(
    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
    parses_dir=("Directory to write the development parses", "positional", None, Path),
    config=("Path to json formatted config file", "positional", None, Config.load),
    corpus=(
        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
        "positional",
        None,
        str,
    ),
    limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
    # Custom token attributes used when writing the CoNLL-U output.
    Token.set_extension("get_conllu_lines", method=get_token_conllu)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)

    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config)

    # read_data consumes both files completely before returning, so they can
    # be closed right away instead of being left open for the whole run.
    with paths.train.conllu.open(encoding="utf8") as conllu_file:
        with paths.train.text.open(encoding="utf8") as text_file:
            docs, golds = read_data(
                nlp,
                conllu_file,
                text_file,
                max_doc_length=config.max_doc_length,
                limit=limit,
            )

    optimizer = initialize_pipeline(nlp, docs, golds, config)

    for i in range(config.nr_epoch):
        # Start every epoch from freshly tokenized, unannotated docs.
        docs = [nlp.make_doc(doc.text) for doc in docs]
        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        # Evaluate with the optimizer's averaged parameters.
        with nlp.use_params(optimizer.averages):
            scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
            print_progress(i, losses, scores)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: plac builds the command line from main's annotations.
if __name__ == "__main__":
    plac.call(main)
|
|
|
@ -1,114 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
|
|
||||||
"""Example of defining a knowledge base in spaCy,
|
|
||||||
which is needed to implement entity linking functionality.
|
|
||||||
|
|
||||||
For more details, see the documentation:
|
|
||||||
* Knowledge base: https://spacy.io/api/kb
|
|
||||||
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.2.4
|
|
||||||
Last tested with: v2.3.4
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
import spacy
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
|
|
||||||
|
|
||||||
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
# Maps each entity ID to a (description, frequency) pair. The description is
# embedded with the model's vectors and both values are stored in the KB.
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    model=("Model name, should have pretrained word embeddings", "positional", None, str),
    output_dir=("Optional output directory", "option", "o", Path),
)
def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # the model must ship word vectors, otherwise there is nothing to encode
    # the entity descriptions with
    has_vectors = "vectors" in nlp.meta and nlp.vocab.vectors.size
    if not has_vectors:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # The KB vectors simply reuse the dimensionality of the word vectors; an
    # encoder could be used instead to change it.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # collect ids, frequencies and description embeddings in matching order
    entity_ids = list(ENTITIES)
    freqs = [freq for _, freq in ENTITIES.values()]
    descr_embeddings = [nlp(desc).vector for desc, _ in ENTITIES.values()]

    # register all entities at once (kb.add_entity would do it one by one)
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # aliases can only refer to entities that are already in the KB
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # show what the KB contains
    print()
    _print_kb(kb)

    # nothing more to do unless an output directory was requested
    if output_dir is None:
        return

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    kb_path = str(output_dir / "kb")
    kb.dump(kb_path)
    print()
    print("Saved KB to", kb_path)

    vocab_path = output_dir / "vocab"
    kb.vocab.to_disk(vocab_path)
    print("Saved vocab to", vocab_path)

    print()

    # read everything back to check the round trip
    # always reload a knowledge base with the same vocab instance!
    print("Loading vocab from", vocab_path)
    print("Loading KB from", kb_path)
    reloaded_vocab = Vocab().from_disk(vocab_path)
    reloaded_kb = KnowledgeBase(vocab=reloaded_vocab)
    reloaded_kb.load_bulk(kb_path)
    print()
    _print_kb(reloaded_kb)
|
|
||||||
|
|
||||||
|
|
||||||
def _print_kb(kb):
    """Print the number and names of the entities and aliases in the KB."""
    entity_count = kb.get_size_entities()
    entity_names = kb.get_entity_strings()
    print(entity_count, "kb entities:", entity_names)
    alias_count = kb.get_size_aliases()
    alias_names = kb.get_alias_strings()
    print(alias_count, "kb aliases:", alias_names)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: plac builds the command line from main's annotations.
if __name__ == "__main__":
    plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# 2 kb entities: ['Q2146908', 'Q7381115']
|
|
||||||
# 1 kb aliases: ['Russ Cochran']
|
|
|
@ -1,89 +0,0 @@
|
||||||
"""This example shows how to add a multi-task objective that is trained
|
|
||||||
alongside the entity recognizer. This is an alternative to adding features
|
|
||||||
to the model.
|
|
||||||
|
|
||||||
The multi-task idea is to train an auxiliary model to predict some attribute,
|
|
||||||
with weights shared between the auxiliary model and the main model. In this
|
|
||||||
example, we're predicting the position of the word in the document.
|
|
||||||
|
|
||||||
The model that predicts the position of the word encourages the convolutional
|
|
||||||
layers to include the position information in their representation. The
|
|
||||||
information is then available to the main model, as a feature.
|
|
||||||
|
|
||||||
The overall idea is that we might know something about what sort of features
|
|
||||||
we'd like the CNN to extract. The multi-task objectives can encourage the
|
|
||||||
extraction of this type of feature. The multi-task objective is only used
|
|
||||||
during training. We discard the auxiliary model before run-time.
|
|
||||||
|
|
||||||
The specific example here is not necessarily a good idea --- but it shows
|
|
||||||
how an arbitrary objective function for some word can be used.
|
|
||||||
|
|
||||||
Developed and tested for spaCy 2.0.6. Updated for v2.2.2
|
|
||||||
"""
|
|
||||||
import random
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
import os.path
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.gold import read_json_file, GoldParse
|
|
||||||
|
|
||||||
# Fixed seed so that shuffling the training data is repeatable.
random.seed(0)

# Directory containing this script; the example data is located relative to it.
PWD = os.path.dirname(__file__)

# Training examples, read once at import time from the JSON file next to the
# script.
TRAIN_DATA = list(read_json_file(
    os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
|
|
||||||
|
|
||||||
|
|
||||||
def get_position_label(i, words, tags, heads, labels, ents):
    """Return a label describing where word i sits in the document.

    Only i and len(words) are used; the remaining arguments are accepted
    because the function is called with the full annotation tuple.

    RETURNS: "short-doc" for documents under 20 words, otherwise one of
        "first-word", "early-word", "mid-word", "last-word" or "late-word".
    """
    n_words = len(words)
    if n_words < 20:
        return "short-doc"
    if i == 0:
        return "first-word"
    if i < 10:
        return "early-word"
    if i < 20:
        return "mid-word"
    return "last-word" if i == n_words - 1 else "late-word"
|
|
||||||
|
|
||||||
|
|
||||||
def main(n_iter=10):
    # Train a blank English NER model with the word-position labels as an
    # extra multi-task objective, then print its predictions on the training
    # texts.
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print(nlp.pipeline)

    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for _text, annot_brackets in TRAIN_DATA:
            for annotations, _brackets in annot_brackets:
                # annotations[1] holds the words of the sentence
                doc = Doc(nlp.vocab, words=annotations[1])
                gold = GoldParse.from_annot_tuples(doc, annotations)
                nlp.update(
                    [doc],  # batch of texts
                    [gold],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,
                )
        print(losses.get("nn_labeller", 0.0), losses["ner"])

    # run the trained model over every training text that has raw text
    for text, _annots in TRAIN_DATA:
        if text is None:
            continue
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: plac maps the command line onto main's arguments.
if __name__ == "__main__":
    plac.call(main)
|
|
|
@ -1,217 +0,0 @@
|
||||||
"""This script is experimental.
|
|
||||||
|
|
||||||
Try pre-training the CNN component of the text categorizer using a cheap
|
|
||||||
language modelling-like objective. Specifically, we load pretrained vectors
|
|
||||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
|
||||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
|
||||||
we're not merely doing compression here, because heavy dropout is applied,
|
|
||||||
including over the input words. This means the model must often (50% of the time)
|
|
||||||
use the context in order to predict the word.
|
|
||||||
|
|
||||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
|
||||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
|
||||||
pre-train with the development data, but also not *so* terrible: we're not using
|
|
||||||
the development labels, after all --- only the unlabelled text.
|
|
||||||
"""
|
|
||||||
import plac
|
|
||||||
import tqdm
|
|
||||||
import random
|
|
||||||
import spacy
|
|
||||||
import thinc.extra.datasets
|
|
||||||
from spacy.util import minibatch, use_gpu, compounding
|
|
||||||
from spacy._ml import Tok2Vec
|
|
||||||
from spacy.pipeline import TextCategorizer
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
|
|
||||||
def load_texts(limit=0):
    """Load the unlabelled IMDB texts used for pre-training.

    limit: when >= 1, return only that many shuffled training texts;
        otherwise return all training texts followed by all dev texts.
    RETURNS: a list of texts.
    """
    train, dev = thinc.extra.datasets.imdb()
    train_texts, _ = zip(*train)
    # The dev texts were previously unzipped from `train` by mistake, so the
    # dev portion was never used and the training texts were returned twice.
    dev_texts, _ = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    return train_texts + dev_texts
|
|
||||||
|
|
||||||
|
|
||||||
def load_textcat_data(limit=0):
    """Load labelled IMDB data for the text categorizer.

    limit: number of shuffled training examples to keep, taken from the end
        of the list (0 keeps all of them).
    RETURNS: ((texts, cats), (eval_texts, eval_cats)), where each cats entry
        is a {"POSITIVE": bool, "NEGATIVE": bool} dict.
    """

    def to_cats(label):
        positive = bool(label)
        return {"POSITIVE": positive, "NEGATIVE": not positive}

    train_data, eval_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    texts, labels = zip(*train_data[-limit:])
    eval_texts, eval_labels = zip(*eval_data)
    cats = [to_cats(y) for y in labels]
    eval_cats = [to_cats(y) for y in eval_labels]
    return (texts, cats), (eval_texts, eval_cats)
|
|
||||||
|
|
||||||
|
|
||||||
def prefer_gpu():
    """Try to switch to GPU 0.

    RETURNS: False when spacy.util.use_gpu(0) returns None; otherwise seed
        cupy's random generator with 0 and return True.
    """
    if spacy.util.use_gpu(0) is None:
        return False
    # cupy is only imported once a GPU has been activated.
    import cupy.random

    cupy.random.seed(0)
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def build_textcat_model(tok2vec, nr_class, width):
    """Build the text classification model on top of a tok2vec layer.

    The model chains tok2vec, flatten_add_lengths, mean pooling and a Softmax
    layer constructed as Softmax(nr_class, width).

    tok2vec: the token-to-vector layer; also stored on the returned model as
        model.tok2vec so callers can save and restore its weights.
    RETURNS: the assembled model.
    """
    # Only the names that are actually used are imported; the previous
    # version imported several layers it never referenced.
    from thinc.v2v import Model, Softmax
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, mean_pool

    with Model.define_operators({">>": chain}):
        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Softmax(nr_class, width)
        )
    model.tok2vec = tok2vec
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def block_gradients(model):
    """Wrap model in a layer that returns its output without a backward
    callback, so no gradient is passed back through it."""
    from thinc.api import wrap

    def forward_only(X, drop=0.0):
        # Discard the backward callback returned by begin_update.
        output, _backprop = model.begin_update(X, drop=drop)
        return output, None

    return wrap(forward_only, model)
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(width, embed_size, vectors_model):
    """Load the vectors model and add a POSITIVE/NEGATIVE text categorizer.

    width: width passed to the Tok2Vec layer and the classifier.
    embed_size: embed_size passed to the Tok2Vec layer.
    vectors_model: name or path handed to spacy.load.
    RETURNS: the pipeline with the "textcat" component added.
    """
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size)
    classifier = build_textcat_model(tok2vec, 2, width)
    textcat = TextCategorizer(
        nlp.vocab, labels=["POSITIVE", "NEGATIVE"], model=classifier
    )
    nlp.add_pipe(textcat)
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def train_tensorizer(nlp, texts, dropout, n_iter):
    """Add a tensorizer to the pipeline and train it on unlabelled texts.

    nlp: the pipeline; a "tensorizer" component is created and added.
    texts: the raw texts to train on.
    dropout: dropout rate passed to every update.
    n_iter: number of passes over the texts; the losses are printed after
        each pass.
    RETURNS: the optimizer created by nlp.begin_training().
    """
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        losses = {}
        for batch in minibatch(tqdm.tqdm(texts)):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
        print(losses)
    return optimizer
|
|
||||||
|
|
||||||
|
|
||||||
def train_textcat(nlp, n_texts, n_iter=10):
    """Train the text categorizer and print loss and scores per iteration.

    nlp: pipeline that already contains a "textcat" component.
    n_texts: number of labelled training examples to load.
    n_iter: number of training iterations.
    """
    textcat = nlp.get_pipe("textcat")
    # Save the tok2vec weights so they can be restored after begin_training.
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        # Put the saved tok2vec weights back (presumably begin_training
        # re-initializes them -- confirm).
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            # Score with the optimizer's averaged parameters.
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data returned by load_textcat_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of the text categorizer.

    tokenizer: callable turning a text into a doc.
    textcat: component whose pipe() yields docs with a cats dict of scores.
    texts: the texts to classify.
    cats: gold annotations, one dict of label -> value per text; labels that
        are missing from the gold dict are ignored.
    RETURNS: dict with "textcat_p", "textcat_r" and "textcat_f". A score or
        gold value >= 0.5 counts as positive. The counters start at 1e-8 so
        the divisions never hit zero.
    """
    tp = fp = tn = fn = 1e-8
    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[index]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5:
                # predicted positive
                if gold[label] >= 0.5:
                    tp += 1.0
                elif gold[label] < 0.5:
                    fp += 1.0
            elif score < 0.5:
                # predicted negative
                if gold[label] < 0.5:
                    tn += 1
                elif gold[label] >= 0.5:
                    fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
|
||||||
|
|
||||||
|
|
||||||
# Command-line entry point: pre-train the tensorizer on unlabelled IMDB
# texts, then train the text categorizer on a limited number of labelled
# examples.
@plac.annotations(
    width=("Width of CNN layers", "positional", None, int),
    embed_size=("Embedding rows", "positional", None, int),
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
    train_iters=("Number of iterations to train", "option", "tn", int),
    train_examples=("Number of labelled examples", "option", "eg", int),
    vectors_model=("Name or path to vectors model to learn from"),
)
def main(
    width,
    embed_size,
    vectors_model,
    pretrain_iters=30,
    train_iters=30,
    train_examples=1000,
):
    # fixed seeds for repeatable runs
    random.seed(0)
    numpy.random.seed(0)
    gpu_enabled = prefer_gpu()
    print("Using GPU?", gpu_enabled)

    nlp = create_pipeline(width, embed_size, vectors_model)
    print("Load data")
    texts = load_texts(limit=0)
    print("Train tensorizer")
    # the returned optimizer is not needed afterwards
    train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
    print("Train textcat")
    train_textcat(nlp, train_examples, n_iter=train_iters)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: plac builds the command line from main's annotations.
if __name__ == "__main__":
    plac.call(main)
|
|
|
@ -1,97 +0,0 @@
|
||||||
"""Prevent catastrophic forgetting with rehearsal updates."""
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
import warnings
|
|
||||||
import srsly
|
|
||||||
import spacy
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
# Entity label that is added to the existing NER component.
LABEL = "ANIMAL"
# Training examples as (text, annotations) pairs. Each entity is given as
# (start_char, end_char, label); one example deliberately has no entities.
TRAIN_DATA = [
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, "ANIMAL")]},
    ),
    ("Do they bite?", {"entities": []}),
    (
        "horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, "ANIMAL")]},
    ),
    ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}),
    (
        "they pretend to care about your feelings, those horses",
        {"entities": [(48, 54, "ANIMAL")]},
    ),
    ("horses?", {"entities": [(0, 6, "ANIMAL")]}),
]
|
|
||||||
|
|
||||||
|
|
||||||
def read_raw_data(nlp, jsonl_loc):
    """Yield one tokenized Doc per non-blank "text" entry of a JSONL file.

    nlp: object whose make_doc() turns a string into a Doc.
    jsonl_loc: location handed to srsly.read_jsonl; every record must
        contain a "text" key.
    """
    for record in srsly.read_jsonl(jsonl_loc):
        # Records whose text is empty or whitespace-only produce nothing.
        if not record["text"].strip():
            continue
        yield nlp.make_doc(record["text"])
|
|
||||||
|
|
||||||
|
|
||||||
def read_gold_data(nlp, gold_loc):
    """Load annotated examples from a JSONL file as (Doc, GoldParse) pairs.

    nlp: object whose make_doc() turns a string into a Doc.
    gold_loc: location handed to srsly.read_jsonl; every record must have a
        "text" key and a "spans" list of dicts with "start", "end" and
        "label" keys.
    Returns a list of (doc, gold) tuples in file order.
    """
    pairs = []
    for record in srsly.read_jsonl(gold_loc):
        doc = nlp.make_doc(record["text"])
        entity_spans = [
            (span["start"], span["end"], span["label"]) for span in record["spans"]
        ]
        pairs.append((doc, GoldParse(doc, entities=entity_spans)))
    return pairs
|
|
||||||
|
|
||||||
|
|
||||||
def main(model_name, unlabelled_loc):
    """Teach an existing NER model the LABEL entity while rehearsing on raw text.

    Each supervised update on TRAIN_DATA is followed by a rehearsal update on
    a batch of unlabelled documents.

    model_name: name or path passed to spacy.load.
    unlabelled_loc: JSONL file of raw texts, read with read_raw_data.
    """
    n_iter = 10
    dropout = 0.2
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
    optimizer.alpha = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    sizes = compounding(1.0, 4.0, 1.001)
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(raw_docs)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_docs, size=batch_size)
            for batch in minibatch(TRAIN_DATA, size=sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
                # The raw text may yield fewer batches than TRAIN_DATA; a bare
                # next() would then raise StopIteration and abort training.
                # Start another pass over the raw docs instead.
                raw_batch = next(raw_batches, None)
                if raw_batch is None:
                    raw_batches = minibatch(raw_docs, size=batch_size)
                    raw_batch = next(raw_batches, None)
                # Still None only when there is no raw text at all: nothing
                # to rehearse on, so only the supervised update runs.
                if raw_batch:
                    nlp.rehearse(list(raw_batch), sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
    print(nlp.get_pipe("ner").model.unseen_classes)
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,177 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
|
|
||||||
"""Example of training spaCy's entity linker, starting off with a predefined
|
|
||||||
knowledge base and corresponding vocab, and a blank English model.
|
|
||||||
|
|
||||||
For more details, see the documentation:
|
|
||||||
* Training: https://spacy.io/usage/training
|
|
||||||
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.2.4
|
|
||||||
Last tested with: v2.3.4
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
def sample_train_data():
    """Build the toy entity-linking training set for the mention "Russ Cochran".

    Returns a list of (text, {"links": {(start, end): {kb_id: probability}}})
    tuples. The mention always spans characters 0-12 and is resolved to one
    of two knowledge-base identifiers.
    """
    publisher_id = "Q7381115"  # Russ Cochran, the publisher
    golfer_id = "Q2146908"  # Russ Cochran, the American golfer
    mention = (0, 12)

    # (sentence, probability of publisher, probability of golfer)
    samples = [
        ("Russ Cochran his reprints include EC Comics.", 1.0, 0.0),
        ("Russ Cochran has been publishing comic art.", 1.0, 0.0),
        (
            "Russ Cochran captured his first major title with his son as caddie.",
            0.0,
            1.0,
        ),
        (
            "Russ Cochran was a member of University of Kentucky's golf team.",
            0.0,
            1.0,
        ),
    ]
    return [
        (text, {"links": {mention: {publisher_id: p_publisher, golfer_id: p_golfer}}})
        for text, p_publisher, p_golfer in samples
    ]
|
|
||||||
|
|
||||||
|
|
||||||
# training data
|
|
||||||
TRAIN_DATA = sample_train_data()
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    kb_path=("Path to the knowledge base", "positional", None, Path),
    vocab_path=("Path to the vocab for the kb", "positional", None, Path),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(kb_path, vocab_path, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
    The `vocab` should be the one used during creation of the KB.

    kb_path: file the knowledge base is bulk-loaded from.
    vocab_path: directory the vocab is loaded from.
    output_dir: if given, the trained pipeline is saved there and reloaded
        as a check.
    n_iter: number of passes over the training documents.
    """
    # create blank English model with correct vocab
    nlp = spacy.blank("en")
    nlp.vocab.from_disk(vocab_path)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    # A set gives constant-time membership tests in the loop below.
    kb_ids = set(nlp.get_pipe("entity_linker").kb.get_entity_strings())
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        clean_links = {}
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print(
                        "Removed", kb_id, "from training because it is not in the KB."
                    )
            clean_links[offset] = new_dict
        # Build a fresh annotation dict instead of aliasing `annotation`, so
        # the module-level TRAIN_DATA is not rewritten while it is iterated.
        annotation_clean = dict(annotation, links=clean_links)
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail on a
            # missing intermediate directory
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_model(nlp):
    """Run `nlp` over every TRAIN_DATA text and print the linked entities.

    For each text this prints a blank line, then the (text, label, KB id) of
    each entity, then the (text, entity type, KB id) of each token.
    """
    for text, _ in TRAIN_DATA:
        # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
        doc = nlp(text)
        print()
        entity_rows = [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]
        print("Entities", entity_rows)
        token_rows = [(tok.text, tok.ent_type_, tok.ent_kb_id_) for tok in doc]
        print("Tokens", token_rows)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output (can be shuffled):
|
|
||||||
|
|
||||||
# Entities[('Russ Cochran', 'PERSON', 'Q7381115')]
|
|
||||||
# Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ('his', '', ''), ('reprints', '', ''), ('include', '', ''), ('EC', '', ''), ('Comics', '', ''), ('.', '', '')]
|
|
||||||
|
|
||||||
# Entities[('Russ Cochran', 'PERSON', 'Q7381115')]
|
|
||||||
# Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ('has', '', ''), ('been', '', ''), ('publishing', '', ''), ('comic', '', ''), ('art', '', ''), ('.', '', '')]
|
|
||||||
|
|
||||||
# Entities[('Russ Cochran', 'PERSON', 'Q2146908')]
|
|
||||||
# Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('captured', '', ''), ('his', '', ''), ('first', '', ''), ('major', '', ''), ('title', '', ''), ('with', '', ''), ('his', '', ''), ('son', '', ''), ('as', '', ''), ('caddie', '', ''), ('.', '', '')]
|
|
||||||
|
|
||||||
# Entities[('Russ Cochran', 'PERSON', 'Q2146908')]
|
|
||||||
# Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('was', '', ''), ('a', '', ''), ('member', '', ''), ('of', '', ''), ('University', '', ''), ('of', '', ''), ('Kentucky', '', ''), ("'s", '', ''), ('golf', '', ''), ('team', '', ''), ('.', '', '')]
|
|
|
@ -1,195 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
"""Using the parser to recognise your own semantics
|
|
||||||
|
|
||||||
spaCy's parser component can be trained to predict any type of tree
|
|
||||||
structure over your input text. You can also predict trees over whole documents
|
|
||||||
or chat logs, with connections between the sentence-roots used to annotate
|
|
||||||
discourse structure. In this example, we'll build a message parser for a common
|
|
||||||
"chat intent": finding local businesses. Our message semantics will have the
|
|
||||||
following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION, PRODUCT.
|
|
||||||
|
|
||||||
"show me the best hotel in berlin"
|
|
||||||
('show', 'ROOT', 'show')
|
|
||||||
('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
|
|
||||||
('hotel', 'PLACE', 'show') --> show PLACE hotel
|
|
||||||
('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
from pathlib import Path
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
# Training examples for the message parser. Each entry pairs a text with
#   "heads": for every token, the index of the token it attaches to
#   "deps":  for every token, the label of that attachment
# Tokens without a relation get an arbitrary placeholder label, here "-".
TRAIN_DATA = [
    ("find a cafe with great wifi", {
        "heads": [0, 2, 0, 5, 5, 2],
        "deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"],
    }),
    ("find a hotel near the beach", {
        "heads": [0, 2, 0, 5, 5, 2],
        "deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"],
    }),
    ("find me the closest gym that's open late", {
        "heads": [0, 0, 4, 4, 0, 6, 4, 6, 6],
        "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "ATTRIBUTE", "TIME"],
    }),
    ("show me the cheapest store that sells flowers", {
        # "flowers" is attached to "store"
        "heads": [0, 0, 4, 4, 0, 4, 4, 4],
        "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"],
    }),
    ("find a nice restaurant in london", {
        "heads": [0, 3, 3, 0, 3, 3],
        "deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"],
    }),
    ("show me the coolest hostel in berlin", {
        "heads": [0, 0, 4, 4, 0, 4, 4],
        "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"],
    }),
    ("find a good italian restaurant near work", {
        "heads": [0, 4, 4, 4, 0, 4, 5],
        "deps": ["ROOT", "-", "QUALITY", "ATTRIBUTE", "PLACE", "ATTRIBUTE", "LOCATION"],
    }),
]
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=15):
    """Train a fresh parser on the custom relations, then test and optionally save it."""
    if model is None:
        nlp = spacy.blank("en")  # start from an empty English pipeline
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # start from an existing pipeline
        print("Loaded model '%s'" % model)

    # Always train a brand-new parser instance: drop any existing one and
    # put the replacement at the front of the pipeline.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every relation label that occurs in the training data.
    for _, example_annots in TRAIN_DATA:
        for label in example_annots.get("deps", []):
            parser.add_label(label)

    keep_enabled = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    to_disable = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*to_disable):  # only the parser is updated
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # spaCy's minibatch with a compounding batch size
            for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # check the freshly trained pipeline
    test_model(nlp)

    if output_dir is None:
        return

    # write the pipeline to disk
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # reload it and check again
    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    test_model(reloaded)
|
|
||||||
|
|
||||||
|
|
||||||
def test_model(nlp):
    """Parse three sample messages with `nlp` and print their labelled relations.

    For each message the text is printed, followed by a list of
    (token, relation, head) triples; tokens labelled "-" are left out.
    """
    samples = [
        "find a hotel with good wifi",
        "find me the cheapest gym near work",
        "show me the best hotel in berlin",
    ]
    for parsed in nlp.pipe(samples):
        print(parsed.text)
        relations = [
            (tok.text, tok.dep_, tok.head.text) for tok in parsed if tok.dep_ != "-"
        ]
        print(relations)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# find a hotel with good wifi
|
|
||||||
# [
|
|
||||||
# ('find', 'ROOT', 'find'),
|
|
||||||
# ('hotel', 'PLACE', 'find'),
|
|
||||||
# ('good', 'QUALITY', 'wifi'),
|
|
||||||
# ('wifi', 'ATTRIBUTE', 'hotel')
|
|
||||||
# ]
|
|
||||||
# find me the cheapest gym near work
|
|
||||||
# [
|
|
||||||
# ('find', 'ROOT', 'find'),
|
|
||||||
# ('cheapest', 'QUALITY', 'gym'),
|
|
||||||
# ('gym', 'PLACE', 'find'),
|
|
||||||
# ('near', 'ATTRIBUTE', 'gym'),
|
|
||||||
# ('work', 'LOCATION', 'near')
|
|
||||||
# ]
|
|
||||||
# show me the best hotel in berlin
|
|
||||||
# [
|
|
||||||
# ('show', 'ROOT', 'show'),
|
|
||||||
# ('best', 'QUALITY', 'hotel'),
|
|
||||||
# ('hotel', 'PLACE', 'show'),
|
|
||||||
# ('berlin', 'LOCATION', 'hotel')
|
|
||||||
# ]
|
|
|
@ -1,117 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Example of training spaCy's named entity recognizer, starting off with an
|
|
||||||
existing model or a blank model.
|
|
||||||
|
|
||||||
For more details, see the documentation:
|
|
||||||
* Training: https://spacy.io/usage/training
|
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.2.4
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
import warnings
|
|
||||||
from pathlib import Path
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
# Training examples: (text, {"entities": [(start_char, end_char, label), ...]})
TRAIN_DATA = [
    (
        "Who is Shaka Khan?",
        {"entities": [(7, 17, "PERSON")]},
    ),
    (
        "I like London and Berlin.",
        {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]},
    ),
]
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    model: name or path of an existing model; a blank 'en' model if None.
    output_dir: if given, the trained pipeline is saved there and reloaded
        as a check.
    n_iter: number of passes over TRAIN_DATA.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        # Default to an empty list so an example without an "entities" key
        # is skipped instead of raising TypeError on iterating None.
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail on a
            # missing intermediate directory
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# Entities [('Shaka Khan', 'PERSON')]
|
|
||||||
# Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
|
|
||||||
# ('Khan', 'PERSON', 1), ('?', '', 2)]
|
|
||||||
# Entities [('London', 'LOC'), ('Berlin', 'LOC')]
|
|
||||||
# Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
|
|
||||||
# ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
|
|
|
@ -1,144 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Example of training an additional entity type
|
|
||||||
|
|
||||||
This script shows how to add a new entity type to an existing pretrained NER
|
|
||||||
model. To keep the example short and simple, only six sentences are provided
|
|
||||||
as examples. In practice, you'll need many more — a few hundred would be a
|
|
||||||
good start. You will also likely need to mix in examples of other entity
|
|
||||||
types, which might be obtained by running the entity recognizer over unlabelled
|
|
||||||
sentences, and adding their annotations to the training set.
|
|
||||||
|
|
||||||
The actual training is performed by looping over the examples, and calling
|
|
||||||
`nlp.update()`. The `update()` method steps through the words of the
|
|
||||||
input. At each word, it makes a prediction. It then consults the annotations
|
|
||||||
provided on the GoldParse instance, to see whether it was right. If it was
|
|
||||||
wrong, it adjusts its weights so that the correct action will score higher
|
|
||||||
next time.
|
|
||||||
|
|
||||||
After training your model, you can save it to a directory. We recommend
|
|
||||||
wrapping models as Python packages, for ease of deployment.
|
|
||||||
|
|
||||||
For more details, see the documentation:
|
|
||||||
* Training: https://spacy.io/usage/training
|
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.1.0+
|
|
||||||
Last tested with: v2.2.4
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
import warnings
|
|
||||||
from pathlib import Path
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
# Label of the entity type this example adds.
LABEL = "ANIMAL"

# Training examples: (text, {"entities": [(start_char, end_char, label)]}).
# When starting from an existing model, also mix in examples of the entity
# types spaCy already recognized correctly; otherwise the model may pick up
# the new type but "forget" what it knew before.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings", {
        "entities": [(0, 6, LABEL)],
    }),
    ("Do they bite?", {
        "entities": [],
    }),
    ("horses are too tall and they pretend to care about your feelings", {
        "entities": [(0, 6, LABEL)],
    }),
    ("horses pretend to care about your feelings", {
        "entities": [(0, 6, LABEL)],
    }),
    ("they pretend to care about your feelings, those horses", {
        "entities": [(48, 54, LABEL)],
    }),
    ("horses?", {
        "entities": [(0, 6, LABEL)],
    }),
]
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Add the new entity label to an NER pipeline, train it, then test and optionally save it."""
    random.seed(0)
    if model is None:
        nlp = spacy.blank("en")  # start from an empty English pipeline
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # start from an existing pipeline
        print("Loaded model '%s'" % model)

    # Reuse the pipeline's entity recognizer if it has one; otherwise create
    # the built-in one and append it.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)

    # Register the new label, plus an extra one that never occurs in the
    # data: extraneous labels shouldn't mess anything up.
    for label in (LABEL, "VEGETABLE"):
        ner.add_label(label)

    optimizer = nlp.begin_training() if model is None else nlp.resume_training()
    move_names = list(ner.move_names)

    # every pipe outside this list is switched off while training
    keep_enabled = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    to_disable = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*to_disable), warnings.catch_warnings():
        # report misaligned-entity-span warnings only once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # one batch-size schedule shared by all iterations
        sizes = compounding(1.0, 4.0, 1.001)
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in minibatch(TRAIN_DATA, size=sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # try the trained pipeline on an unseen sentence
    test_text = "Do you like horses?"
    print_doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in print_doc.ents:
        print(ent.label_, ent.text)

    if output_dir is None:
        return

    # write the renamed pipeline to disk
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # reload it and confirm the NER classes round-tripped unchanged
    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    assert reloaded.get_pipe("ner").move_names == move_names
    for ent in reloaded(test_text).ents:
        print(ent.label_, ent.text)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,111 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Example of training spaCy dependency parser, starting off with an existing
|
|
||||||
model or a blank model. For more details, see the documentation:
|
|
||||||
* Training: https://spacy.io/usage/training
|
|
||||||
* Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
from pathlib import Path
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
# Training examples: (text, {"heads": [...], "deps": [...]}), with one head
# index and one dependency label per token.
TRAIN_DATA = [
    ("They trade mortgage-backed securities.", {
        "heads": [1, 1, 4, 4, 5, 1, 1],
        "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
    }),
    ("I like London and Berlin.", {
        "heads": [1, 1, 1, 2, 2, 1],
        "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
    }),
]
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser.

    model: name or path handed to ``spacy.load``; when None a blank 'en'
        pipeline is created instead.
    output_dir: directory the trained pipeline is written to and then loaded
        back from as a round-trip check; nothing is saved when None.
    n_iter: number of passes over TRAIN_DATA.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            # NOTE: shuffles the module-level TRAIN_DATA list in place
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True: a nested output path must not make the run fail
            # here, after all the training work has already been done
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# expected result:
|
|
||||||
# [
|
|
||||||
# ('I', 'nsubj', 'like'),
|
|
||||||
# ('like', 'ROOT', 'like'),
|
|
||||||
# ('securities', 'dobj', 'like'),
|
|
||||||
# ('.', 'punct', 'like')
|
|
||||||
# ]
|
|
|
@ -1,101 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""
|
|
||||||
A simple example for training a part-of-speech tagger with a custom tag map.
|
|
||||||
To allow us to update the tag map with our custom one, this example starts off
|
|
||||||
with a blank Language class and modifies its defaults. For more details, see
|
|
||||||
the documentation:
|
|
||||||
* Training: https://spacy.io/usage/training
|
|
||||||
* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
Last tested with: v2.1.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
from pathlib import Path
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
# You need to define a mapping from your data's part-of-speech tag names to the
|
|
||||||
# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
|
|
||||||
# See here for the Universal Tag Set:
|
|
||||||
# http://universaldependencies.github.io/docs/u/pos/index.html
|
|
||||||
# You may also specify morphological features for your tags, from the universal
|
|
||||||
# scheme.
|
|
||||||
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
|
|
||||||
|
|
||||||
# Usually you'll read this in, of course. Data formats vary. Ensure your
|
|
||||||
# strings are unicode and that the number of tags assigned matches spaCy's
|
|
||||||
# tokenization. If not, you can always add a 'words' key to the annotations
|
|
||||||
# that specifies the gold-standard tokenization, e.g.:
|
|
||||||
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
|
|
||||||
TRAIN_DATA = [
|
|
||||||
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
|
||||||
("Eat blue ham", {"tags": ["V", "J", "N"]}),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    lang=("ISO Code of language to use", "option", "l", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang="en", output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the tagger. In order to
    train the tagger with a custom tag map, we're creating a new Language
    instance with a custom vocab.

    lang: language code passed to ``spacy.blank``.
    output_dir: directory the trained pipeline is written to and then loaded
        back from as a round-trip check; nothing is saved when None.
    n_iter: number of passes over TRAIN_DATA.
    """
    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        # NOTE: shuffles the module-level TRAIN_DATA list in place
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True: a nested output path must not make the run fail
            # here, after all the training work has already been done
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
||||||
|
|
||||||
# Expected output:
|
|
||||||
# [
|
|
||||||
# ('I', 'N', 'NOUN'),
|
|
||||||
# ('like', 'V', 'VERB'),
|
|
||||||
# ('blue', 'J', 'ADJ'),
|
|
||||||
# ('eggs', 'N', 'NOUN')
|
|
||||||
# ]
|
|
|
@ -1,160 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Train a convolutional neural network text classifier on the
|
|
||||||
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
|
|
||||||
automatically via Thinc's built-in dataset loader. The model is added to
|
|
||||||
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
|
|
||||||
see the documentation:
|
|
||||||
* Training: https://spacy.io/usage/training
|
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
from pathlib import Path
|
|
||||||
import thinc.extra.datasets
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int),
    init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    """Train a text classifier on IMDB data and optionally save it.

    model: name or path handed to ``spacy.load``; when None a blank 'en'
        pipeline is created instead.
    output_dir: directory the trained pipeline is written to and then loaded
        back from as a round-trip check; nothing is saved when None.
    n_iter: number of training epochs.
    n_texts: upper bound on the number of training texts taken from the
        training split returned by load_data().
    init_tok2vec: file whose bytes are loaded into ``textcat.model.tok2vec``
        after ``begin_training``; skipped when None.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path is created in one go
            output_dir.mkdir(parents=True)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        # one generator shared by all epochs, so the batch size keeps
        # compounding across epochs instead of restarting at 4
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        # save with the optimizer's averaged parameters swapped in
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(limit=0, split=0.8):
    """Return shuffled IMDB training data as ((texts, cats), (texts, cats)).

    limit: keep only the last `limit` shuffled examples; the default 0 keeps
        everything because ``examples[-0:]`` is the whole list.
    split: fraction of the kept examples placed in the first (training)
        pair; the remainder forms the second (evaluation) pair.
    """
    # Only the first element returned by the loader is used; the evaluation
    # portion is partitioned off from it below.
    examples, _ = thinc.extra.datasets.imdb()
    random.shuffle(examples)
    examples = examples[-limit:]
    texts, labels = zip(*examples)
    cats = []
    for label in labels:
        is_positive = bool(label)
        cats.append({"POSITIVE": is_positive, "NEGATIVE": not is_positive})
    cutoff = int(len(examples) * split)
    train_part = (texts[:cutoff], cats[:cutoff])
    eval_part = (texts[cutoff:], cats[cutoff:])
    return train_part, eval_part
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(tokenizer, textcat, texts, cats):
    """Score textcat predictions for the POSITIVE label against gold cats.

    tokenizer: callable turning one text into a doc.
    textcat: object whose ``pipe(docs)`` yields docs carrying a ``cats``
        mapping of label -> score.
    texts: iterable of raw texts.
    cats: gold annotations, indexable in the same order as `texts`; each
        entry maps label -> truth value.

    Returns a dict with "textcat_p", "textcat_r" and "textcat_f".
    """
    docs = (tokenizer(text) for text in texts)
    # fp and fn start at a tiny epsilon so the divisions below never see a
    # zero denominator
    tally = {"tp": 0.0, "fp": 1e-8, "fn": 1e-8, "tn": 0.0}
    for index, doc in enumerate(textcat.pipe(docs)):
        gold = cats[index]
        for label, score in doc.cats.items():
            # only the POSITIVE label is scored; labels without gold are skipped
            if label not in gold or label == "NEGATIVE":
                continue
            if score >= 0.5:
                if gold[label] >= 0.5:
                    tally["tp"] += 1.0
                elif gold[label] < 0.5:
                    tally["fp"] += 1.0
            elif score < 0.5:
                if gold[label] < 0.5:
                    tally["tn"] += 1
                elif gold[label] >= 0.5:
                    tally["fn"] += 1
    true_pos = tally["tp"]
    precision = true_pos / (true_pos + tally["fp"])
    recall = true_pos / (true_pos + tally["fn"])
    denominator = precision + recall
    f_score = 0.0 if denominator == 0 else 2 * (precision * recall) / denominator
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,49 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Load vectors for a language trained using fastText
|
|
||||||
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
|
||||||
Compatible with: spaCy v2.0.0+
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import plac
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.language import Language
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    vectors_loc=("Path to .vec file", "positional", None, str),
    lang=(
        "Optional language ID. If not set, blank Language() will be used.",
        "positional",
        None,
        str,
    ),
)
def main(vectors_loc, lang=None):
    """Read a text-format vectors file into a pipeline's vocab and print one
    similarity score as a sanity check.

    vectors_loc: path to the vectors file; its first line holds two fields,
        the second being the vector width, and every later line is a word
        followed by that many space-separated numbers.
    lang: language ID for ``spacy.blank``; a bare ``Language()`` is used
        when None.
    """
    # create empty language class – this is required if you're planning to
    # save the model to disk and load it back later (models always need a
    # "lang" setting). Use 'xx' for blank multi-language class.
    nlp = Language() if lang is None else spacy.blank(lang)
    with open(vectors_loc, "rb") as file_:
        _, nr_dim = file_.readline().split()
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in file_:
            text_line = raw_line.rstrip().decode("utf8")
            # split from the right so a word containing spaces stays whole
            pieces = text_line.rsplit(" ", width)
            word, values = pieces[0], pieces[1:]
            vector = numpy.asarray([float(value) for value in values], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = "class colspan"
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,105 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf8
|
|
||||||
"""Visualize spaCy word vectors in Tensorboard.
|
|
||||||
|
|
||||||
Adapted from: https://gist.github.com/BrikerMan/7bd4e4bd0a00ac9076986148afc06507
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from os import path
|
|
||||||
|
|
||||||
import tqdm
|
|
||||||
import math
|
|
||||||
import numpy
|
|
||||||
import plac
|
|
||||||
import spacy
|
|
||||||
import tensorflow as tf
|
|
||||||
from tensorflow.contrib.tensorboard.plugins.projector import (
|
|
||||||
visualize_embeddings,
|
|
||||||
ProjectorConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
    out_loc=(
        "Path to output folder for tensorboard session data",
        "positional",
        None,
        str,
    ),
    name=(
        "Human readable name for tsv file and vectors tensor",
        "positional",
        None,
        str,
    ),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
    """Export a spaCy model's word vectors as a TensorBoard projector session.

    vectors_loc: model name or path passed to ``spacy.load``.
    out_loc: folder that receives the metadata .tsv, the checkpoint and the
        projector config; created if it does not exist yet.
    name: base name of the .tsv and .ckpt files and name of the tensor.
    """
    # only `path` is imported from os at module level
    from os import makedirs

    meta_file = "{}.tsv".format(name)
    out_meta_file = path.join(out_loc, meta_file)

    print("Loading spaCy vectors model: {}".format(vectors_loc))
    model = spacy.load(vectors_loc)
    print("Finding lexemes with vectors attached: {}".format(vectors_loc))
    strings_stream = tqdm.tqdm(
        model.vocab.strings, total=len(model.vocab.strings), leave=False
    )
    queries = [w for w in strings_stream if model.vocab.has_vector(w)]
    vector_count = len(queries)

    print(
        "Building Tensorboard Projector metadata for ({}) vectors: {}".format(
            vector_count, out_meta_file
        )
    )

    # Store vector data in a tensorflow variable
    tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1]))

    # The metadata file is opened before TensorFlow gets a chance to create
    # the output folder, so make sure it exists first.
    if not path.isdir(out_loc):
        makedirs(out_loc)

    # Write a tab-separated file that contains information about the vectors for visualization
    #
    # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
    with open(out_meta_file, "wb") as file_metadata:
        # Define columns in the first row
        file_metadata.write("Text\tFrequency\n".encode("utf-8"))
        # Write out a row for each vector that we add to the tensorflow variable we created
        progress = tqdm.tqdm(queries, total=len(queries), leave=False)
        for vec_index, text in enumerate(progress):
            # Look up lexeme and vector with the ORIGINAL string; the
            # placeholder below is only a display label.
            lex = model.vocab[text]

            # Store vector data and metadata
            tf_vectors_variable[vec_index] = model.vocab.get_vector(text)
            # https://github.com/tensorflow/tensorflow/issues/9094
            label = "<Space>" if text.lstrip() == "" else text
            file_metadata.write(
                "{}\t{}\n".format(label, math.exp(lex.prob) * vector_count).encode(
                    "utf-8"
                )
            )

    print("Running Tensorflow Session...")
    sess = tf.InteractiveSession()
    tf.Variable(tf_vectors_variable, trainable=False, name=name)
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(out_loc, sess.graph)

    # Link the embeddings into the config
    config = ProjectorConfig()
    embed = config.embeddings.add()
    embed.tensor_name = name
    embed.metadata_path = meta_file

    # Tell the projector about the configured embeddings and metadata file
    visualize_embeddings(writer, config)

    # Save session and print run command to the output
    print("Saving Tensorboard Session...")
    saver.save(sess, path.join(out_loc, "{}.ckpt".format(name)))
    print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
plac.call(main)
|
|
|
@ -1,20 +1,21 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import plac
|
import plac
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.gold import docs_to_json
|
from spacy.training import docs_to_json
|
||||||
import srsly
|
import srsly
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("Model name. Defaults to 'en'.", "option", "m", str),
|
model=("Model name. Defaults to 'en'.", "option", "m", str),
|
||||||
input_file=("Input file (jsonl)", "positional", None, Path),
|
input_file=("Input file (jsonl)", "positional", None, Path),
|
||||||
output_dir=("Output directory", "positional", None, Path),
|
output_dir=("Output directory", "positional", None, Path),
|
||||||
n_texts=("Number of texts to convert", "option", "t", int),
|
n_texts=("Number of texts to convert", "option", "t", int),
|
||||||
)
|
)
|
||||||
def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
|
||||||
# Load model with tokenizer + sentencizer only
|
# Load model with tokenizer + sentencizer only
|
||||||
nlp = spacy.load(model)
|
nlp = spacy.load(model)
|
||||||
nlp.disable_pipes(*nlp.pipe_names)
|
nlp.select_pipes(disable=nlp.pipe_names)
|
||||||
sentencizer = nlp.create_pipe("sentencizer")
|
sentencizer = nlp.create_pipe("sentencizer")
|
||||||
nlp.add_pipe(sentencizer, first=True)
|
nlp.add_pipe(sentencizer, first=True)
|
||||||
|
|
||||||
|
@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
||||||
|
|
||||||
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
|
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
plac.call(convert)
|
plac.call(convert)
|
154
fabfile.py
vendored
154
fabfile.py
vendored
|
@ -1,154 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import contextlib
|
|
||||||
from pathlib import Path
|
|
||||||
from fabric.api import local, lcd, env, settings, prefix
|
|
||||||
from os import path, environ
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
PWD = path.dirname(__file__)
|
|
||||||
ENV = environ["VENV_DIR"] if "VENV_DIR" in environ else ".env"
|
|
||||||
VENV_DIR = Path(PWD) / ENV
|
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
def virtualenv(name, create=False, python="/usr/bin/python3.6"):
    """Yield a function that runs a shell command inside the project venv.

    name: accepted for the callers' benefit but not used; the environment
        is always the module-level VENV_DIR.
    create: when true, delete any existing VENV_DIR and recreate it with
        ``<python> -m venv`` before yielding.
    python: interpreter used to create the environment when `create` is set.

    The yielded callable takes ``(cmd, env_vars=None, capture=False,
    direct=False)``; `env_vars` and `direct` are accepted and ignored.
    """
    python = Path(python).resolve()
    env_path = VENV_DIR
    if create:
        if env_path.exists():
            shutil.rmtree(str(env_path))
        local("{python} -m venv {env_path}".format(python=python, env_path=VENV_DIR))

    def wrapped_local(cmd, env_vars=None, capture=False, direct=False):
        # Activate the venv in the same bash invocation as the command.
        # `capture` is forwarded so callers that ask for the output
        # (e.g. to print it) actually receive it.
        return local(
            "source {}/bin/activate && {}".format(env_path, cmd),
            shell="/bin/bash",
            capture=capture,
        )

    yield wrapped_local
|
|
||||||
|
|
||||||
|
|
||||||
def env(lang="python3.6"):
    """Recreate the virtual environment at VENV_DIR and install build tools.

    lang: interpreter command; names starting with "python3" use the
        built-in ``venv`` module, anything else installs and uses
        ``virtualenv``.
    """
    # NOTE: this task name rebinds the `env` imported from fabric.api.
    if VENV_DIR.exists():
        local("rm -rf {}".format(VENV_DIR))
    if not lang.startswith("python3"):
        local("{} -m pip install virtualenv --no-cache-dir".format(lang))
        local("{} -m virtualenv {} --no-cache-dir".format(lang, VENV_DIR))
    else:
        local("{} -m venv {}".format(lang, VENV_DIR))
    pip_targets = (
        "--upgrade setuptools",
        "pytest",
        "wheel",
        "-r requirements.txt",
        "pex",
    )
    with virtualenv(VENV_DIR) as run_in_venv:
        print(run_in_venv("python --version", capture=True))
        for target in pip_targets:
            run_in_venv("pip install {} --no-cache-dir".format(target))
|
|
||||||
|
|
||||||
|
|
||||||
def install():
    """Install the source distribution(s) found in dist/ into the venv."""
    command = "pip install dist/*.tar.gz"
    with virtualenv(VENV_DIR) as run_in_venv:
        run_in_venv(command)
|
|
||||||
|
|
||||||
|
|
||||||
def make():
    """Build the extension modules in place from the fabfile's directory.

    Activates the environment at VENV_DIR, so a VENV_DIR override from the
    process environment is honored here as it is by the other tasks.
    """
    with lcd(path.dirname(__file__)):
        local(
            "export PYTHONPATH=`pwd` && source {env}/bin/activate "
            "&& python setup.py build_ext --inplace".format(env=VENV_DIR),
            shell="/bin/bash",
        )
|
|
||||||
|
|
||||||
|
|
||||||
def sdist():
    """Build a source distribution from the fabfile's directory."""
    project_dir = path.dirname(__file__)
    with virtualenv(VENV_DIR) as run_in_venv, lcd(project_dir):
        for command in (
            "python -m pip install -U setuptools srsly",
            "python setup.py sdist",
        ):
            run_in_venv(command)
|
|
||||||
|
|
||||||
|
|
||||||
def wheel():
    """Build a wheel from the fabfile's directory."""
    project_dir = path.dirname(__file__)
    with virtualenv(VENV_DIR) as run_in_venv, lcd(project_dir):
        run_in_venv("python setup.py bdist_wheel")
|
|
||||||
|
|
||||||
|
|
||||||
def pex():
    """Bundle the wheel(s) in dist/ into a .pex named after the current
    short git commit hash."""
    project_dir = path.dirname(__file__)
    with virtualenv(VENV_DIR) as run_in_venv, lcd(project_dir):
        commit = local("git rev-parse --short HEAD", capture=True)
        target = "dist/spacy-%s.pex" % commit
        run_in_venv("pex dist/*.whl -e spacy -o " + target, direct=True)
|
|
||||||
|
|
||||||
|
|
||||||
def clean():
    """Delete built wheels and pex files, then run setup.py's clean."""
    with lcd(path.dirname(__file__)):
        for pattern in ("dist/*.whl", "dist/*.pex"):
            local("rm -f " + pattern)
        with virtualenv(VENV_DIR) as run_in_venv:
            run_in_venv("python setup.py clean --all")
|
|
||||||
|
|
||||||
|
|
||||||
def test():
    """Run the spacy/tests suite inside the venv, stopping at the first failure."""
    project_dir = path.dirname(__file__)
    with virtualenv(VENV_DIR) as run_in_venv, lcd(project_dir):
        run_in_venv("pytest -x spacy/tests")
|
|
||||||
|
|
||||||
|
|
||||||
def train():
    """Run ``spacy train`` in the venv with arguments taken from the
    SPACY_TRAIN_ARGS environment variable (empty when unset)."""
    train_args = environ.get("SPACY_TRAIN_ARGS", "")
    command = "spacy train {}".format(train_args)
    with virtualenv(VENV_DIR) as run_in_venv:
        run_in_venv(command)
|
|
||||||
|
|
||||||
|
|
||||||
def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=""):
    """Train and test UD models for the CoNLL 2017 setup from a clean checkout.

    treebank_dir: treebank root; must contain "ud-test-v2.0-conll2017".
    experiment_dir: parent folder; results go into a sub-folder named from
        the first six characters of the config's sha256sum output and the
        short git commit hash.
    vectors_dir: passed to ``spacy ud-train`` via ``-v``.
    config: path of the config file; copied into the run folder as
        config.json.
    corpus: single corpus to run; when empty a built-in list of four
        corpora is used.

    Exits with status 1 if ``git status --porcelain`` reports any changes.
    """
    is_not_clean = local("git status --porcelain", capture=True)
    if is_not_clean:
        print("Repository is not clean")
        print(is_not_clean)
        sys.exit(1)
    git_sha = local("git rev-parse --short HEAD", capture=True)
    config_checksum = local("sha256sum {config}".format(config=config), capture=True)
    experiment_dir = Path(experiment_dir) / "{}--{}".format(
        config_checksum[:6], git_sha
    )
    if not experiment_dir.exists():
        # parents=True: the caller-supplied parent folder may not exist yet
        experiment_dir.mkdir(parents=True)
    test_data_dir = Path(treebank_dir) / "ud-test-v2.0-conll2017"
    assert test_data_dir.exists()
    assert test_data_dir.is_dir()
    if corpus:
        corpora = [corpus]
    else:
        corpora = ["UD_English", "UD_Chinese", "UD_Japanese", "UD_Vietnamese"]

    local(
        "cp {config} {experiment_dir}/config.json".format(
            config=config, experiment_dir=experiment_dir
        )
    )
    with virtualenv(VENV_DIR) as venv_local:
        # separate loop name so the `corpus` parameter is not rebound
        for corpus_name in corpora:
            venv_local(
                "spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}".format(
                    treebank_dir=treebank_dir,
                    experiment_dir=experiment_dir,
                    config=config,
                    corpus=corpus_name,
                    vectors_dir=vectors_dir,
                )
            )
            venv_local(
                "spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}".format(
                    test_data_dir=test_data_dir,
                    experiment_dir=experiment_dir,
                    corpus=corpus_name,
                )
            )
|
|
|
@ -1,259 +0,0 @@
|
||||||
// ISO C9x compliant stdint.h for Microsoft Visual Studio
|
|
||||||
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
|
|
||||||
//
|
|
||||||
// Copyright (c) 2006-2013 Alexander Chemeris
|
|
||||||
//
|
|
||||||
// Redistribution and use in source and binary forms, with or without
|
|
||||||
// modification, are permitted provided that the following conditions are met:
|
|
||||||
//
|
|
||||||
// 1. Redistributions of source code must retain the above copyright notice,
|
|
||||||
// this list of conditions and the following disclaimer.
|
|
||||||
//
|
|
||||||
// 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
// notice, this list of conditions and the following disclaimer in the
|
|
||||||
// documentation and/or other materials provided with the distribution.
|
|
||||||
//
|
|
||||||
// 3. Neither the name of the product nor the names of its contributors may
|
|
||||||
// be used to endorse or promote products derived from this software
|
|
||||||
// without specific prior written permission.
|
|
||||||
//
|
|
||||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
|
||||||
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
|
||||||
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
|
||||||
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
||||||
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
||||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
||||||
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
||||||
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
//
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
#ifndef _MSC_VER // [
|
|
||||||
#error "Use this header only with Microsoft Visual C++ compilers!"
|
|
||||||
#endif // _MSC_VER ]
|
|
||||||
|
|
||||||
#ifndef _MSC_STDINT_H_ // [
|
|
||||||
#define _MSC_STDINT_H_
|
|
||||||
|
|
||||||
#if _MSC_VER > 1000
|
|
||||||
#pragma once
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if _MSC_VER >= 1600 // [
|
|
||||||
#include <stdint.h>
|
|
||||||
#else // ] _MSC_VER >= 1600 [
|
|
||||||
|
|
||||||
#include <limits.h>
|
|
||||||
|
|
||||||
// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
|
|
||||||
// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
|
|
||||||
// or compiler give many errors like this:
|
|
||||||
// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
# include <wchar.h>
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Define _W64 macros to mark types changing their size, like intptr_t.
|
|
||||||
#ifndef _W64
|
|
||||||
# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
|
|
||||||
# define _W64 __w64
|
|
||||||
# else
|
|
||||||
# define _W64
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
// 7.18.1 Integer types
|
|
||||||
|
|
||||||
// 7.18.1.1 Exact-width integer types
|
|
||||||
|
|
||||||
// Visual Studio 6 and Embedded Visual C++ 4 doesn't
|
|
||||||
// realize that, e.g. char has the same size as __int8
|
|
||||||
// so we give up on __intX for them.
|
|
||||||
#if (_MSC_VER < 1300)
|
|
||||||
typedef signed char int8_t;
|
|
||||||
typedef signed short int16_t;
|
|
||||||
typedef signed int int32_t;
|
|
||||||
typedef unsigned char uint8_t;
|
|
||||||
typedef unsigned short uint16_t;
|
|
||||||
typedef unsigned int uint32_t;
|
|
||||||
#else
|
|
||||||
typedef signed __int8 int8_t;
|
|
||||||
typedef signed __int16 int16_t;
|
|
||||||
typedef signed __int32 int32_t;
|
|
||||||
typedef unsigned __int8 uint8_t;
|
|
||||||
typedef unsigned __int16 uint16_t;
|
|
||||||
typedef unsigned __int32 uint32_t;
|
|
||||||
#endif
|
|
||||||
typedef signed __int64 int64_t;
|
|
||||||
typedef unsigned __int64 uint64_t;
|
|
||||||
|
|
||||||
|
|
||||||
// 7.18.1.2 Minimum-width integer types
|
|
||||||
typedef int8_t int_least8_t;
|
|
||||||
typedef int16_t int_least16_t;
|
|
||||||
typedef int32_t int_least32_t;
|
|
||||||
typedef int64_t int_least64_t;
|
|
||||||
typedef uint8_t uint_least8_t;
|
|
||||||
typedef uint16_t uint_least16_t;
|
|
||||||
typedef uint32_t uint_least32_t;
|
|
||||||
typedef uint64_t uint_least64_t;
|
|
||||||
|
|
||||||
// 7.18.1.3 Fastest minimum-width integer types
|
|
||||||
typedef int8_t int_fast8_t;
|
|
||||||
typedef int16_t int_fast16_t;
|
|
||||||
typedef int32_t int_fast32_t;
|
|
||||||
typedef int64_t int_fast64_t;
|
|
||||||
typedef uint8_t uint_fast8_t;
|
|
||||||
typedef uint16_t uint_fast16_t;
|
|
||||||
typedef uint32_t uint_fast32_t;
|
|
||||||
typedef uint64_t uint_fast64_t;
|
|
||||||
|
|
||||||
// 7.18.1.4 Integer types capable of holding object pointers
|
|
||||||
#ifdef _WIN64 // [
|
|
||||||
typedef signed __int64 intptr_t;
|
|
||||||
typedef unsigned __int64 uintptr_t;
|
|
||||||
#else // _WIN64 ][
|
|
||||||
typedef _W64 signed int intptr_t;
|
|
||||||
typedef _W64 unsigned int uintptr_t;
|
|
||||||
#endif // _WIN64 ]
|
|
||||||
|
|
||||||
// 7.18.1.5 Greatest-width integer types
|
|
||||||
typedef int64_t intmax_t;
|
|
||||||
typedef uint64_t uintmax_t;
|
|
||||||
|
|
||||||
|
|
||||||
// 7.18.2 Limits of specified-width integer types
|
|
||||||
|
|
||||||
#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
|
|
||||||
|
|
||||||
// 7.18.2.1 Limits of exact-width integer types
|
|
||||||
#define INT8_MIN ((int8_t)_I8_MIN)
|
|
||||||
#define INT8_MAX _I8_MAX
|
|
||||||
#define INT16_MIN ((int16_t)_I16_MIN)
|
|
||||||
#define INT16_MAX _I16_MAX
|
|
||||||
#define INT32_MIN ((int32_t)_I32_MIN)
|
|
||||||
#define INT32_MAX _I32_MAX
|
|
||||||
#define INT64_MIN ((int64_t)_I64_MIN)
|
|
||||||
#define INT64_MAX _I64_MAX
|
|
||||||
#define UINT8_MAX _UI8_MAX
|
|
||||||
#define UINT16_MAX _UI16_MAX
|
|
||||||
#define UINT32_MAX _UI32_MAX
|
|
||||||
#define UINT64_MAX _UI64_MAX
|
|
||||||
|
|
||||||
// 7.18.2.2 Limits of minimum-width integer types
|
|
||||||
#define INT_LEAST8_MIN INT8_MIN
|
|
||||||
#define INT_LEAST8_MAX INT8_MAX
|
|
||||||
#define INT_LEAST16_MIN INT16_MIN
|
|
||||||
#define INT_LEAST16_MAX INT16_MAX
|
|
||||||
#define INT_LEAST32_MIN INT32_MIN
|
|
||||||
#define INT_LEAST32_MAX INT32_MAX
|
|
||||||
#define INT_LEAST64_MIN INT64_MIN
|
|
||||||
#define INT_LEAST64_MAX INT64_MAX
|
|
||||||
#define UINT_LEAST8_MAX UINT8_MAX
|
|
||||||
#define UINT_LEAST16_MAX UINT16_MAX
|
|
||||||
#define UINT_LEAST32_MAX UINT32_MAX
|
|
||||||
#define UINT_LEAST64_MAX UINT64_MAX
|
|
||||||
|
|
||||||
// 7.18.2.3 Limits of fastest minimum-width integer types
|
|
||||||
#define INT_FAST8_MIN INT8_MIN
|
|
||||||
#define INT_FAST8_MAX INT8_MAX
|
|
||||||
#define INT_FAST16_MIN INT16_MIN
|
|
||||||
#define INT_FAST16_MAX INT16_MAX
|
|
||||||
#define INT_FAST32_MIN INT32_MIN
|
|
||||||
#define INT_FAST32_MAX INT32_MAX
|
|
||||||
#define INT_FAST64_MIN INT64_MIN
|
|
||||||
#define INT_FAST64_MAX INT64_MAX
|
|
||||||
#define UINT_FAST8_MAX UINT8_MAX
|
|
||||||
#define UINT_FAST16_MAX UINT16_MAX
|
|
||||||
#define UINT_FAST32_MAX UINT32_MAX
|
|
||||||
#define UINT_FAST64_MAX UINT64_MAX
|
|
||||||
|
|
||||||
// 7.18.2.4 Limits of integer types capable of holding object pointers
|
|
||||||
#ifdef _WIN64 // [
|
|
||||||
# define INTPTR_MIN INT64_MIN
|
|
||||||
# define INTPTR_MAX INT64_MAX
|
|
||||||
# define UINTPTR_MAX UINT64_MAX
|
|
||||||
#else // _WIN64 ][
|
|
||||||
# define INTPTR_MIN INT32_MIN
|
|
||||||
# define INTPTR_MAX INT32_MAX
|
|
||||||
# define UINTPTR_MAX UINT32_MAX
|
|
||||||
#endif // _WIN64 ]
|
|
||||||
|
|
||||||
// 7.18.2.5 Limits of greatest-width integer types
|
|
||||||
#define INTMAX_MIN INT64_MIN
|
|
||||||
#define INTMAX_MAX INT64_MAX
|
|
||||||
#define UINTMAX_MAX UINT64_MAX
|
|
||||||
|
|
||||||
// 7.18.3 Limits of other integer types
|
|
||||||
|
|
||||||
#ifdef _WIN64 // [
|
|
||||||
# define PTRDIFF_MIN _I64_MIN
|
|
||||||
# define PTRDIFF_MAX _I64_MAX
|
|
||||||
#else // _WIN64 ][
|
|
||||||
# define PTRDIFF_MIN _I32_MIN
|
|
||||||
# define PTRDIFF_MAX _I32_MAX
|
|
||||||
#endif // _WIN64 ]
|
|
||||||
|
|
||||||
#define SIG_ATOMIC_MIN INT_MIN
|
|
||||||
#define SIG_ATOMIC_MAX INT_MAX
|
|
||||||
|
|
||||||
#ifndef SIZE_MAX // [
|
|
||||||
# ifdef _WIN64 // [
|
|
||||||
# define SIZE_MAX _UI64_MAX
|
|
||||||
# else // _WIN64 ][
|
|
||||||
# define SIZE_MAX _UI32_MAX
|
|
||||||
# endif // _WIN64 ]
|
|
||||||
#endif // SIZE_MAX ]
|
|
||||||
|
|
||||||
// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
|
|
||||||
#ifndef WCHAR_MIN // [
|
|
||||||
# define WCHAR_MIN 0
|
|
||||||
#endif // WCHAR_MIN ]
|
|
||||||
#ifndef WCHAR_MAX // [
|
|
||||||
# define WCHAR_MAX _UI16_MAX
|
|
||||||
#endif // WCHAR_MAX ]
|
|
||||||
|
|
||||||
#define WINT_MIN 0
|
|
||||||
#define WINT_MAX _UI16_MAX
|
|
||||||
|
|
||||||
#endif // __STDC_LIMIT_MACROS ]
|
|
||||||
|
|
||||||
|
|
||||||
// 7.18.4 Macros for integer constants
|
|
||||||
|
|
||||||
#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
|
|
||||||
|
|
||||||
// 7.18.4.1 Macros for minimum-width integer constants
|
|
||||||
|
|
||||||
#define INT8_C(val) val##i8
|
|
||||||
#define INT16_C(val) val##i16
|
|
||||||
#define INT32_C(val) val##i32
|
|
||||||
#define INT64_C(val) val##i64
|
|
||||||
|
|
||||||
#define UINT8_C(val) val##ui8
|
|
||||||
#define UINT16_C(val) val##ui16
|
|
||||||
#define UINT32_C(val) val##ui32
|
|
||||||
#define UINT64_C(val) val##ui64
|
|
||||||
|
|
||||||
// 7.18.4.2 Macros for greatest-width integer constants
|
|
||||||
// These #ifndef's are needed to prevent collisions with <boost/cstdint.hpp>.
|
|
||||||
// Check out Issue 9 for the details.
|
|
||||||
#ifndef INTMAX_C // [
|
|
||||||
# define INTMAX_C INT64_C
|
|
||||||
#endif // INTMAX_C ]
|
|
||||||
#ifndef UINTMAX_C // [
|
|
||||||
# define UINTMAX_C UINT64_C
|
|
||||||
#endif // UINTMAX_C ]
|
|
||||||
|
|
||||||
#endif // __STDC_CONSTANT_MACROS ]
|
|
||||||
|
|
||||||
#endif // _MSC_VER >= 1600 ]
|
|
||||||
|
|
||||||
#endif // _MSC_STDINT_H_ ]
|
|
|
@ -1,22 +0,0 @@
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
// MurmurHash2 was written by Austin Appleby, and is placed in the public
|
|
||||||
// domain. The author hereby disclaims copyright to this source code.
|
|
||||||
|
|
||||||
#ifndef _MURMURHASH2_H_
|
|
||||||
#define _MURMURHASH2_H_
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed );
|
|
||||||
uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed );
|
|
||||||
uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed );
|
|
||||||
uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed );
|
|
||||||
uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed );
|
|
||||||
uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed );
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
#endif // _MURMURHASH2_H_
|
|
||||||
|
|
|
@ -1,28 +0,0 @@
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
|
||||||
// domain. The author hereby disclaims copyright to this source code.
|
|
||||||
|
|
||||||
#ifndef _MURMURHASH3_H_
|
|
||||||
#define _MURMURHASH3_H_
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
|
|
||||||
|
|
||||||
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
|
|
||||||
|
|
||||||
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
#endif // _MURMURHASH3_H_
|
|
|
@ -36,3 +36,44 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
|
scikit-learn
|
||||||
|
------------
|
||||||
|
|
||||||
|
* Files: scorer.py
|
||||||
|
|
||||||
|
The following implementation of roc_auc_score() is adapted from
|
||||||
|
scikit-learn, which is distributed under the following license:
|
||||||
|
|
||||||
|
New BSD License
|
||||||
|
|
||||||
|
Copyright (c) 2007–2019 The scikit-learn developers.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
a. Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
b. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
c. Neither the name of the Scikit-learn Developers nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
||||||
|
DAMAGE.
|
||||||
|
|
17
netlify.toml
17
netlify.toml
|
@ -2,7 +2,7 @@ redirects = [
|
||||||
# Netlify
|
# Netlify
|
||||||
{from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true },
|
{from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true },
|
||||||
# Subdomain for branches
|
# Subdomain for branches
|
||||||
{from = "https://nightly.spacy.io/*", to="https://nightly-spacy-io.spacy.io/:splat", force = true, status = 200},
|
{from = "https://nightly.spacy.io/*", to="https://spacy.io/:splat", force = true},
|
||||||
{from = "https://v2.spacy.io/*", to="https://v2-spacy-io.spacy.io/:splat", force = true, status = 200},
|
{from = "https://v2.spacy.io/*", to="https://v2-spacy-io.spacy.io/:splat", force = true, status = 200},
|
||||||
# Old subdomains
|
# Old subdomains
|
||||||
{from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true},
|
{from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true},
|
||||||
|
@ -25,7 +25,7 @@ redirects = [
|
||||||
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
|
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
|
||||||
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
|
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||||
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
|
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||||
{from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
|
{from = "/docs/usage/training-ner", to = "/usage/training", force = true},
|
||||||
{from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
|
{from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
|
||||||
{from = "/docs/usage/data-model", to = "/api", force = true},
|
{from = "/docs/usage/data-model", to = "/api", force = true},
|
||||||
{from = "/docs/usage/cli", to = "/api/cli", force = true},
|
{from = "/docs/usage/cli", to = "/api/cli", force = true},
|
||||||
|
@ -37,8 +37,15 @@ redirects = [
|
||||||
{from = "/docs/api/features", to = "/models/#architecture", force = true},
|
{from = "/docs/api/features", to = "/models/#architecture", force = true},
|
||||||
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
|
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
|
||||||
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
||||||
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
|
{from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true},
|
||||||
{from = "/tutorials", to = "/usage/examples", force = true},
|
{from = "/tutorials", to = "/usage/examples", force = true},
|
||||||
|
# Old documentation pages (v2.x)
|
||||||
|
{from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
|
||||||
|
{from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true},
|
||||||
|
{from = "/api/goldparse", to = "/api/top-level", force = true},
|
||||||
|
{from = "/api/goldcorpus", to = "/api/corpus", force = true},
|
||||||
|
{from = "/api/annotation", to = "/api/data-formats", force = true},
|
||||||
|
{from = "/usage/examples", to = "/usage/projects", force = true},
|
||||||
# Rewrite all other docs pages to /
|
# Rewrite all other docs pages to /
|
||||||
{from = "/docs/*", to = "/:splat"},
|
{from = "/docs/*", to = "/:splat"},
|
||||||
# Updated documentation pages
|
# Updated documentation pages
|
||||||
|
@ -52,5 +59,7 @@ redirects = [
|
||||||
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
|
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
|
||||||
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
|
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
|
||||||
# Renamed universe projects
|
# Renamed universe projects
|
||||||
{from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
|
{from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true},
|
||||||
|
# Old model pages
|
||||||
|
{from = "/models/en-starters", to = "/models/en", force = true},
|
||||||
]
|
]
|
||||||
|
|
|
@ -5,8 +5,9 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=7.4.1,<7.5.0",
|
"thinc>=8.0.0,<8.1.0",
|
||||||
"blis>=0.4.0,<0.8.0",
|
"blis>=0.4.0,<0.8.0",
|
||||||
|
"pathy",
|
||||||
"numpy>=1.15.0",
|
"numpy>=1.15.0",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
|
@ -1,24 +1,31 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
|
spacy-legacy>=3.0.0,<3.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=7.4.1,<7.5.0
|
thinc>=8.0.0,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
|
ml_datasets>=0.2.0,<0.3.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.4.0,<1.1.0
|
wasabi>=0.8.1,<1.1.0
|
||||||
srsly>=1.0.2,<1.1.0
|
srsly>=2.4.0,<3.0.0
|
||||||
catalogue>=0.0.7,<1.1.0
|
catalogue>=2.0.1,<2.1.0
|
||||||
|
typer>=0.3.0,<0.4.0
|
||||||
|
pathy
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
plac>=0.9.6,<1.2.0
|
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
# Optional dependencies
|
pydantic>=1.7.1,<1.8.0
|
||||||
pyrsistent<0.17.0
|
jinja2
|
||||||
jsonschema>=2.6.0,<3.1.0
|
# Official Python utilities
|
||||||
|
setuptools
|
||||||
|
packaging>=20.0
|
||||||
|
importlib_metadata>=0.20; python_version < "3.8"
|
||||||
|
typing_extensions>=3.7.4; python_version < "3.8"
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
pytest>=4.6.5
|
pytest>=5.2.0
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.5.0,<3.6.0
|
flake8>=3.5.0,<3.6.0
|
||||||
|
hypothesis
|
||||||
|
|
71
setup.cfg
71
setup.cfg
|
@ -16,10 +16,7 @@ classifiers =
|
||||||
Operating System :: MacOS :: MacOS X
|
Operating System :: MacOS :: MacOS X
|
||||||
Operating System :: Microsoft :: Windows
|
Operating System :: Microsoft :: Windows
|
||||||
Programming Language :: Cython
|
Programming Language :: Cython
|
||||||
Programming Language :: Python :: 2
|
|
||||||
Programming Language :: Python :: 2.7
|
|
||||||
Programming Language :: Python :: 3
|
Programming Language :: Python :: 3
|
||||||
Programming Language :: Python :: 3.5
|
|
||||||
Programming Language :: Python :: 3.6
|
Programming Language :: Python :: 3.6
|
||||||
Programming Language :: Python :: 3.7
|
Programming Language :: Python :: 3.7
|
||||||
Programming Language :: Python :: 3.8
|
Programming Language :: Python :: 3.8
|
||||||
|
@ -29,9 +26,7 @@ classifiers =
|
||||||
[options]
|
[options]
|
||||||
zip_safe = false
|
zip_safe = false
|
||||||
include_package_data = true
|
include_package_data = true
|
||||||
scripts =
|
python_requires = >=3.6
|
||||||
bin/spacy
|
|
||||||
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
|
|
||||||
setup_requires =
|
setup_requires =
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
@ -39,52 +34,66 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=7.4.1,<7.5.0
|
thinc>=8.0.0,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
|
spacy-legacy>=3.0.0,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=7.4.1,<7.5.0
|
thinc>=8.0.0,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
wasabi>=0.4.0,<1.1.0
|
wasabi>=0.8.1,<1.1.0
|
||||||
srsly>=1.0.2,<1.1.0
|
srsly>=2.4.0,<3.0.0
|
||||||
catalogue>=0.0.7,<1.1.0
|
catalogue>=2.0.1,<2.1.0
|
||||||
|
typer>=0.3.0,<0.4.0
|
||||||
|
pathy
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
setuptools
|
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
plac>=0.9.6,<1.2.0
|
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pydantic>=1.7.1,<1.8.0
|
||||||
|
jinja2
|
||||||
|
# Official Python utilities
|
||||||
|
setuptools
|
||||||
|
packaging>=20.0
|
||||||
|
importlib_metadata>=0.20; python_version < "3.8"
|
||||||
|
typing_extensions>=3.7.4; python_version < "3.8"
|
||||||
|
|
||||||
|
[options.entry_points]
|
||||||
|
console_scripts =
|
||||||
|
spacy = spacy.cli:app
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=0.3.2,<0.4.0
|
spacy_lookups_data>=1.0.0,<1.1.0
|
||||||
|
transformers =
|
||||||
|
spacy_transformers>=1.0.0,<1.1.0
|
||||||
|
ray =
|
||||||
|
spacy_ray>=0.1.0,<1.0.0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4
|
cupy>=5.0.0b4,<9.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
cupy-cuda80>=5.0.0b4
|
cupy-cuda80>=5.0.0b4,<9.0.0
|
||||||
cuda90 =
|
cuda90 =
|
||||||
cupy-cuda90>=5.0.0b4
|
cupy-cuda90>=5.0.0b4,<9.0.0
|
||||||
cuda91 =
|
cuda91 =
|
||||||
cupy-cuda91>=5.0.0b4
|
cupy-cuda91>=5.0.0b4,<9.0.0
|
||||||
cuda92 =
|
cuda92 =
|
||||||
cupy-cuda92>=5.0.0b4
|
cupy-cuda92>=5.0.0b4,<9.0.0
|
||||||
cuda100 =
|
cuda100 =
|
||||||
cupy-cuda100>=5.0.0b4
|
cupy-cuda100>=5.0.0b4,<9.0.0
|
||||||
cuda101 =
|
cuda101 =
|
||||||
cupy-cuda101>=5.0.0b4
|
cupy-cuda101>=5.0.0b4,<9.0.0
|
||||||
cuda102 =
|
cuda102 =
|
||||||
cupy-cuda102>=5.0.0b4
|
cupy-cuda102>=5.0.0b4,<9.0.0
|
||||||
cuda110 =
|
cuda110 =
|
||||||
cupy-cuda110>=5.0.0b4
|
cupy-cuda110>=5.0.0b4,<9.0.0
|
||||||
cuda111 =
|
cuda111 =
|
||||||
cupy-cuda111>=5.0.0b4
|
cupy-cuda111>=5.0.0b4,<9.0.0
|
||||||
# Language tokenizers with external dependencies
|
# Language tokenizers with external dependencies
|
||||||
ja =
|
ja =
|
||||||
sudachipy>=0.4.5
|
sudachipy>=0.4.9
|
||||||
sudachidict_core>=20200330
|
sudachidict_core>=20200330
|
||||||
ko =
|
ko =
|
||||||
natto-py==0.9.0
|
natto-py==0.9.0
|
||||||
|
@ -98,7 +107,7 @@ universal = false
|
||||||
formats = gztar
|
formats = gztar
|
||||||
|
|
||||||
[flake8]
|
[flake8]
|
||||||
ignore = E203, E266, E501, E731, W503
|
ignore = E203, E266, E501, E731, W503, E741
|
||||||
max-line-length = 80
|
max-line-length = 80
|
||||||
select = B,C,E,F,W,T4,B9
|
select = B,C,E,F,W,T4,B9
|
||||||
exclude =
|
exclude =
|
||||||
|
@ -106,8 +115,12 @@ exclude =
|
||||||
.git,
|
.git,
|
||||||
__pycache__,
|
__pycache__,
|
||||||
_tokenizer_exceptions_list.py,
|
_tokenizer_exceptions_list.py,
|
||||||
spacy/__init__.py
|
|
||||||
|
|
||||||
[tool:pytest]
|
[tool:pytest]
|
||||||
markers =
|
markers =
|
||||||
slow
|
slow
|
||||||
|
|
||||||
|
[mypy]
|
||||||
|
ignore_missing_imports = True
|
||||||
|
no_implicit_optional = True
|
||||||
|
plugins = pydantic.mypy, thinc.mypy
|
||||||
|
|
227
setup.py
227
setup.py
|
@ -1,78 +1,101 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from __future__ import print_function
|
from setuptools import Extension, setup, find_packages
|
||||||
import io
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
import sys
|
||||||
import contextlib
|
import platform
|
||||||
import numpy
|
import numpy
|
||||||
from distutils.command.build_ext import build_ext
|
from distutils.command.build_ext import build_ext
|
||||||
from distutils.sysconfig import get_python_inc
|
from distutils.sysconfig import get_python_inc
|
||||||
import distutils.util
|
from pathlib import Path
|
||||||
from distutils import ccompiler, msvccompiler
|
import shutil
|
||||||
from setuptools import Extension, setup, find_packages
|
from Cython.Build import cythonize
|
||||||
|
from Cython.Compiler import Options
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
def is_new_osx():
|
ROOT = Path(__file__).parent
|
||||||
"""Check whether we're on OSX >= 10.10"""
|
PACKAGE_ROOT = ROOT / "spacy"
|
||||||
name = distutils.util.get_platform()
|
|
||||||
if sys.platform != "darwin":
|
|
||||||
return False
|
|
||||||
elif name.startswith("macosx-10"):
|
|
||||||
minor_version = int(name.split("-")[1].split(".")[1])
|
|
||||||
if minor_version >= 7:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
|
# Preserve `__doc__` on functions and classes
|
||||||
|
# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
|
||||||
|
Options.docstrings = True
|
||||||
|
|
||||||
PACKAGES = find_packages()
|
PACKAGES = find_packages()
|
||||||
|
|
||||||
|
|
||||||
MOD_NAMES = [
|
MOD_NAMES = [
|
||||||
|
"spacy.training.example",
|
||||||
"spacy.parts_of_speech",
|
"spacy.parts_of_speech",
|
||||||
"spacy.strings",
|
"spacy.strings",
|
||||||
"spacy.lexeme",
|
"spacy.lexeme",
|
||||||
"spacy.vocab",
|
"spacy.vocab",
|
||||||
"spacy.attrs",
|
"spacy.attrs",
|
||||||
"spacy.kb",
|
"spacy.kb",
|
||||||
|
"spacy.ml.parser_model",
|
||||||
"spacy.morphology",
|
"spacy.morphology",
|
||||||
"spacy.pipeline.pipes",
|
"spacy.pipeline.dep_parser",
|
||||||
"spacy.pipeline.morphologizer",
|
"spacy.pipeline.morphologizer",
|
||||||
"spacy.syntax.stateclass",
|
"spacy.pipeline.multitask",
|
||||||
"spacy.syntax._state",
|
"spacy.pipeline.ner",
|
||||||
|
"spacy.pipeline.pipe",
|
||||||
|
"spacy.pipeline.trainable_pipe",
|
||||||
|
"spacy.pipeline.sentencizer",
|
||||||
|
"spacy.pipeline.senter",
|
||||||
|
"spacy.pipeline.tagger",
|
||||||
|
"spacy.pipeline.transition_parser",
|
||||||
|
"spacy.pipeline._parser_internals.arc_eager",
|
||||||
|
"spacy.pipeline._parser_internals.ner",
|
||||||
|
"spacy.pipeline._parser_internals.nonproj",
|
||||||
|
"spacy.pipeline._parser_internals._state",
|
||||||
|
"spacy.pipeline._parser_internals.stateclass",
|
||||||
|
"spacy.pipeline._parser_internals.transition_system",
|
||||||
|
"spacy.pipeline._parser_internals._beam_utils",
|
||||||
"spacy.tokenizer",
|
"spacy.tokenizer",
|
||||||
"spacy.syntax.nn_parser",
|
"spacy.training.align",
|
||||||
"spacy.syntax._parser_model",
|
"spacy.training.gold_io",
|
||||||
"spacy.syntax._beam_utils",
|
|
||||||
"spacy.syntax.nonproj",
|
|
||||||
"spacy.syntax.transition_system",
|
|
||||||
"spacy.syntax.arc_eager",
|
|
||||||
"spacy.gold",
|
|
||||||
"spacy.tokens.doc",
|
"spacy.tokens.doc",
|
||||||
"spacy.tokens.span",
|
"spacy.tokens.span",
|
||||||
"spacy.tokens.token",
|
"spacy.tokens.token",
|
||||||
|
"spacy.tokens.span_group",
|
||||||
|
"spacy.tokens.graph",
|
||||||
"spacy.tokens.morphanalysis",
|
"spacy.tokens.morphanalysis",
|
||||||
"spacy.tokens._retokenize",
|
"spacy.tokens._retokenize",
|
||||||
"spacy.matcher.matcher",
|
"spacy.matcher.matcher",
|
||||||
"spacy.matcher.phrasematcher",
|
"spacy.matcher.phrasematcher",
|
||||||
"spacy.matcher.dependencymatcher",
|
"spacy.matcher.dependencymatcher",
|
||||||
"spacy.syntax.ner",
|
|
||||||
"spacy.symbols",
|
"spacy.symbols",
|
||||||
"spacy.vectors",
|
"spacy.vectors",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
COMPILE_OPTIONS = {
|
COMPILE_OPTIONS = {
|
||||||
"msvc": ["/Ox", "/EHsc"],
|
"msvc": ["/Ox", "/EHsc"],
|
||||||
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||||
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||||
}
|
}
|
||||||
|
LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []}
|
||||||
|
COMPILER_DIRECTIVES = {
|
||||||
|
"language_level": -3,
|
||||||
|
"embedsignature": True,
|
||||||
|
"annotation_typing": False,
|
||||||
|
}
|
||||||
|
# Files to copy into the package that are otherwise not included
|
||||||
|
COPY_FILES = {
|
||||||
|
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||||
|
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||||
|
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
|
def is_new_osx():
|
||||||
|
"""Check whether we're on OSX >= 10.7"""
|
||||||
|
if sys.platform != "darwin":
|
||||||
|
return False
|
||||||
|
mac_ver = platform.mac_ver()[0]
|
||||||
|
if mac_ver.startswith("10"):
|
||||||
|
minor_version = int(mac_ver.split(".")[1])
|
||||||
|
if minor_version >= 7:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
if is_new_osx():
|
if is_new_osx():
|
||||||
|
@ -105,20 +128,6 @@ class build_ext_subclass(build_ext, build_ext_options):
|
||||||
build_ext.build_extensions(self)
|
build_ext.build_extensions(self)
|
||||||
|
|
||||||
|
|
||||||
def generate_cython(root, source):
|
|
||||||
print("Cythonizing sources")
|
|
||||||
p = subprocess.call(
|
|
||||||
[sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
|
|
||||||
env=os.environ,
|
|
||||||
)
|
|
||||||
if p != 0:
|
|
||||||
raise RuntimeError("Running cythonize failed")
|
|
||||||
|
|
||||||
|
|
||||||
def is_source_release(path):
|
|
||||||
return os.path.exists(os.path.join(path, "PKG-INFO"))
|
|
||||||
|
|
||||||
|
|
||||||
# Include the git version in the build (adapted from NumPy)
|
# Include the git version in the build (adapted from NumPy)
|
||||||
# Copyright (c) 2005-2020, NumPy Developers.
|
# Copyright (c) 2005-2020, NumPy Developers.
|
||||||
# BSD 3-Clause license, see licenses/3rd_party_licenses.txt
|
# BSD 3-Clause license, see licenses/3rd_party_licenses.txt
|
||||||
|
@ -138,19 +147,19 @@ def write_git_info_py(filename="spacy/git_info.py"):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
git_version = "Unknown"
|
git_version = "Unknown"
|
||||||
if os.path.exists(".git"):
|
if Path(".git").exists():
|
||||||
try:
|
try:
|
||||||
out = _minimal_ext_cmd(["git", "rev-parse", "--short", "HEAD"])
|
out = _minimal_ext_cmd(["git", "rev-parse", "--short", "HEAD"])
|
||||||
git_version = out.strip().decode("ascii")
|
git_version = out.strip().decode("ascii")
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
elif os.path.exists(filename):
|
elif Path(filename).exists():
|
||||||
# must be a source distribution, use existing version file
|
# must be a source distribution, use existing version file
|
||||||
try:
|
try:
|
||||||
a = open(filename, "r")
|
a = open(filename, "r")
|
||||||
lines = a.readlines()
|
lines = a.readlines()
|
||||||
git_version = lines[-1].split('"')[1]
|
git_version = lines[-1].split('"')[1]
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
a.close()
|
a.close()
|
||||||
|
@ -161,91 +170,55 @@ GIT_VERSION = "%(git_version)s"
|
||||||
"""
|
"""
|
||||||
a = open(filename, "w")
|
a = open(filename, "w")
|
||||||
try:
|
try:
|
||||||
a.write(
|
a.write(text % {"git_version": git_version})
|
||||||
text % {"git_version": git_version,}
|
|
||||||
)
|
|
||||||
finally:
|
finally:
|
||||||
a.close()
|
a.close()
|
||||||
|
|
||||||
|
|
||||||
def clean(path):
    """Delete generated build artifacts below ``path``.

    Every regular file found recursively under ``path`` whose suffix is
    ".so", ".cpp" or ".html" is removed; the file name is printed before
    each deletion.

    path (Path): Root directory to clean.
    """
    removable_suffixes = (".so", ".cpp", ".html")
    # A separate loop name keeps the ``path`` argument intact while iterating.
    for candidate in path.glob("**/*"):
        if not candidate.is_file():
            continue
        if candidate.suffix in removable_suffixes:
            print(f"Deleting {candidate.name}")
            candidate.unlink()
|
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
def chdir(new_dir):
    """Temporarily change the working directory and prepend it to sys.path.

    On exit (normal or via an exception) the previous working directory is
    restored and the sys.path entry added here is removed again.

    new_dir: Directory to switch into for the duration of the ``with`` body.
    YIELDS: Nothing.
    RAISES (OSError): If ``new_dir`` cannot be entered. In that case neither
        the working directory nor sys.path has been modified.
    """
    old_dir = os.getcwd()
    # Acquire before entering the try block: if os.chdir() fails, nothing was
    # changed and nothing must be undone. Previously the cleanup still ran and
    # deleted an unrelated first entry of sys.path.
    os.chdir(new_dir)
    sys.path.insert(0, new_dir)
    try:
        yield
    finally:
        # Remove our entry by value rather than by position, so that entries
        # the body itself pushed onto the front of sys.path are left alone.
        try:
            sys.path.remove(new_dir)
        except ValueError:
            # The body already removed it; nothing left to undo.
            pass
        os.chdir(old_dir)
|
|
||||||
|
|
||||||
|
|
||||||
def setup_package():
    """Entry point of the build script: prepare sources and call ``setup()``.

    Steps, in order:
    1. Write the git info module via ``write_git_info_py()``.
    2. If the first command-line argument is "clean", delegate to
       ``clean(PACKAGE_ROOT)`` and return its result without building.
    3. Execute ``about.py`` from the package root to obtain ``__version__``.
    4. Copy every existing file listed in ``COPY_FILES`` to its target.
    5. Declare one C++ extension per entry of ``MOD_NAMES``, cythonize them
       and hand everything to ``setup()``.
    """
    write_git_info_py()
    if len(sys.argv) > 1 and sys.argv[1] == "clean":
        return clean(PACKAGE_ROOT)

    # Read with an explicit encoding so the result does not depend on the
    # locale of the build machine.
    with (PACKAGE_ROOT / "about.py").open("r", encoding="utf8") as f:
        about = {}
        exec(f.read(), about)

    for copy_file, target_dir in COPY_FILES.items():
        if copy_file.exists():
            shutil.copy(str(copy_file), str(target_dir))
            print(f"Copied {copy_file} -> {target_dir}")

    include_dirs = [
        numpy.get_include(),
        get_python_inc(plat_specific=True),
    ]
    ext_modules = []
    for name in MOD_NAMES:
        # Dotted module name -> relative path of its Cython source file.
        mod_path = name.replace(".", "/") + ".pyx"
        ext = Extension(
            name, [mod_path], language="c++", extra_compile_args=["-std=c++11"]
        )
        ext_modules.append(ext)
    print("Cythonizing sources")
    ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)

    setup(
        name="spacy",
        packages=PACKAGES,
        version=about["__version__"],
        ext_modules=ext_modules,
        cmdclass={"build_ext": build_ext_subclass},
        include_dirs=include_dirs,
        package_data={"": ["*.pyx", "*.pxd", "*.pxi"]},
    )
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,39 +1,68 @@
|
||||||
# coding: utf8
|
from typing import Union, Iterable, Dict, Any
|
||||||
from __future__ import unicode_literals
|
from pathlib import Path
|
||||||
import warnings
|
import warnings
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
|
warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa
|
||||||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa
|
||||||
|
|
||||||
# These are imported as part of the API
|
# These are imported as part of the API
|
||||||
from thinc.neural.util import prefer_gpu, require_gpu
|
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from . import pipeline
|
from . import pipeline # noqa: F401
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info # noqa: F401
|
||||||
from .glossary import explain
|
from .glossary import explain # noqa: F401
|
||||||
from .about import __version__
|
from .about import __version__ # noqa: F401
|
||||||
from .errors import Errors, Warnings
|
from .util import registry, logger # noqa: F401
|
||||||
|
|
||||||
|
from .errors import Errors
|
||||||
|
from .language import Language
|
||||||
|
from .vocab import Vocab
|
||||||
from . import util
|
from . import util
|
||||||
from .util import registry
|
|
||||||
from .language import component
|
|
||||||
|
|
||||||
|
|
||||||
if sys.maxunicode == 65535:
|
if sys.maxunicode == 65535:
|
||||||
raise SystemError(Errors.E130)
|
raise SystemError(Errors.E130)
|
||||||
|
|
||||||
|
|
||||||
def load(
    name: Union[str, Path],
    disable: Iterable[str] = util.SimpleFrozenList(),
    exclude: Iterable[str] = util.SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
    """Load a spaCy model from an installed package or a local path.

    All arguments are passed on unchanged to ``util.load_model``.

    name (str / Path): Package name or model path.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes are loaded but not run unless they are explicitly enabled by
        calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components are not loaded.
    config (Dict[str, Any] / Config): Config overrides, either as a nested
        dict or as a dict keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    nlp = util.load_model(name, disable=disable, exclude=exclude, config=config)
    return nlp
|
||||||
|
|
||||||
|
|
||||||
def blank(
    name: str,
    *,
    vocab: Union[Vocab, bool] = True,
    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
    meta: Dict[str, Any] = util.SimpleFrozenDict()
) -> Language:
    """Create a blank nlp object for a given language code.

    name (str): The language code, e.g. "en".
    vocab (Vocab / bool): A Vocab object. If True, a vocab is created.
    config (Dict[str, Any] / Config): Optional config overrides, as a nested
        dict or a dict keyed by dot notation.
    meta (Dict[str, Any]): Overrides for nlp.meta.
    RETURNS (Language): The nlp object.
    """
    LangClass = util.get_lang_class(name)
    # We should accept both dot notation and nested dict here for consistency
    config = util.dot_to_dict(config)
    # Forward ``vocab`` as well: it was accepted and documented but silently
    # dropped, so a caller-supplied Vocab never reached the new pipeline.
    return LangClass.from_config(config, vocab=vocab, meta=meta)
|
||||||
def info(model=None, markdown=False, silent=False):
    """Package-level shortcut for the CLI ``info`` function.

    The three arguments are handed to ``cli_info`` positionally, in the same
    order, and its return value is passed back unchanged.

    model: Forwarded as the first argument of ``cli_info``.
    markdown: Forwarded as the second argument of ``cli_info``.
    silent: Forwarded as the third argument of ``cli_info``.
    RETURNS: Whatever ``cli_info`` returns.
    """
    result = cli_info(model, markdown, silent)
    return result
|
|
||||||
|
|
|
@ -1,36 +1,4 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
# NB! This breaks in plac on Python 2!!
|
|
||||||
# from __future__ import unicode_literals
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import plac
|
from spacy.cli import setup_cli
|
||||||
import sys
|
|
||||||
from wasabi import msg
|
|
||||||
from spacy.cli import download, link, info, package, train, pretrain, convert
|
|
||||||
from spacy.cli import init_model, profile, evaluate, validate, debug_data
|
|
||||||
|
|
||||||
commands = {
|
setup_cli()
|
||||||
"download": download,
|
|
||||||
"link": link,
|
|
||||||
"info": info,
|
|
||||||
"train": train,
|
|
||||||
"pretrain": pretrain,
|
|
||||||
"debug-data": debug_data,
|
|
||||||
"evaluate": evaluate,
|
|
||||||
"convert": convert,
|
|
||||||
"package": package,
|
|
||||||
"init-model": init_model,
|
|
||||||
"profile": profile,
|
|
||||||
"validate": validate,
|
|
||||||
}
|
|
||||||
if len(sys.argv) == 1:
|
|
||||||
msg.info("Available commands", ", ".join(commands), exits=1)
|
|
||||||
command = sys.argv.pop(1)
|
|
||||||
sys.argv[0] = "spacy %s" % command
|
|
||||||
if command in commands:
|
|
||||||
plac.call(commands[command], sys.argv[1:])
|
|
||||||
else:
|
|
||||||
available = "Available: {}".format(", ".join(commands))
|
|
||||||
msg.fail("Unknown command: {}".format(command), available, exits=1)
|
|
||||||
|
|
1004
spacy/_ml.py
1004
spacy/_ml.py
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,7 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "2.3.5"
|
__version__ = "3.0.0"
|
||||||
__release__ = True
|
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
__projects_branch__ = "v3"
|
||||||
|
|
|
@ -1,181 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
from collections import OrderedDict
|
|
||||||
from wasabi import Printer
|
|
||||||
|
|
||||||
from .tokens import Doc, Token, Span
|
|
||||||
from .errors import Errors, Warnings
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(pipeline, name, pipe, index, warn=True):
    """Check one pipeline component against the components that precede it.

    A requirement (an entry of the component's "requires" attribute) counts as
    fulfilled if any earlier component lists it in its "assigns" attribute.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    name (unicode): The name of the pipeline component to analyze.
    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (list): The unfulfilled requirements of the component, each
        listed once, in the order they are declared.
    """
    assert pipeline[index][0] == name
    # De-duplicate the requirements while keeping their declared order.
    required = OrderedDict.fromkeys(getattr(pipe, "requires", []))
    assigned = set()
    if required:
        # Only components placed before this one can satisfy a requirement.
        for _, earlier_pipe in pipeline[:index]:
            assigned.update(getattr(earlier_pipe, "assigns", []))
    problems = [annot for annot in required if annot not in assigned]
    if warn:
        for annot in problems:
            warnings.warn(Warnings.W025.format(name=name, attr=annot))
    return problems
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_all_pipes(pipeline, warn=True):
    """Run ``analyze_pipes`` for every component of the pipeline, in order.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (dict): The problems found, keyed by component name.
    """
    return {
        name: analyze_pipes(pipeline, name, pipe, position, warn=warn)
        for position, (name, pipe) in enumerate(pipeline)
    }
|
|
||||||
|
|
||||||
|
|
||||||
def dot_to_dict(values):
    """Expand dot-notation strings into a nested dict of flags.

    Each value is lowercased and split on "."; every segment but the last
    becomes a nested dict and the last segment is set to True. For example,
    ["token.pos", "token._.xyz"] becomes
    {"token": {"pos": True, "_": {"xyz": True}}}.

    values (iterable): The values to convert.
    RETURNS (dict): The converted values.
    """
    result = {}
    for value in values:
        segments = value.lower().split(".")
        node = result
        # Descend through (and create as needed) the intermediate levels.
        for key in segments[:-1]:
            node = node.setdefault(key, {})
        # Mark the leaf; an entry that already exists is left untouched.
        node.setdefault(segments[-1], True)
    return result
|
|
||||||
|
|
||||||
|
|
||||||
def validate_attrs(values):
    """Validate component attributes provided to "assigns", "requires" etc.
    Raises error for invalid attributes and formatting. Doesn't check if
    custom extension attributes are registered, since this is something the
    user might want to do themselves later in the component.

    The values are first converted with dot_to_dict (which lowercases them),
    then each top-level object key and its attributes are checked in turn.

    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
    RETURNS (iterable): The checked attributes (the input, unchanged).
    RAISES (ValueError): On the first invalid attribute that is found.
    """
    data = dot_to_dict(values)
    objs = {"doc": Doc, "token": Token, "span": Span}
    for obj_key, attrs in data.items():
        if obj_key == "span":
            # Support Span only for custom extension attributes
            span_attrs = [attr for attr in values if attr.startswith("span.")]
            span_attrs = [attr for attr in span_attrs if not attr.startswith("span._.")]
            if span_attrs:
                raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
        if obj_key not in objs:  # first element is not doc/token/span
            invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
            raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
        if not isinstance(attrs, dict):  # attr is something like "doc"
            raise ValueError(Errors.E182.format(attr=obj_key))
        for attr, value in attrs.items():
            if attr == "_":
                if value is True:  # attr is something like "doc._"
                    raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
                for ext_attr, ext_value in value.items():
                    # We don't check whether the attribute actually exists
                    if ext_value is not True:  # attr is something like doc._.x.y
                        good = "{}._.{}".format(obj_key, ext_attr)
                        bad = "{}.{}".format(good, ".".join(ext_value))
                        raise ValueError(Errors.E183.format(attr=bad, solution=good))
                continue  # we can't validate those further
            if attr.endswith("_"):  # attr is something like "token.pos_"
                raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
            if value is not True:  # attr is something like doc.x.y
                good = "{}.{}".format(obj_key, attr)
                bad = "{}.{}".format(good, ".".join(value))
                raise ValueError(Errors.E183.format(attr=bad, solution=good))
            # Built-in attributes must exist on the corresponding class.
            obj = objs[obj_key]
            if not hasattr(obj, attr):
                raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
    return values
|
|
||||||
|
|
||||||
|
|
||||||
def _get_feature_for_attr(pipeline, attr, feature):
    """Select the components whose ``feature`` list mentions ``attr``.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to look for, e.g. "doc.tensor".
    feature (unicode): Which component attribute to inspect, either
        "assigns" or "requires". Components lacking it are treated as if it
        were empty.
    RETURNS (list): (name, pipe) tuples of the matching components, in
        pipeline order.
    """
    assert feature in ["assigns", "requires"]
    return [
        (pipe_name, pipe)
        for pipe_name, pipe in pipeline
        if attr in getattr(pipe, feature, [])
    ]
|
|
||||||
|
|
||||||
|
|
||||||
def get_assigns_for_attr(pipeline, attr):
    """Find the pipeline components that assign an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that assign the attr.
    """
    assigning = _get_feature_for_attr(pipeline, attr, "assigns")
    return assigning
|
|
||||||
|
|
||||||
|
|
||||||
def get_requires_for_attr(pipeline, attr):
    """Find the pipeline components that require an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that require the attr.
    """
    requiring = _get_feature_for_attr(pipeline, attr, "requires")
    return requiring
|
|
||||||
|
|
||||||
|
|
||||||
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary of the current nlp object's pipeline.

    Shows a table of the pipeline components with what they require and
    assign and whether they retokenize, followed by any unmet requirements.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems" if ``no_print`` is
        set; otherwise nothing is returned.
    """
    printer = Printer(pretty=pretty, no_print=no_print)
    rows = []
    unmet = {}
    for position, (name, pipe) in enumerate(nlp.pipeline):
        rows.append(
            (
                position,
                name,
                getattr(pipe, "requires", []),
                getattr(pipe, "assigns", []),
                getattr(pipe, "retokenizes", False),
            )
        )
        # Collect problems silently; they are reported in their own section.
        unmet[name] = analyze_pipes(nlp.pipeline, name, pipe, position, warn=False)
    printer.divider("Pipeline Overview")
    printer.table(
        rows,
        header=("#", "Component", "Requires", "Assigns", "Retokenizes"),
        divider=True,
        multiline=True,
    )
    n_problems = sum(len(missing) for missing in unmet.values())
    if not any(unmet.values()):
        printer.good("No problems found.")
    else:
        printer.divider("Problems ({})".format(n_problems))
        for name, missing in unmet.items():
            if missing:
                printer.warn(
                    "'{}' requirements not met: {}".format(name, ", ".join(missing))
                )
    if no_print:
        return {"overview": rows, "problems": unmet}
|
|
|
@ -91,6 +91,7 @@ cdef enum attr_id_t:
|
||||||
|
|
||||||
LANG
|
LANG
|
||||||
ENT_KB_ID = symbols.ENT_KB_ID
|
ENT_KB_ID = symbols.ENT_KB_ID
|
||||||
|
MORPH
|
||||||
ENT_ID = symbols.ENT_ID
|
ENT_ID = symbols.ENT_ID
|
||||||
|
|
||||||
IDX
|
IDX
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
"": NULL_ATTR,
|
"": NULL_ATTR,
|
||||||
|
@ -92,6 +89,7 @@ IDS = {
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"PROB": PROB,
|
"PROB": PROB,
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
|
"MORPH": MORPH,
|
||||||
"IDX": IDX
|
"IDX": IDX
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,37 @@
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
from ._util import app, setup_cli # noqa: F401
|
||||||
|
|
||||||
|
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||||
|
# are registered automatically and won't have to be imported here.
|
||||||
from .download import download # noqa: F401
|
from .download import download # noqa: F401
|
||||||
from .info import info # noqa: F401
|
from .info import info # noqa: F401
|
||||||
from .link import link # noqa: F401
|
|
||||||
from .package import package # noqa: F401
|
from .package import package # noqa: F401
|
||||||
from .profile import profile # noqa: F401
|
from .profile import profile # noqa: F401
|
||||||
from .train import train # noqa: F401
|
from .train import train_cli # noqa: F401
|
||||||
from .pretrain import pretrain # noqa: F401
|
from .pretrain import pretrain # noqa: F401
|
||||||
from .debug_data import debug_data # noqa: F401
|
from .debug_data import debug_data # noqa: F401
|
||||||
|
from .debug_config import debug_config # noqa: F401
|
||||||
|
from .debug_model import debug_model # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
|
from .init_config import init_config, fill_config # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
|
from .project.clone import project_clone # noqa: F401
|
||||||
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
from .project.run import project_run # noqa: F401
|
||||||
|
from .project.dvc import project_update_dvc # noqa: F401
|
||||||
|
from .project.push import project_push # noqa: F401
|
||||||
|
from .project.pull import project_pull # noqa: F401
|
||||||
|
from .project.document import project_document # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
def link(*args, **kwargs):
    """As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
    pipeline packages using their full names or from a directory path."""
    # Placeholder for the removed command: any arguments are accepted and
    # ignored, and the only effect is the deprecation warning below.
    deprecation_notice = (
        "As of spaCy v3.0, model symlinks are deprecated. You can load trained "
        "pipeline packages using their full names or from a directory path."
    )
    msg.warn(deprecation_notice)
|
||||||
|
|
|
@ -1,220 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
# NB: This schema describes the new format of the training data, see #2928
|
|
||||||
# JSON schema (draft-06) for a list of training examples. Each example needs a
# "text"; entity spans, sentence spans, text categories, tokens and a custom
# "_" user space are optional.
TRAINING_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "title": "Training data for spaCy models",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "text": {
                "title": "The text of the training example",
                "type": "string",
                "minLength": 1,
            },
            "ents": {
                "title": "Named entity spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "label": {
                            "title": "Entity label",
                            "type": "string",
                            "minLength": 1,
                            "pattern": "^[A-Z0-9]*$",
                        },
                    },
                    "required": ["start", "end", "label"],
                },
            },
            "sents": {
                "title": "Sentence spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "cats": {
                "title": "Text categories for the text classifier",
                "type": "object",
                # patternProperties keys are regular expressions; a bare "*"
                # is not a valid pattern, ".*" matches every property name.
                "patternProperties": {
                    ".*": {
                        "title": "A text category",
                        "oneOf": [
                            {"type": "boolean"},
                            {"type": "number", "minimum": 0},
                        ],
                    }
                },
                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
            },
            "tokens": {
                "title": "The tokens in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "minProperties": 1,
                    "properties": {
                        "id": {
                            "title": "Token ID, usually token index",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "start": {
                            "title": "Start character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "pos": {
                            "title": "Coarse-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "tag": {
                            "title": "Fine-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "dep": {
                            "title": "Dependency label",
                            "type": "string",
                            "minLength": 1,
                        },
                        "head": {
                            "title": "Index of the token's head",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "_": {"title": "Custom user space", "type": "object"},
        },
        "required": ["text"],
    },
}
|
|
||||||
|
|
||||||
META_SCHEMA = {
|
|
||||||
"$schema": "http://json-schema.org/draft-06/schema",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"lang": {
|
|
||||||
"title": "Two-letter language code, e.g. 'en'",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 2,
|
|
||||||
"maxLength": 2,
|
|
||||||
"pattern": "^[a-z]*$",
|
|
||||||
},
|
|
||||||
"name": {
|
|
||||||
"title": "Model name",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 1,
|
|
||||||
"pattern": "^[a-z_]*$",
|
|
||||||
},
|
|
||||||
"version": {
|
|
||||||
"title": "Model version",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 1,
|
|
||||||
"pattern": "^[0-9a-z.-]*$",
|
|
||||||
},
|
|
||||||
"spacy_version": {
|
|
||||||
"title": "Compatible spaCy version identifier",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 1,
|
|
||||||
"pattern": "^[0-9a-z.-><=]*$",
|
|
||||||
},
|
|
||||||
"parent_package": {
|
|
||||||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 1,
|
|
||||||
"default": "spacy",
|
|
||||||
},
|
|
||||||
"pipeline": {
|
|
||||||
"title": "Names of pipeline components",
|
|
||||||
"type": "array",
|
|
||||||
"items": {"type": "string", "minLength": 1},
|
|
||||||
},
|
|
||||||
"description": {"title": "Model description", "type": "string"},
|
|
||||||
"license": {"title": "Model license", "type": "string"},
|
|
||||||
"author": {"title": "Model author name", "type": "string"},
|
|
||||||
"email": {"title": "Model author email", "type": "string", "format": "email"},
|
|
||||||
"url": {"title": "Model author URL", "type": "string", "format": "uri"},
|
|
||||||
"sources": {
|
|
||||||
"title": "Training data sources",
|
|
||||||
"type": "array",
|
|
||||||
"items": {"type": "string"},
|
|
||||||
},
|
|
||||||
"vectors": {
|
|
||||||
"title": "Included word vectors",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"keys": {
|
|
||||||
"title": "Number of unique keys",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 0,
|
|
||||||
},
|
|
||||||
"vectors": {
|
|
||||||
"title": "Number of unique vectors",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 0,
|
|
||||||
},
|
|
||||||
"width": {
|
|
||||||
"title": "Number of dimensions",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 0,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"accuracy": {
|
|
||||||
"title": "Accuracy numbers",
|
|
||||||
"type": "object",
|
|
||||||
"patternProperties": {"*": {"type": "number", "minimum": 0.0}},
|
|
||||||
},
|
|
||||||
"speed": {
|
|
||||||
"title": "Speed evaluation numbers",
|
|
||||||
"type": "object",
|
|
||||||
"patternProperties": {
|
|
||||||
"*": {
|
|
||||||
"oneOf": [
|
|
||||||
{"type": "number", "minimum": 0.0},
|
|
||||||
{"type": "integer", "minimum": 0},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"required": ["lang", "name", "version"],
|
|
||||||
}
|
|
489
spacy/cli/_util.py
Normal file
489
spacy/cli/_util.py
Normal file
|
@ -0,0 +1,489 @@
|
||||||
|
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import srsly
|
||||||
|
import hashlib
|
||||||
|
import typer
|
||||||
|
from click import NoSuchOption
|
||||||
|
from click.parser import split_arg_string
|
||||||
|
from typer.main import get_command
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||||
|
from configparser import InterpolationError
|
||||||
|
import os
|
||||||
|
|
||||||
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
|
from ..util import is_compatible_version, ENV_VARS
|
||||||
|
from .. import about
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathy import Pathy # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
SDIST_SUFFIX = ".tar.gz"
|
||||||
|
WHEEL_SUFFIX = "-py3-none-any.whl"
|
||||||
|
|
||||||
|
PROJECT_FILE = "project.yml"
|
||||||
|
PROJECT_LOCK = "project.lock"
|
||||||
|
COMMAND = "python -m spacy"
|
||||||
|
NAME = "spacy"
|
||||||
|
HELP = """spaCy Command-line Interface
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/cli
|
||||||
|
"""
|
||||||
|
PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
|
||||||
|
You'd typically start by cloning a project template to a local directory and
|
||||||
|
fetching its assets like datasets etc. See the project's {PROJECT_FILE} for the
|
||||||
|
available commands.
|
||||||
|
"""
|
||||||
|
DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
||||||
|
commands to check and validate your config files, training and evaluation data,
|
||||||
|
and custom model implementations.
|
||||||
|
"""
|
||||||
|
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||||
|
|
||||||
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
|
# keep the names short, but not needed at the moment.
|
||||||
|
Arg = typer.Argument
|
||||||
|
Opt = typer.Option
|
||||||
|
|
||||||
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
|
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
|
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||||
|
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||||
|
|
||||||
|
app.add_typer(project_cli)
|
||||||
|
app.add_typer(debug_cli)
|
||||||
|
app.add_typer(init_cli)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_cli() -> None:
    """Run the spaCy command-line app under the "python -m spacy" prompt."""
    # Resolving the CLI registry makes sure the CLI entry points run, so that
    # the commands they provide get imported.
    registry.cli.get_all()
    # Passing prog_name ensures that the help messages always display the
    # correct prompt.
    get_command(app)(prog_name=COMMAND)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_config_overrides(
    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
) -> Dict[str, Any]:
    """Build a dictionary of config overrides from the extra CLI arguments,
    e.g. --training.batch_size to override "training.batch_size", and from an
    optional environment variable. Arguments without a "." are considered
    invalid, since the config only allows top-level sections to exist. If a
    setting is defined in both places, the value from the environment
    variable is the one that ends up in the result.

    args (List[str]): The extra arguments provided on the CLI.
    env_var (Optional[str]): Optional environment variable to read from.
    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
    """
    raw_env = ""
    if env_var:
        raw_env = os.environ.get(env_var, "")
    from_env = _parse_overrides(split_arg_string(raw_env))
    from_cli = _parse_overrides(args, is_cli=True)
    if from_cli:
        # Only report the CLI settings that aren't replaced by the env values
        cli_only = [name for name in from_cli if name not in from_env]
        logger.debug(f"Config overrides from CLI: {cli_only}")
    if from_env:
        logger.debug(f"Config overrides from env variables: {list(from_env)}")
    merged = dict(from_cli)
    merged.update(from_env)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
|
||||||
|
result = {}
|
||||||
|
while args:
|
||||||
|
opt = args.pop(0)
|
||||||
|
err = f"Invalid config override '{opt}'"
|
||||||
|
if opt.startswith("--"): # new argument
|
||||||
|
orig_opt = opt
|
||||||
|
opt = opt.replace("--", "")
|
||||||
|
if "." not in opt:
|
||||||
|
if is_cli:
|
||||||
|
raise NoSuchOption(orig_opt)
|
||||||
|
else:
|
||||||
|
msg.fail(f"{err}: can't override top-level sections", exits=1)
|
||||||
|
if "=" in opt: # we have --opt=value
|
||||||
|
opt, value = opt.split("=", 1)
|
||||||
|
opt = opt.replace("-", "_")
|
||||||
|
else:
|
||||||
|
if not args or args[0].startswith("--"): # flag with no value
|
||||||
|
value = "true"
|
||||||
|
else:
|
||||||
|
value = args.pop(0)
|
||||||
|
# Just like we do in the config, we're calling json.loads on the
|
||||||
|
# values. But since they come from the CLI, it'd be unintuitive to
|
||||||
|
# explicitly mark strings with escaped quotes. So we're working
|
||||||
|
# around that here by falling back to a string if parsing fails.
|
||||||
|
# TODO: improve logic to handle simple types like list of strings?
|
||||||
|
try:
|
||||||
|
result[opt] = srsly.json_loads(value)
|
||||||
|
except ValueError:
|
||||||
|
result[opt] = str(value)
|
||||||
|
else:
|
||||||
|
msg.fail(f"{err}: name should start with --", exits=1)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
    """Read the project.yml of a project directory and validate it against
    the schema, the spaCy version and the command/workflow rules. Directories
    listed in the config are created if they don't exist yet.

    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    schema_errors = validate(ProjectConfigSchema, config)
    if schema_errors:
        msg.fail(invalid_err)
        print("\n".join(schema_errors))
        sys.exit(1)
    validate_project_version(config)
    validate_project_commands(config)
    # Create the directories defined in the config that are still missing
    for subdir in config.get("directories", []):
        target = path / subdir
        if target.exists():
            continue
        target.mkdir(parents=True)
    if not interpolate:
        return config
    with show_validation_error(title="project.yml validation error", hint_fill=False):
        config = substitute_project_variables(config)
    return config
|
||||||
|
|
||||||
|
|
||||||
|
def substitute_project_variables(
    config: Dict[str, Any], overrides: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Interpolate the variables of a project config.

    Note that the given config is updated in place: a "vars" section is added
    if it's missing and the overrides are written into it.

    config (Dict[str, Any]): The project config.
    overrides (Optional[Dict[str, Any]]): Values to set in the "vars" section
        before interpolating. Defaults to no overrides.
    RETURNS (Dict[str, Any]): The interpolated project config.
    """
    key = "vars"
    config.setdefault(key, {})
    if overrides:
        config[key].update(overrides)
    # Need to put variables in the top scope again so we can have a top-level
    # section "project" (otherwise, a list of commands in the top scope
    # wouldn't be allowed by Thinc's config system)
    cfg = Config({"project": config, key: config[key]})
    interpolated = cfg.interpolate()
    return dict(interpolated["project"])
|
||||||
|
|
||||||
|
|
||||||
|
def validate_project_version(config: Dict[str, Any]) -> None:
    """If the project defines a compatible spaCy version range, check that
    it's compatible with the current version of spaCy. Prints an error and
    exits if it's not.

    config (Dict[str, Any]): The loaded config.
    """
    spacy_version = config.get("spacy_version", None)
    if not spacy_version:
        # No version requirement, nothing to check
        return
    if is_compatible_version(about.__version__, spacy_version):
        return
    err = (
        f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
        f"that's not compatible with the version of spaCy you're running "
        f"({about.__version__}). You can edit version requirement in the "
        f"{PROJECT_FILE} to load it, but the project may not run as expected."
    )
    msg.fail(err, exits=1)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist. Prints an
    error and exits on the first problem found.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Collect duplicates in a single pass and in order of appearance, so the
    # error message is the same on every run.
    known = set()
    duplicates = []
    for name in command_names:
        if name in known and name not in duplicates:
            duplicates.append(name)
        known.add(name)
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in known:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in known:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
|
||||||
|
|
||||||
|
|
||||||
|
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Compute the MD5 hex digest of a JSON-serializable object, based on its
    JSON dump with sorted keys.

    data: The data to hash.
    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
    RETURNS (str): The hash.
    """
    payload = data
    if isinstance(payload, dict):
        payload = {key: value for key, value in payload.items() if key not in exclude}
    serialized = srsly.json_dumps(payload, sort_keys=True)
    digest = hashlib.md5(serialized.encode("utf8"))
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_checksum(path: Union[Path, str]) -> str:
    """Compute the MD5 checksum of a file, or of the contents of all files
    below a directory (visited in sorted path order). Prints an error and
    exits if the path is neither a file nor a directory.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    path = Path(path)
    if path.is_file():
        return hashlib.md5(path.read_bytes()).hexdigest()
    if path.is_dir():
        # TODO: this is currently pretty slow
        files = [fp for fp in path.rglob("*") if fp.is_file()]
        files.sort()
        hasher = hashlib.md5()
        for file_path in files:
            hasher.update(file_path.read_bytes())
        return hasher.hexdigest()
    msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: Optional[str] = None,
    desc: str = "",
    show_config: Optional[bool] = None,
    hint_fill: bool = True,
):
    """Context manager that catches config validation and interpolation
    errors raised in its body, prints them in a custom format and exits with
    code 1.

    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Override title of custom formatted error.
    desc (str): Override description of custom formatted error.
    show_config (bool): Whether to output the config the error refers to.
    hint_fill (bool): Show hint about filling config.
    """
    try:
        yield
    except ConfigValidationError as e:
        if title is None:
            title = e.title
        if e.desc:
            desc = f"{e.desc}\n\n{desc}" if desc else f"{e.desc}"
        # Re-generate a new error object with overrides. The title is blanked
        # because it's printed separately via msg.fail.
        err = e.from_error(e, title="", desc=desc, show_config=show_config)
        msg.fail(title)
        print(err.text.strip())
        if hint_fill and "value_error.missing" in err.error_types:
            # Without a usable file path, fall back to a generic file name
            use_default = file_path is None or str(file_path) == "-"
            config_path = "config.cfg" if use_default else file_path
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "fill-config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
        sys.exit(1)
    except InterpolationError as e:
        msg.fail("Config validation error", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
|
def import_code(code_path: Optional[Union[Path, str]]) -> None:
    """Import the Python file passed to training commands / commands using
    the config, so that custom registered functions become available. Does
    nothing if no path is given. Prints an error and exits if the file is
    missing or can't be loaded.
    """
    if code_path is None:
        return
    if not Path(code_path).exists():
        msg.fail("Path to Python code not found", code_path, exits=1)
    try:
        import_file("python_code", code_path)
    except Exception as e:
        msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
|
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
    """Upload a file using smart_open.

    src (Path): The source path.
    dest (Union[str, Pathy]): The destination URL or path to upload to.
    """
    import smart_open

    dest = str(dest)
    with smart_open.open(dest, mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            # Copy in chunks instead of loading the whole file into memory
            shutil.copyfileobj(input_file, output_file)
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
    """Download a file using smart_open.

    src (Union[str, Pathy]): The URL or path of the file.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    if dest.exists() and not force:
        return None
    # Only import smart_open once we know there's something to download
    import smart_open

    src = str(src)
    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
        with dest.open(mode="wb") as output_file:
            # Copy in chunks instead of loading the whole file into memory
            shutil.copyfileobj(input_file, output_file)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_pathy(path):
    """Wrap a path in a Pathy object. Temporary helper that keeps the pathy
    import local to prevent importing Pathy globally (which can cause slow
    and annoying Google Cloud warning)."""
    import pathy

    return pathy.Pathy(path)
|
||||||
|
|
||||||
|
|
||||||
|
def git_checkout(
    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
):
    """Copy a subdirectory of a Git repo to a destination path that doesn't
    exist yet. Uses a sparse checkout if requested and supported by the
    installed Git version. Otherwise the branch is cloned into a temporary
    directory and the subdirectory is copied from there.

    repo (str): The repo to clone.
    subpath (str): The path within the repo to check out.
    dest (Path): The destination. Must not exist, but its parent must.
    branch (str): The branch to clone.
    sparse (bool): Whether to attempt a sparse checkout.
    """
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse:
        if git_version >= (2, 22):
            return git_sparse_checkout(repo, subpath, dest, branch)
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        if git_version == (0, 0):
            reason = "You're running an unknown version of Git, so sparse checkout has been disabled."
        else:
            reason = (
                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
                f"that doesn't fully support sparse checkout yet."
            )
        msg.warn(
            f"{reason} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )
    with make_tempdir() as tmp_dir:
        run_command(f"git -C {tmp_dir} clone {repo} . -b {branch}", capture=True)
        # We need Path(name) to make sure we also support subdirectories
        source = str(tmp_dir / Path(subpath))
        try:
            shutil.copytree(source, str(dest))
        except FileNotFoundError:
            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
            msg.fail(err, repo, exits=1)
|
||||||
|
|
||||||
|
|
||||||
|
def git_sparse_checkout(repo, subpath, dest, branch):
    """Fetch only the files below a subpath of a repo and move them to dest.

    The repo is cloned into a temporary directory without checking out or
    downloading any file contents. The objects missing for the subpath are
    then fetched in bulk, the subpath is checked out and moved to dest.
    Prints an error and exits if no objects are found for the subpath.

    repo: The repo to clone.
    subpath: The path within the repo to check out.
    dest: The destination the checked out subpath is moved to.
    branch: The branch to clone and check out.
    """
    # We're using Git, partial clone and sparse checkout to
    # only clone the files we need
    # This ends up being RIDICULOUS. omg.
    # So, every tutorial and SO post talks about 'sparse checkout'...But they
    # go and *clone* the whole repo. Worthless. And cloning part of a repo
    # turns out to be completely broken. The only way to specify a "path" is..
    # a path *on the server*? The contents of which, specifies the paths. Wat.
    # Obviously this is hopelessly broken and insecure, because you can query
    # arbitrary paths on the server! So nobody enables this.
    # What we have to do is disable *all* files. We could then just checkout
    # the path, and it'd "work", but be hopelessly slow...Because it goes and
    # transfers every missing object one-by-one. So the final piece is that we
    # need to use some weird git internals to fetch the missings in bulk, and
    # *that* we can do by path.
    with make_tempdir() as tmp_dir:
        # This is the "clone, but don't download anything" part.
        cmd = (
            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
            f"-b {branch} --filter=blob:none"
        )
        run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        ret = run_command(cmd, capture=True)
        git_repo = _http_to_git(repo)
        # Now pass those missings into another bit of git internals. The
        # entries we want are the ones printed with a leading "?", which is
        # stripped off here.
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
        if not missings:
            err = (
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?"
            )
            msg.fail(err, exits=1)
        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
        run_command(cmd, capture=True)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
        run_command(cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||||
|
|
||||||
|
|
||||||
|
def get_git_version(
    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
    """Get the version of git by calling 'git --version'.

    error (str): The error message to show.
        NOTE(review): this message isn't used by the code below. A failure
        to run the command surfaces however run_command reports it.
    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
        (0, 0) if the version couldn't be determined.
    """
    ret = run_command("git --version", capture=True)
    stdout = ret.stdout.strip()
    if not stdout or not stdout.startswith("git version"):
        return (0, 0)
    # Everything after the "git version" prefix, e.g. "2.30.1"
    version = stdout[11:].strip().split(".")
    try:
        return (int(version[0]), int(version[1]))
    except (IndexError, ValueError):
        # Unexpected format, e.g. no minor version or non-numeric parts
        return (0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def _http_to_git(repo: str) -> str:
|
||||||
|
if repo.startswith("http://"):
|
||||||
|
repo = repo.replace(r"http://", r"https://")
|
||||||
|
if repo.startswith(r"https://"):
|
||||||
|
repo = repo.replace("https://", "git@").replace("/", ":", 1)
|
||||||
|
if repo.endswith("/"):
|
||||||
|
repo = repo[:-1]
|
||||||
|
repo = f"{repo}.git"
|
||||||
|
return repo
|
||||||
|
|
||||||
|
|
||||||
|
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
    """Parse a comma-separated string to a list and account for various
    formatting options: surrounding square brackets, whitespace around the
    items and single or double quotes around them. Mostly used to handle CLI
    arguments that take a list of comma-separated values.

    value (str): The value to parse.
    intify (bool): Whether to convert values to ints.
    RETURNS (Union[List[str], List[int]]): A list of strings or ints. Empty
        if the value is empty or only consists of brackets and whitespace.
    """
    if not value:
        return []
    if value.startswith("[") and value.endswith("]"):
        value = value[1:-1]
    # An empty list like "[]" has no items. Without this check, it would be
    # parsed as one empty string (and fail when converting to int).
    if not value.strip():
        return []
    result = []
    for p in value.split(","):
        p = p.strip()
        if p.startswith("'") and p.endswith("'"):
            p = p[1:-1]
        if p.startswith('"') and p.endswith('"'):
            p = p[1:-1]
        p = p.strip()
        if intify:
            p = int(p)
        result.append(p)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def setup_gpu(use_gpu: int) -> None:
    """Log the device in use and require the GPU if a GPU ID (0 or above)
    is given. Negative values mean CPU."""
    if use_gpu < 0:
        msg.info("Using CPU")
        return
    msg.info(f"Using GPU: {use_gpu}")
    require_gpu(use_gpu)
|
|
@ -1,132 +1,177 @@
|
||||||
# coding: utf8
|
from typing import Optional, Any, List, Union
|
||||||
from __future__ import unicode_literals
|
from enum import Enum
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
import itertools
|
||||||
|
|
||||||
from .converters import conllu2json, iob2json, conll_ner2json
|
from ._util import app, Arg, Opt
|
||||||
from .converters import ner_jsonl2json
|
from ..training import docs_to_json
|
||||||
|
from ..tokens import DocBin
|
||||||
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
|
from ..training.converters import conllu_to_docs
|
||||||
|
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
# matched by file extension and content. To add a converter, add a new
|
# matched by file extension and content. To add a converter, add a new
|
||||||
# entry to this dict with the file extension mapped to the converter function
|
# entry to this dict with the file extension mapped to the converter function
|
||||||
# imported from /converters.
|
# imported from /converters.
|
||||||
|
|
||||||
CONVERTERS = {
|
CONVERTERS = {
|
||||||
"conllubio": conllu2json,
|
"conllubio": conllu_to_docs,
|
||||||
"conllu": conllu2json,
|
"conllu": conllu_to_docs,
|
||||||
"conll": conllu2json,
|
"conll": conllu_to_docs,
|
||||||
"ner": conll_ner2json,
|
"ner": conll_ner_to_docs,
|
||||||
"iob": iob2json,
|
"iob": iob_to_docs,
|
||||||
"jsonl": ner_jsonl2json,
|
"json": json_to_docs,
|
||||||
}
|
}
|
||||||
|
|
||||||
# File types
|
|
||||||
FILE_TYPES = ("json", "jsonl", "msg")
|
# File types that can be written to stdout
|
||||||
FILE_TYPES_STDOUT = ("json", "jsonl")
|
FILE_TYPES_STDOUT = ("json",)
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
class FileTypes(str, Enum):
|
||||||
input_file=("Input file", "positional", None, str),
|
json = "json"
|
||||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
spacy = "spacy"
|
||||||
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
|
|
||||||
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
|
|
||||||
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
|
@app.command("convert")
|
||||||
model=("Model for sentence segmentation (for -s)", "option", "b", str),
|
def convert_cli(
|
||||||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
# fmt: off
|
||||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
input_path: str = Arg(..., help="Input file or directory", exists=True),
|
||||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
|
||||||
)
|
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
|
||||||
def convert(
|
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
|
||||||
input_file,
|
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
|
||||||
output_dir="-",
|
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||||
file_type="json",
|
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||||
n_sents=1,
|
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||||
seg_sents=False,
|
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||||
model=None,
|
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||||
morphology=False,
|
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||||
converter="auto",
|
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||||
lang=None,
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Convert files into JSON format for use with train command and other
|
Convert files into json or DocBin format for training. The resulting .spacy
|
||||||
experiment management functions. If no output_dir is specified, the data
|
file can be used with the train command and other experiment management
|
||||||
|
functions.
|
||||||
|
|
||||||
|
If no output_dir is specified and the output format is JSON, the data
|
||||||
is written to stdout, so you can pipe them forward to a JSON file:
|
is written to stdout, so you can pipe them forward to a JSON file:
|
||||||
$ spacy convert some_file.conllu > some_file.json
|
$ spacy convert some_file.conllu --file-type json > some_file.json
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/cli#convert
|
||||||
"""
|
"""
|
||||||
no_print = output_dir == "-"
|
if isinstance(file_type, FileTypes):
|
||||||
msg = Printer(no_print=no_print)
|
# We get an instance of the FileTypes from the CLI so we need its string value
|
||||||
input_path = Path(input_file)
|
file_type = file_type.value
|
||||||
if file_type not in FILE_TYPES:
|
input_path = Path(input_path)
|
||||||
msg.fail(
|
output_dir = "-" if output_dir == Path("-") else output_dir
|
||||||
"Unknown file type: '{}'".format(file_type),
|
silent = output_dir == "-"
|
||||||
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
|
msg = Printer(no_print=silent)
|
||||||
exits=1,
|
verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
|
||||||
)
|
converter = _get_converter(msg, converter, input_path)
|
||||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
convert(
|
||||||
# TODO: support msgpack via stdout in srsly?
|
input_path,
|
||||||
msg.fail(
|
output_dir,
|
||||||
"Can't write .{} data to stdout.".format(file_type),
|
file_type=file_type,
|
||||||
"Please specify an output directory.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if not input_path.exists():
|
|
||||||
msg.fail("Input file not found", input_path, exits=1)
|
|
||||||
if output_dir != "-" and not Path(output_dir).exists():
|
|
||||||
msg.fail("Output directory not found", output_dir, exits=1)
|
|
||||||
input_data = input_path.open("r", encoding="utf-8").read()
|
|
||||||
if converter == "auto":
|
|
||||||
converter = input_path.suffix[1:]
|
|
||||||
if converter == "ner" or converter == "iob":
|
|
||||||
converter_autodetect = autodetect_ner_format(input_data)
|
|
||||||
if converter_autodetect == "ner":
|
|
||||||
msg.info("Auto-detected token-per-line NER format")
|
|
||||||
converter = converter_autodetect
|
|
||||||
elif converter_autodetect == "iob":
|
|
||||||
msg.info("Auto-detected sentence-per-line NER format")
|
|
||||||
converter = converter_autodetect
|
|
||||||
else:
|
|
||||||
msg.warn(
|
|
||||||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
|
||||||
)
|
|
||||||
if converter not in CONVERTERS:
|
|
||||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
|
||||||
# Use converter function to convert data
|
|
||||||
func = CONVERTERS[converter]
|
|
||||||
data = func(
|
|
||||||
input_data,
|
|
||||||
n_sents=n_sents,
|
n_sents=n_sents,
|
||||||
seg_sents=seg_sents,
|
seg_sents=seg_sents,
|
||||||
use_morphology=morphology,
|
|
||||||
lang=lang,
|
|
||||||
model=model,
|
model=model,
|
||||||
no_print=no_print,
|
morphology=morphology,
|
||||||
|
merge_subtokens=merge_subtokens,
|
||||||
|
converter=converter,
|
||||||
|
ner_map=ner_map,
|
||||||
|
lang=lang,
|
||||||
|
concatenate=concatenate,
|
||||||
|
silent=silent,
|
||||||
|
msg=msg,
|
||||||
)
|
)
|
||||||
if output_dir != "-":
|
|
||||||
# Export data to a file
|
|
||||||
suffix = ".{}".format(file_type)
|
def convert(
|
||||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
input_path: Union[str, Path],
|
||||||
if file_type == "json":
|
output_dir: Union[str, Path],
|
||||||
srsly.write_json(output_file, data)
|
*,
|
||||||
elif file_type == "jsonl":
|
file_type: str = "json",
|
||||||
srsly.write_jsonl(output_file, data)
|
n_sents: int = 1,
|
||||||
elif file_type == "msg":
|
seg_sents: bool = False,
|
||||||
srsly.write_msgpack(output_file, data)
|
model: Optional[str] = None,
|
||||||
msg.good(
|
morphology: bool = False,
|
||||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
merge_subtokens: bool = False,
|
||||||
|
converter: str = "auto",
|
||||||
|
ner_map: Optional[Path] = None,
|
||||||
|
lang: Optional[str] = None,
|
||||||
|
concatenate: bool = False,
|
||||||
|
silent: bool = True,
|
||||||
|
msg: Optional[Printer],
|
||||||
|
) -> None:
|
||||||
|
if not msg:
|
||||||
|
msg = Printer(no_print=silent)
|
||||||
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
|
doc_files = []
|
||||||
|
for input_loc in walk_directory(Path(input_path), converter):
|
||||||
|
input_data = input_loc.open("r", encoding="utf-8").read()
|
||||||
|
# Use converter function to convert data
|
||||||
|
func = CONVERTERS[converter]
|
||||||
|
docs = func(
|
||||||
|
input_data,
|
||||||
|
n_sents=n_sents,
|
||||||
|
seg_sents=seg_sents,
|
||||||
|
append_morphology=morphology,
|
||||||
|
merge_subtokens=merge_subtokens,
|
||||||
|
lang=lang,
|
||||||
|
model=model,
|
||||||
|
no_print=silent,
|
||||||
|
ner_map=ner_map,
|
||||||
)
|
)
|
||||||
else:
|
doc_files.append((input_loc, docs))
|
||||||
# Print to stdout
|
if concatenate:
|
||||||
|
all_docs = itertools.chain.from_iterable([docs for _, docs in doc_files])
|
||||||
|
doc_files = [(input_path, all_docs)]
|
||||||
|
for input_loc, docs in doc_files:
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
srsly.write_json("-", data)
|
data = [docs_to_json(docs)]
|
||||||
elif file_type == "jsonl":
|
len_docs = len(data)
|
||||||
srsly.write_jsonl("-", data)
|
else:
|
||||||
|
db = DocBin(docs=docs, store_user_data=True)
|
||||||
|
len_docs = len(db)
|
||||||
|
data = db.to_bytes()
|
||||||
|
if output_dir == "-":
|
||||||
|
_print_docs_to_stdout(data, file_type)
|
||||||
|
else:
|
||||||
|
if input_loc != input_path:
|
||||||
|
subpath = input_loc.relative_to(input_path)
|
||||||
|
output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
|
||||||
|
else:
|
||||||
|
output_file = Path(output_dir) / input_loc.parts[-1]
|
||||||
|
output_file = output_file.with_suffix(f".{file_type}")
|
||||||
|
_write_docs_to_file(data, output_file, file_type)
|
||||||
|
msg.good(f"Generated output file ({len_docs} documents): {output_file}")
|
||||||
|
|
||||||
|
|
||||||
def autodetect_ner_format(input_data):
|
def _print_docs_to_stdout(data: Any, output_type: str) -> None:
    """Write converted data to stdout: as JSON for the "json" output type,
    as raw bytes for everything else."""
    if output_type != "json":
        sys.stdout.buffer.write(data)
        return
    srsly.write_json("-", data)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_docs_to_file(data: Any, output_file: Path, output_type: str) -> None:
|
||||||
|
if not output_file.parent.exists():
|
||||||
|
output_file.parent.mkdir(parents=True)
|
||||||
|
if output_type == "json":
|
||||||
|
srsly.write_json(output_file, data)
|
||||||
|
else:
|
||||||
|
with output_file.open("wb") as file_:
|
||||||
|
file_.write(data)
|
||||||
|
|
||||||
|
|
||||||
|
def autodetect_ner_format(input_data: str) -> Optional[str]:
|
||||||
# guess format from the first 20 lines
|
# guess format from the first 20 lines
|
||||||
lines = input_data.split("\n")[:20]
|
lines = input_data.split("\n")[:20]
|
||||||
format_guesses = {"ner": 0, "iob": 0}
|
format_guesses = {"ner": 0, "iob": 0}
|
||||||
|
@ -143,3 +188,86 @@ def autodetect_ner_format(input_data):
|
||||||
if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
|
if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
|
||||||
return "iob"
|
return "iob"
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def walk_directory(path: Path, converter: str) -> List[Path]:
    """Collect the input files located at or below `path`.

    Entries whose name starts with "." are skipped, and skipped directories
    are not descended into. For the "json", "conll" and "iob" converters only
    files whose name ends with the converter name are kept; any other
    converter value keeps every file.

    path (Path): File or directory to inspect.
    converter (str): Name of the requested converter.
    RETURNS (List[Path]): `[path]` if `path` is not a directory, otherwise the
        sorted list of matching files found under it.
    """
    if not path.is_dir():
        return [path]
    # Converters that filter files by the ending of their name.
    name_filtered = ("json", "conll", "iob")
    required_ending = converter if converter in name_filtered else None
    # Explicit work stack instead of extending a list while iterating it.
    # Traversal order does not matter because the result is sorted below.
    pending = [path]
    seen = set()
    locs = []
    while pending:
        current = pending.pop()
        key = str(current)
        if key in seen:
            continue
        seen.add(key)
        # Use .name rather than parts[-1]: Path(".") has empty parts, so
        # indexing them raised IndexError when converting the current dir.
        name = current.name
        if name.startswith("."):
            continue
        if current.is_dir():
            pending.extend(current.iterdir())
        elif required_ending is None or name.endswith(required_ending):
            locs.append(current)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs
|
||||||
|
|
||||||
|
|
||||||
|
def verify_cli_args(
    msg: Printer,
    input_path: Union[str, Path],
    output_dir: Union[str, Path],
    file_type: FileTypes,
    converter: str,
    ner_map: Optional[Path],
):
    """Check the convert command's arguments and report problems via `msg`.

    Every failed check is reported with `msg.fail(..., exits=1)`. The checks
    run in this order: stdout-compatible file type, input path exists, output
    directory exists, NER map exists, directory contents, known converter.

    msg (Printer): Printer used to report failures.
    input_path (Union[str, Path]): Input file or directory.
    output_dir (Union[str, Path]): Output directory, or "-" for stdout.
    file_type (FileTypes): Requested output file type.
    converter (str): Converter name, or "auto".
    ner_map (Optional[Path]): Optional path to an NER map file.
    """
    source = Path(input_path)
    to_stdout = output_dir == "-"
    if to_stdout and file_type not in FILE_TYPES_STDOUT:
        msg.fail(
            f"Can't write .{file_type} data to stdout. Please specify an output directory.",
            exits=1,
        )
    if not source.exists():
        msg.fail("Input file not found", source, exits=1)
    if not to_stdout and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if ner_map is not None and not Path(ner_map).exists():
        msg.fail("NER map not found", ner_map, exits=1)
    if source.is_dir():
        found = walk_directory(source, converter)
        if not found:
            msg.fail("No input files in directory", source, exits=1)
        # Distinct file extensions (without the leading dot) among the inputs.
        suffixes = list({loc.suffix[1:] for loc in found})
        if converter == "auto" and len(suffixes) > 1:
            # Auto-detection needs one common extension to pick a converter.
            msg.fail("All input files must be same type", ",".join(suffixes), exits=1)
    if converter != "auto" and converter not in CONVERTERS:
        msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_converter(msg, converter, input_path):
    """Resolve the converter name to use for `input_path`.

    msg: Printer-like object used for info and warning messages.
    converter: Requested converter name, or "auto" to use the file extension.
    input_path: Input file or directory; for a directory, the first file
        returned by `walk_directory` stands in for the whole set.
    RETURNS: The converter name. For "ner"/"iob" this is the format detected
        from the file contents when detection succeeds, otherwise the name
        is returned unchanged after a warning.
    """
    if input_path.is_dir():
        input_path = walk_directory(input_path, converter)[0]
    if converter == "auto":
        # Extension without the leading dot.
        converter = input_path.suffix[1:]
    if converter not in ("ner", "iob"):
        return converter
    # The two NER layouts share file extensions, so look at the contents.
    with input_path.open(encoding="utf8") as file_:
        detected = autodetect_ner_format(file_.read())
    detection_messages = {
        "ner": "Auto-detected token-per-line NER format",
        "iob": "Auto-detected sentence-per-line NER format",
    }
    if detected in detection_messages:
        msg.info(detection_messages[detected])
        return detected
    msg.warn(
        "Can't automatically detect NER format. "
        "Conversion may not succeed. "
        "See https://spacy.io/api/cli#convert"
    )
    return converter
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user