mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
c64c7f5cc1
17
.github/ISSUE_TEMPLATE.md
vendored
17
.github/ISSUE_TEMPLATE.md
vendored
|
@ -1,15 +1,18 @@
|
||||||
<!--- Please provide a summary in the title and describe your issue here.
|
<!--- Please provide a summary in the title and describe your issue here.
|
||||||
Is this a bug or feature request? If a bug, include all the steps that led to the issue.
|
Is this a bug or feature request? If a bug, include all the steps that led to the issue.
|
||||||
|
|
||||||
If you're looking for help with your code, consider posting a question on Stack Overflow instead:
|
If you're looking for help with your code, consider posting a question here:
|
||||||
http://stackoverflow.com/questions/tagged/spacy -->
|
|
||||||
|
|
||||||
|
|
||||||
|
- GitHub Discussions: https://github.com/explosion/spaCy/discussions
|
||||||
|
- Stack Overflow: http://stackoverflow.com/questions/tagged/spacy
|
||||||
|
-->
|
||||||
|
|
||||||
## Your Environment
|
## Your Environment
|
||||||
|
|
||||||
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type
|
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type
|
||||||
`python -m spacy info --markdown` and copy-paste the result here.-->
|
`python -m spacy info --markdown` and copy-paste the result here.-->
|
||||||
* Operating System:
|
|
||||||
* Python Version Used:
|
- Operating System:
|
||||||
* spaCy Version Used:
|
- Python Version Used:
|
||||||
* Environment Information:
|
- spaCy Version Used:
|
||||||
|
- Environment Information:
|
||||||
|
|
11
.github/ISSUE_TEMPLATE/03_request.md
vendored
11
.github/ISSUE_TEMPLATE/03_request.md
vendored
|
@ -1,11 +0,0 @@
|
||||||
---
|
|
||||||
name: "\U0001F381 Feature Request"
|
|
||||||
about: Do you have an idea for an improvement, a new feature or a plugin?
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Feature description
|
|
||||||
<!-- Please describe the feature: Which area of the library is it related to? What specific solution would you like? -->
|
|
||||||
|
|
||||||
## Could the feature be a [custom component](https://spacy.io/usage/processing-pipelines#custom-components) or [spaCy plugin](https://spacy.io/universe)?
|
|
||||||
If so, we will tag it as [`project idea`](https://github.com/explosion/spaCy/labels/project%20idea) so other users can take it on.
|
|
19
.github/ISSUE_TEMPLATE/04_other.md
vendored
Normal file
19
.github/ISSUE_TEMPLATE/04_other.md
vendored
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
---
|
||||||
|
name: "\U0001F4AC Anything else?"
|
||||||
|
about: For feature and project ideas, general usage questions or help with your code, please post on the GitHub Discussions board instead.
|
||||||
|
---
|
||||||
|
|
||||||
|
<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and enhancements. If you're looking for help with your code, consider posting a question here:
|
||||||
|
|
||||||
|
- GitHub Discussions: https://github.com/explosion/spaCy/discussions
|
||||||
|
- Stack Overflow: http://stackoverflow.com/questions/tagged/spacy
|
||||||
|
-->
|
||||||
|
|
||||||
|
## Your Environment
|
||||||
|
|
||||||
|
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
|
||||||
|
|
||||||
|
- Operating System:
|
||||||
|
- Python Version Used:
|
||||||
|
- spaCy Version Used:
|
||||||
|
- Environment Information:
|
15
.github/ISSUE_TEMPLATE/05_other.md
vendored
15
.github/ISSUE_TEMPLATE/05_other.md
vendored
|
@ -1,15 +0,0 @@
|
||||||
---
|
|
||||||
name: "\U0001F4AC Anything else?"
|
|
||||||
about: For general usage questions or help with your code, please consider
|
|
||||||
posting on Stack Overflow instead.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on Stack Overflow instead: http://stackoverflow.com/questions/tagged/spacy -->
|
|
||||||
|
|
||||||
## Your Environment
|
|
||||||
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
|
|
||||||
* Operating System:
|
|
||||||
* Python Version Used:
|
|
||||||
* spaCy Version Used:
|
|
||||||
* Environment Information:
|
|
106
.github/contributors/AMArostegui.md
vendored
Normal file
106
.github/contributors/AMArostegui.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Antonio Miras |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 11/01/2020 |
|
||||||
|
| GitHub username | AMArostegui |
|
||||||
|
| Website (optional) | |
|
108
.github/contributors/KKsharma99.md
vendored
Normal file
108
.github/contributors/KKsharma99.md
vendored
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
<!-- This agreement was mistakenly submitted as an update to the CONTRIBUTOR_AGREEMENT.md template. Commit: 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4. It was therefore moved over manually. -->
|
||||||
|
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Kunal Sharma |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 10/19/2020 |
|
||||||
|
| GitHub username | KKsharma99 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/alexcombessie.md
vendored
Normal file
106
.github/contributors/alexcombessie.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Alex COMBESSIE |
|
||||||
|
| Company name (if applicable) | Dataiku |
|
||||||
|
| Title or role (if applicable) | R&D Engineer |
|
||||||
|
| Date | 2020-10-27 |
|
||||||
|
| GitHub username | alexcombessie |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/borijang.md
vendored
Normal file
106
.github/contributors/borijang.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Borijan Georgievski |
|
||||||
|
| Company name (if applicable) | Netcetera |
|
||||||
|
| Title or role (if applicable) | Data Scientist |
|
||||||
|
| Date | 2020.10.09 |
|
||||||
|
| GitHub username | borijang |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/bratao.md
vendored
Normal file
106
.github/contributors/bratao.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Bruno Souza Cabral |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 24/12/2020 |
|
||||||
|
| GitHub username | bratao |
|
||||||
|
| Website (optional) | |
|
107
.github/contributors/cristianasp.md
vendored
Normal file
107
.github/contributors/cristianasp.md
vendored
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Cristiana S Parada |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-11-04 |
|
||||||
|
| GitHub username | cristianasp |
|
||||||
|
| Website (optional) | |
|
||||||
|
|
106
.github/contributors/danielvasic.md
vendored
Normal file
106
.github/contributors/danielvasic.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Daniel Vasić |
|
||||||
|
| Company name (if applicable) | University of Mostar |
|
||||||
|
| Title or role (if applicable) | Teaching assistant |
|
||||||
|
| Date | 13/10/2020 |
|
||||||
|
| GitHub username | danielvasic |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/forest1988.md
vendored
Normal file
106
.github/contributors/forest1988.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Yusuke Mori |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | Ph.D. student |
|
||||||
|
| Date | 2020/11/22 |
|
||||||
|
| GitHub username | forest1988 |
|
||||||
|
| Website (optional) | https://forest1988.github.io |
|
106
.github/contributors/jabortell.md
vendored
Normal file
106
.github/contributors/jabortell.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Jacob Bortell |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-11-20 |
|
||||||
|
| GitHub username | jabortell |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/lorenanda.md
vendored
Normal file
106
.github/contributors/lorenanda.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Lorena Ciutacu |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-12-23 |
|
||||||
|
| GitHub username | lorenanda |
|
||||||
|
| Website (optional) | lorenaciutacu.com/ |
|
106
.github/contributors/ophelielacroix.md
vendored
Normal file
106
.github/contributors/ophelielacroix.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|-------------------------------|-----------------|
|
||||||
|
| Name | Ophélie Lacroix |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | |
|
||||||
|
| GitHub username | ophelielacroix |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/rafguns.md
vendored
Normal file
106
.github/contributors/rafguns.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Raf Guns |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-12-09 |
|
||||||
|
| GitHub username | rafguns |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/revuel.md
vendored
Normal file
106
.github/contributors/revuel.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Miguel Revuelta |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-11-17 |
|
||||||
|
| GitHub username | revuel |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/robertsipek.md
vendored
Normal file
106
.github/contributors/robertsipek.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Robert Šípek |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-10-22 |
|
||||||
|
| GitHub username | @robertsipek |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/thomasbird.md
vendored
Normal file
106
.github/contributors/thomasbird.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | ---------------------- |
|
||||||
|
| Name | Thomas Bird |
|
||||||
|
| Company name (if applicable) | Leap Beyond Group |
|
||||||
|
| Title or role (if applicable) | Data Scientist |
|
||||||
|
| Date | 15/12/2020 |
|
||||||
|
| GitHub username | thomasbird |
|
||||||
|
| Website (optional) | https://leapbeyond.ai |
|
106
.github/contributors/vha14.md
vendored
Normal file
106
.github/contributors/vha14.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Vu Ha |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 10-23-2020 |
|
||||||
|
| GitHub username | vha14 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/walterhenry.md
vendored
Normal file
106
.github/contributors/walterhenry.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Walter Henry |
|
||||||
|
| Company name (if applicable) | ExplosionAI GmbH |
|
||||||
|
| Title or role (if applicable) | Executive Assistant |
|
||||||
|
| Date | September 14, 2020 |
|
||||||
|
| GitHub username | walterhenry |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/yosiasz.md
vendored
Normal file
106
.github/contributors/yosiasz.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Josiah Solomon |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2020-12-15 |
|
||||||
|
| GitHub username | yosiasz |
|
||||||
|
| Website (optional) | |
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -51,6 +51,7 @@ env3.*/
|
||||||
.pypyenv
|
.pypyenv
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
.hypothesis/
|
||||||
|
|
||||||
# Distribution / packaging
|
# Distribution / packaging
|
||||||
env/
|
env/
|
||||||
|
|
12
CITATION
12
CITATION
|
@ -1,6 +1,8 @@
|
||||||
@unpublished{spacy2,
|
@software{spacy,
|
||||||
AUTHOR = {Honnibal, Matthew and Montani, Ines},
|
author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
|
||||||
TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
|
title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
|
||||||
YEAR = {2017},
|
year = 2020,
|
||||||
Note = {To appear}
|
publisher = {Zenodo},
|
||||||
|
doi = {10.5281/zenodo.1212303},
|
||||||
|
url = {https://doi.org/10.5281/zenodo.1212303}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,11 +26,11 @@ also often include helpful tips and solutions to common problems. You should
|
||||||
also check the [troubleshooting guide](https://spacy.io/usage/#troubleshooting)
|
also check the [troubleshooting guide](https://spacy.io/usage/#troubleshooting)
|
||||||
to see if your problem is already listed there.
|
to see if your problem is already listed there.
|
||||||
|
|
||||||
If you're looking for help with your code, consider posting a question on
|
If you're looking for help with your code, consider posting a question on the
|
||||||
[Stack Overflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
|
[GitHub Discussions board](https://github.com/explosion/spaCy/discussions) or
|
||||||
tag it `spacy` and `python`, more people will see it and hopefully be able to
|
[Stack Overflow](http://stackoverflow.com/questions/tagged/spacy). Please
|
||||||
help. Please understand that we won't be able to provide individual support via
|
understand that we won't be able to provide individual support via email. We
|
||||||
email. We also believe that help is much more valuable if it's **shared publicly**,
|
also believe that help is much more valuable if it's **shared publicly**,
|
||||||
so that more people can benefit from it.
|
so that more people can benefit from it.
|
||||||
|
|
||||||
### Submitting issues
|
### Submitting issues
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
|
|
||||||
ifndef SPACY_EXTRAS
|
ifndef SPACY_EXTRAS
|
||||||
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
|
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef PYVER
|
ifndef PYVER
|
||||||
|
|
38
README.md
38
README.md
|
@ -61,12 +61,14 @@ much more valuable if it's shared publicly, so that more people can benefit from
|
||||||
it.
|
it.
|
||||||
|
|
||||||
| Type | Platforms |
|
| Type | Platforms |
|
||||||
| ----------------------- | ---------------------- |
|
| ------------------------------- | --------------------------------------- |
|
||||||
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
|
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
|
||||||
| 🎁 **Feature Requests** | [GitHub Issue Tracker] |
|
| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] |
|
||||||
| 👩💻 **Usage Questions** | [Stack Overflow] |
|
| 👩💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] |
|
||||||
|
| 🗯 **General Discussion** | [GitHub Discussions] |
|
||||||
|
|
||||||
[github issue tracker]: https://github.com/explosion/spaCy/issues
|
[github issue tracker]: https://github.com/explosion/spaCy/issues
|
||||||
|
[github discussions]: https://github.com/explosion/spaCy/discussions
|
||||||
[stack overflow]: https://stackoverflow.com/questions/tagged/spacy
|
[stack overflow]: https://stackoverflow.com/questions/tagged/spacy
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
@ -126,6 +128,7 @@ environment to avoid modifying system state:
|
||||||
```bash
|
```bash
|
||||||
python -m venv .env
|
python -m venv .env
|
||||||
source .env/bin/activate
|
source .env/bin/activate
|
||||||
|
pip install -U pip setuptools wheel
|
||||||
pip install spacy
|
pip install spacy
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -224,16 +227,28 @@ do that depends on your system. See notes on Ubuntu, OS X and Windows for
|
||||||
details.
|
details.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# make sure you are using the latest pip
|
|
||||||
python -m pip install -U pip
|
|
||||||
git clone https://github.com/explosion/spaCy
|
git clone https://github.com/explosion/spaCy
|
||||||
cd spaCy
|
cd spaCy
|
||||||
|
|
||||||
python -m venv .env
|
python -m venv .env
|
||||||
source .env/bin/activate
|
source .env/bin/activate
|
||||||
export PYTHONPATH=`pwd`
|
|
||||||
|
# make sure you are using the latest pip
|
||||||
|
python -m pip install -U pip setuptools wheel
|
||||||
|
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
To install with extras:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install .[lookups,cuda102]
|
||||||
|
```
|
||||||
|
|
||||||
|
To install all dependencies required for development:
|
||||||
|
|
||||||
|
```bash
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
python setup.py build_ext --inplace
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Compared to regular install via pip, [requirements.txt](requirements.txt)
|
Compared to regular install via pip, [requirements.txt](requirements.txt)
|
||||||
|
@ -271,14 +286,13 @@ tests, you'll usually want to clone the repository and build spaCy from source.
|
||||||
This will also install the required development dependencies and test utilities
|
This will also install the required development dependencies and test utilities
|
||||||
defined in the `requirements.txt`.
|
defined in the `requirements.txt`.
|
||||||
|
|
||||||
Alternatively, you can find out where spaCy is installed and run `pytest` on
|
Alternatively, you can run `pytest` on the tests from within the installed
|
||||||
that directory. Don't forget to also install the test utilities via spaCy's
|
`spacy` package. Don't forget to also install the test utilities via spaCy's
|
||||||
`requirements.txt`:
|
`requirements.txt`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
|
pip install -r requirements.txt
|
||||||
pip install -r path/to/requirements.txt
|
python -m pytest --pyargs spacy
|
||||||
python -m pytest <spacy-directory>
|
|
||||||
```
|
```
|
||||||
|
|
||||||
See [the documentation](https://spacy.io/usage#tests) for more details and
|
See [the documentation](https://spacy.io/usage#tests) for more details and
|
||||||
|
|
|
@ -2,67 +2,75 @@ trigger:
|
||||||
batch: true
|
batch: true
|
||||||
branches:
|
branches:
|
||||||
include:
|
include:
|
||||||
- '*'
|
- "*"
|
||||||
exclude:
|
exclude:
|
||||||
- 'spacy.io'
|
- "spacy.io"
|
||||||
paths:
|
paths:
|
||||||
exclude:
|
exclude:
|
||||||
- 'website/*'
|
- "website/*"
|
||||||
- '*.md'
|
- "*.md"
|
||||||
pr:
|
pr:
|
||||||
paths:
|
paths:
|
||||||
exclude:
|
exclude:
|
||||||
- 'website/*'
|
- "website/*"
|
||||||
- '*.md'
|
- "*.md"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
||||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||||
# defined in .flake8 and overwrites the selected codes.
|
# defined in .flake8 and overwrites the selected codes.
|
||||||
- job: 'Validate'
|
- job: "Validate"
|
||||||
pool:
|
pool:
|
||||||
vmImage: 'ubuntu-16.04'
|
vmImage: "ubuntu-16.04"
|
||||||
steps:
|
steps:
|
||||||
- task: UsePythonVersion@0
|
- task: UsePythonVersion@0
|
||||||
inputs:
|
inputs:
|
||||||
versionSpec: '3.7'
|
versionSpec: "3.7"
|
||||||
- script: |
|
- script: |
|
||||||
pip install flake8==3.5.0
|
pip install flake8==3.5.0
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||||
displayName: 'flake8'
|
displayName: "flake8"
|
||||||
|
|
||||||
- job: 'Test'
|
- job: "Test"
|
||||||
dependsOn: 'Validate'
|
dependsOn: "Validate"
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
|
# We're only running one platform per Python version to speed up builds
|
||||||
Python36Linux:
|
Python36Linux:
|
||||||
imageName: 'ubuntu-16.04'
|
imageName: "ubuntu-16.04"
|
||||||
python.version: '3.6'
|
python.version: "3.6"
|
||||||
Python36Windows:
|
# Python36Windows:
|
||||||
imageName: 'vs2017-win2016'
|
# imageName: "vs2017-win2016"
|
||||||
python.version: '3.6'
|
# python.version: "3.6"
|
||||||
Python36Mac:
|
# Python36Mac:
|
||||||
imageName: 'macos-10.14'
|
# imageName: "macos-10.14"
|
||||||
python.version: '3.6'
|
# python.version: "3.6"
|
||||||
# Don't test on 3.7 for now to speed up builds
|
|
||||||
# Python37Linux:
|
# Python37Linux:
|
||||||
# imageName: 'ubuntu-16.04'
|
# imageName: "ubuntu-16.04"
|
||||||
# python.version: '3.7'
|
# python.version: "3.7"
|
||||||
# Python37Windows:
|
Python37Windows:
|
||||||
# imageName: 'vs2017-win2016'
|
imageName: "vs2017-win2016"
|
||||||
# python.version: '3.7'
|
python.version: "3.7"
|
||||||
# Python37Mac:
|
# Python37Mac:
|
||||||
# imageName: 'macos-10.14'
|
# imageName: "macos-10.14"
|
||||||
# python.version: '3.7'
|
# python.version: "3.7"
|
||||||
Python38Linux:
|
# Python38Linux:
|
||||||
imageName: 'ubuntu-16.04'
|
# imageName: "ubuntu-16.04"
|
||||||
python.version: '3.8'
|
# python.version: "3.8"
|
||||||
Python38Windows:
|
# Python38Windows:
|
||||||
imageName: 'vs2017-win2016'
|
# imageName: "vs2017-win2016"
|
||||||
python.version: '3.8'
|
# python.version: "3.8"
|
||||||
Python38Mac:
|
Python38Mac:
|
||||||
imageName: 'macos-10.14'
|
imageName: "macos-10.14"
|
||||||
python.version: '3.8'
|
python.version: "3.8"
|
||||||
|
Python39Linux:
|
||||||
|
imageName: "ubuntu-16.04"
|
||||||
|
python.version: "3.9"
|
||||||
|
Python39Windows:
|
||||||
|
imageName: "vs2017-win2016"
|
||||||
|
python.version: "3.9"
|
||||||
|
Python39Mac:
|
||||||
|
imageName: "macos-10.14"
|
||||||
|
python.version: "3.9"
|
||||||
maxParallel: 4
|
maxParallel: 4
|
||||||
pool:
|
pool:
|
||||||
vmImage: $(imageName)
|
vmImage: $(imageName)
|
||||||
|
@ -70,28 +78,35 @@ jobs:
|
||||||
steps:
|
steps:
|
||||||
- task: UsePythonVersion@0
|
- task: UsePythonVersion@0
|
||||||
inputs:
|
inputs:
|
||||||
versionSpec: '$(python.version)'
|
versionSpec: "$(python.version)"
|
||||||
architecture: 'x64'
|
architecture: "x64"
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m pip install -U setuptools
|
python -m pip install -U setuptools
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
displayName: 'Install dependencies'
|
displayName: "Install dependencies"
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python setup.py build_ext --inplace
|
python setup.py build_ext --inplace
|
||||||
python setup.py sdist --formats=gztar
|
python setup.py sdist --formats=gztar
|
||||||
displayName: 'Compile and build sdist'
|
displayName: "Compile and build sdist"
|
||||||
|
|
||||||
- task: DeleteFiles@1
|
- task: DeleteFiles@1
|
||||||
inputs:
|
inputs:
|
||||||
contents: 'spacy'
|
contents: "spacy"
|
||||||
displayName: 'Delete source directory'
|
displayName: "Delete source directory"
|
||||||
|
|
||||||
|
- script: |
|
||||||
|
pip freeze > installed.txt
|
||||||
|
pip uninstall -y -r installed.txt
|
||||||
|
displayName: "Uninstall all packages"
|
||||||
|
|
||||||
- bash: |
|
- bash: |
|
||||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||||
pip install dist/$SDIST
|
pip install dist/$SDIST
|
||||||
displayName: 'Install from sdist'
|
displayName: "Install from sdist"
|
||||||
|
|
||||||
- script: python -m pytest --pyargs spacy
|
- script: |
|
||||||
displayName: 'Run tests'
|
pip install -r requirements.txt
|
||||||
|
python -m pytest --pyargs spacy
|
||||||
|
displayName: "Run tests"
|
||||||
|
|
5
build-constraints.txt
Normal file
5
build-constraints.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# build version constraints for use with wheelwright + multibuild
|
||||||
|
numpy==1.15.0; python_version<='3.7'
|
||||||
|
numpy==1.17.3; python_version=='3.8'
|
||||||
|
numpy==1.19.3; python_version=='3.9'
|
||||||
|
numpy; python_version>='3.10'
|
|
@ -3,6 +3,8 @@ redirects = [
|
||||||
{from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true },
|
{from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true },
|
||||||
# Subdomain for branches
|
# Subdomain for branches
|
||||||
{from = "https://nightly.spacy.io/*", to="https://nightly-spacy-io.spacy.io/:splat", force = true, status = 200},
|
{from = "https://nightly.spacy.io/*", to="https://nightly-spacy-io.spacy.io/:splat", force = true, status = 200},
|
||||||
|
# TODO: update this with the v2 branch build once v3 is live (status = 200)
|
||||||
|
{from = "https://v2.spacy.io/*", to="https://spacy.io/:splat", force = true},
|
||||||
# Old subdomains
|
# Old subdomains
|
||||||
{from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true},
|
{from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true},
|
||||||
{from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true},
|
{from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true},
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = [
|
requires = [
|
||||||
"setuptools",
|
"setuptools",
|
||||||
"wheel",
|
|
||||||
"cython>=0.25",
|
"cython>=0.25",
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0rc2,<8.1.0",
|
"thinc>=8.0.0rc3,<8.1.0",
|
||||||
"blis>=0.4.0,<0.8.0",
|
"blis>=0.4.0,<0.8.0",
|
||||||
"pathy"
|
"pathy",
|
||||||
|
"numpy>=1.15.0",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
|
spacy-legacy>=3.0.0.dev0,<3.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0rc2,<8.1.0
|
thinc>=8.0.0rc3,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
ml_datasets==0.2.0a0
|
ml_datasets==0.2.0a0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
@ -14,7 +15,7 @@ pathy
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.5.0,<1.7.0
|
pydantic>=1.7.1,<1.8.0
|
||||||
jinja2
|
jinja2
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
|
@ -27,3 +28,4 @@ pytest>=4.6.5
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.5.0,<3.6.0
|
flake8>=3.5.0,<3.6.0
|
||||||
|
hypothesis
|
||||||
|
|
11
setup.cfg
11
setup.cfg
|
@ -20,6 +20,7 @@ classifiers =
|
||||||
Programming Language :: Python :: 3.6
|
Programming Language :: Python :: 3.6
|
||||||
Programming Language :: Python :: 3.7
|
Programming Language :: Python :: 3.7
|
||||||
Programming Language :: Python :: 3.8
|
Programming Language :: Python :: 3.8
|
||||||
|
Programming Language :: Python :: 3.9
|
||||||
Topic :: Scientific/Engineering
|
Topic :: Scientific/Engineering
|
||||||
|
|
||||||
[options]
|
[options]
|
||||||
|
@ -27,20 +28,20 @@ zip_safe = false
|
||||||
include_package_data = true
|
include_package_data = true
|
||||||
python_requires = >=3.6
|
python_requires = >=3.6
|
||||||
setup_requires =
|
setup_requires =
|
||||||
wheel
|
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
# We also need our Cython packages here to compile against
|
# We also need our Cython packages here to compile against
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0rc2,<8.1.0
|
thinc>=8.0.0rc3,<8.1.0
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
|
spacy-legacy>=3.0.0.dev0,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0rc2,<8.1.0
|
thinc>=8.0.0rc3,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.3.0,<3.0.0
|
srsly>=2.3.0,<3.0.0
|
||||||
|
@ -51,7 +52,7 @@ install_requires =
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.5.0,<1.7.0
|
pydantic>=1.7.1,<1.8.0
|
||||||
jinja2
|
jinja2
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
|
@ -88,6 +89,8 @@ cuda102 =
|
||||||
cupy-cuda102>=5.0.0b4,<9.0.0
|
cupy-cuda102>=5.0.0b4,<9.0.0
|
||||||
cuda110 =
|
cuda110 =
|
||||||
cupy-cuda110>=5.0.0b4,<9.0.0
|
cupy-cuda110>=5.0.0b4,<9.0.0
|
||||||
|
cuda111 =
|
||||||
|
cupy-cuda111>=5.0.0b4,<9.0.0
|
||||||
# Language tokenizers with external dependencies
|
# Language tokenizers with external dependencies
|
||||||
ja =
|
ja =
|
||||||
sudachipy>=0.4.9
|
sudachipy>=0.4.9
|
||||||
|
|
13
setup.py
13
setup.py
|
@ -2,9 +2,9 @@
|
||||||
from setuptools import Extension, setup, find_packages
|
from setuptools import Extension, setup, find_packages
|
||||||
import sys
|
import sys
|
||||||
import platform
|
import platform
|
||||||
|
import numpy
|
||||||
from distutils.command.build_ext import build_ext
|
from distutils.command.build_ext import build_ext
|
||||||
from distutils.sysconfig import get_python_inc
|
from distutils.sysconfig import get_python_inc
|
||||||
import numpy
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
from Cython.Build import cythonize
|
from Cython.Build import cythonize
|
||||||
|
@ -48,12 +48,15 @@ MOD_NAMES = [
|
||||||
"spacy.pipeline._parser_internals._state",
|
"spacy.pipeline._parser_internals._state",
|
||||||
"spacy.pipeline._parser_internals.stateclass",
|
"spacy.pipeline._parser_internals.stateclass",
|
||||||
"spacy.pipeline._parser_internals.transition_system",
|
"spacy.pipeline._parser_internals.transition_system",
|
||||||
|
"spacy.pipeline._parser_internals._beam_utils",
|
||||||
"spacy.tokenizer",
|
"spacy.tokenizer",
|
||||||
"spacy.training.align",
|
"spacy.training.align",
|
||||||
"spacy.training.gold_io",
|
"spacy.training.gold_io",
|
||||||
"spacy.tokens.doc",
|
"spacy.tokens.doc",
|
||||||
"spacy.tokens.span",
|
"spacy.tokens.span",
|
||||||
"spacy.tokens.token",
|
"spacy.tokens.token",
|
||||||
|
"spacy.tokens.span_group",
|
||||||
|
"spacy.tokens.graph",
|
||||||
"spacy.tokens.morphanalysis",
|
"spacy.tokens.morphanalysis",
|
||||||
"spacy.tokens._retokenize",
|
"spacy.tokens._retokenize",
|
||||||
"spacy.matcher.matcher",
|
"spacy.matcher.matcher",
|
||||||
|
@ -67,7 +70,7 @@ COMPILE_OPTIONS = {
|
||||||
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||||
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||||
}
|
}
|
||||||
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
|
LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []}
|
||||||
COMPILER_DIRECTIVES = {
|
COMPILER_DIRECTIVES = {
|
||||||
"language_level": -3,
|
"language_level": -3,
|
||||||
"embedsignature": True,
|
"embedsignature": True,
|
||||||
|
@ -194,13 +197,13 @@ def setup_package():
|
||||||
print(f"Copied {copy_file} -> {target_dir}")
|
print(f"Copied {copy_file} -> {target_dir}")
|
||||||
|
|
||||||
include_dirs = [
|
include_dirs = [
|
||||||
get_python_inc(plat_specific=True),
|
|
||||||
numpy.get_include(),
|
numpy.get_include(),
|
||||||
|
get_python_inc(plat_specific=True),
|
||||||
]
|
]
|
||||||
ext_modules = []
|
ext_modules = []
|
||||||
for name in MOD_NAMES:
|
for name in MOD_NAMES:
|
||||||
mod_path = name.replace(".", "/") + ".pyx"
|
mod_path = name.replace(".", "/") + ".pyx"
|
||||||
ext = Extension(name, [mod_path], language="c++")
|
ext = Extension(name, [mod_path], language="c++", extra_compile_args=["-std=c++11"])
|
||||||
ext_modules.append(ext)
|
ext_modules.append(ext)
|
||||||
print("Cythonizing sources")
|
print("Cythonizing sources")
|
||||||
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
|
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
|
||||||
|
@ -212,7 +215,7 @@ def setup_package():
|
||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
cmdclass={"build_ext": build_ext_subclass},
|
cmdclass={"build_ext": build_ext_subclass},
|
||||||
include_dirs=include_dirs,
|
include_dirs=include_dirs,
|
||||||
package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
|
package_data={"": ["*.pyx", "*.pxd", "*.pxi"]},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0rc2"
|
__version__ = "3.0.0rc3"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -272,7 +272,11 @@ def show_validation_error(
|
||||||
msg.fail(title)
|
msg.fail(title)
|
||||||
print(err.text.strip())
|
print(err.text.strip())
|
||||||
if hint_fill and "value_error.missing" in err.error_types:
|
if hint_fill and "value_error.missing" in err.error_types:
|
||||||
config_path = file_path if file_path is not None else "config.cfg"
|
config_path = (
|
||||||
|
file_path
|
||||||
|
if file_path is not None and str(file_path) != "-"
|
||||||
|
else "config.cfg"
|
||||||
|
)
|
||||||
msg.text(
|
msg.text(
|
||||||
"If your config contains missing values, you can run the 'init "
|
"If your config contains missing values, you can run the 'init "
|
||||||
"fill-config' command to fill in all the defaults, if possible:",
|
"fill-config' command to fill in all the defaults, if possible:",
|
||||||
|
|
|
@ -5,6 +5,7 @@ from wasabi import Printer
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import itertools
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
|
@ -130,15 +131,16 @@ def convert(
|
||||||
)
|
)
|
||||||
doc_files.append((input_loc, docs))
|
doc_files.append((input_loc, docs))
|
||||||
if concatenate:
|
if concatenate:
|
||||||
all_docs = []
|
all_docs = itertools.chain.from_iterable([docs for _, docs in doc_files])
|
||||||
for _, docs in doc_files:
|
|
||||||
all_docs.extend(docs)
|
|
||||||
doc_files = [(input_path, all_docs)]
|
doc_files = [(input_path, all_docs)]
|
||||||
for input_loc, docs in doc_files:
|
for input_loc, docs in doc_files:
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
data = [docs_to_json(docs)]
|
data = [docs_to_json(docs)]
|
||||||
|
len_docs = len(data)
|
||||||
else:
|
else:
|
||||||
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
db = DocBin(docs=docs, store_user_data=True)
|
||||||
|
len_docs = len(db)
|
||||||
|
data = db.to_bytes()
|
||||||
if output_dir == "-":
|
if output_dir == "-":
|
||||||
_print_docs_to_stdout(data, file_type)
|
_print_docs_to_stdout(data, file_type)
|
||||||
else:
|
else:
|
||||||
|
@ -149,7 +151,7 @@ def convert(
|
||||||
output_file = Path(output_dir) / input_loc.parts[-1]
|
output_file = Path(output_dir) / input_loc.parts[-1]
|
||||||
output_file = output_file.with_suffix(f".{file_type}")
|
output_file = output_file.with_suffix(f".{file_type}")
|
||||||
_write_docs_to_file(data, output_file, file_type)
|
_write_docs_to_file(data, output_file, file_type)
|
||||||
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
|
msg.good(f"Generated output file ({len_docs} documents): {output_file}")
|
||||||
|
|
||||||
|
|
||||||
def _print_docs_to_stdout(data: Any, output_type: str) -> None:
|
def _print_docs_to_stdout(data: Any, output_type: str) -> None:
|
||||||
|
|
|
@ -7,7 +7,7 @@ import typer
|
||||||
|
|
||||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
||||||
from ._util import import_code, debug_cli
|
from ._util import import_code, debug_cli
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ from .. import util
|
||||||
def debug_config_cli(
|
def debug_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
|
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
|
||||||
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
||||||
|
@ -55,6 +55,11 @@ def debug_config(
|
||||||
config = util.load_config(config_path, overrides=overrides)
|
config = util.load_config(config_path, overrides=overrides)
|
||||||
nlp = util.load_model_from_config(config)
|
nlp = util.load_model_from_config(config)
|
||||||
config = nlp.config.interpolate()
|
config = nlp.config.interpolate()
|
||||||
|
msg.divider("Config validation for [initialize]")
|
||||||
|
with show_validation_error(config_path):
|
||||||
|
T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||||
|
msg.divider("Config validation for [training]")
|
||||||
|
with show_validation_error(config_path):
|
||||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
util.resolve_dot_names(config, dot_names)
|
util.resolve_dot_names(config, dot_names)
|
||||||
|
|
|
@ -12,6 +12,7 @@ from ..training import Example
|
||||||
from ..training.initialize import get_sourced_components
|
from ..training.initialize import get_sourced_components
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
|
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@ -37,7 +38,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
||||||
def debug_data_cli(
|
def debug_data_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
||||||
|
@ -383,7 +384,10 @@ def debug_data(
|
||||||
# rare labels in projectivized train
|
# rare labels in projectivized train
|
||||||
rare_projectivized_labels = []
|
rare_projectivized_labels = []
|
||||||
for label in gold_train_data["deps"]:
|
for label in gold_train_data["deps"]:
|
||||||
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
|
if (
|
||||||
|
gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
|
||||||
|
and DELIMITER in label
|
||||||
|
):
|
||||||
rare_projectivized_labels.append(
|
rare_projectivized_labels.append(
|
||||||
f"{label}: {gold_train_data['deps'][label]}"
|
f"{label}: {gold_train_data['deps'][label]}"
|
||||||
)
|
)
|
||||||
|
@ -504,13 +508,18 @@ def _compile_gold(
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
gold = eg.reference
|
gold = eg.reference
|
||||||
doc = eg.predicted
|
doc = eg.predicted
|
||||||
valid_words = [x for x in gold if x is not None]
|
valid_words = [x.text for x in gold]
|
||||||
data["words"].update(valid_words)
|
data["words"].update(valid_words)
|
||||||
data["n_words"] += len(valid_words)
|
data["n_words"] += len(valid_words)
|
||||||
data["n_misaligned_words"] += len(gold) - len(valid_words)
|
align = eg.alignment
|
||||||
|
for token in doc:
|
||||||
|
if token.orth_.isspace():
|
||||||
|
continue
|
||||||
|
if align.x2y.lengths[token.i] != 1:
|
||||||
|
data["n_misaligned_words"] += 1
|
||||||
data["texts"].add(doc.text)
|
data["texts"].add(doc.text)
|
||||||
if len(nlp.vocab.vectors):
|
if len(nlp.vocab.vectors):
|
||||||
for word in valid_words:
|
for word in [t.text for t in doc]:
|
||||||
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
||||||
data["words_missing_vectors"].update([word])
|
data["words_missing_vectors"].update([word])
|
||||||
if "ner" in factory_names:
|
if "ner" in factory_names:
|
||||||
|
|
|
@ -22,7 +22,7 @@ from .. import util
|
||||||
def debug_model_cli(
|
def debug_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
|
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
|
||||||
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
|
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
|
||||||
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
||||||
|
|
|
@ -35,7 +35,10 @@ def download_cli(
|
||||||
|
|
||||||
|
|
||||||
def download(model: str, direct: bool = False, *pip_args) -> None:
|
def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
if not (is_package("spacy") or is_package("spacy-nightly")) and "--no-deps" not in pip_args:
|
if (
|
||||||
|
not (is_package("spacy") or is_package("spacy-nightly"))
|
||||||
|
and "--no-deps" not in pip_args
|
||||||
|
):
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Skipping pipeline package dependencies and setting `--no-deps`. "
|
"Skipping pipeline package dependencies and setting `--no-deps`. "
|
||||||
"You don't seem to have the spaCy package itself installed "
|
"You don't seem to have the spaCy package itself installed "
|
||||||
|
|
|
@ -172,7 +172,9 @@ def render_parses(
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None:
|
def print_prf_per_type(
|
||||||
|
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
|
||||||
|
) -> None:
|
||||||
data = [
|
data = [
|
||||||
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
||||||
for k, v in scores.items()
|
for k, v in scores.items()
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from typing import Optional, Dict, Any, Union
|
from typing import Optional, Dict, Any, Union, List
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, MarkdownRenderer
|
from wasabi import Printer, MarkdownRenderer
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt, string_to_list
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
@ -15,20 +15,26 @@ def info_cli(
|
||||||
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
||||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||||
|
exclude: Optional[str] = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Print info about spaCy installation. If a pipeline is speficied as an argument,
|
Print info about spaCy installation. If a pipeline is specified as an argument,
|
||||||
print its meta information. Flag --markdown prints details in Markdown for easy
|
print its meta information. Flag --markdown prints details in Markdown for easy
|
||||||
copy-pasting to GitHub issues.
|
copy-pasting to GitHub issues.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#info
|
DOCS: https://nightly.spacy.io/api/cli#info
|
||||||
"""
|
"""
|
||||||
info(model, markdown=markdown, silent=silent)
|
exclude = string_to_list(exclude)
|
||||||
|
info(model, markdown=markdown, silent=silent, exclude=exclude)
|
||||||
|
|
||||||
|
|
||||||
def info(
|
def info(
|
||||||
model: Optional[str] = None, *, markdown: bool = False, silent: bool = True
|
model: Optional[str] = None,
|
||||||
|
*,
|
||||||
|
markdown: bool = False,
|
||||||
|
silent: bool = True,
|
||||||
|
exclude: List[str],
|
||||||
) -> Union[str, dict]:
|
) -> Union[str, dict]:
|
||||||
msg = Printer(no_print=silent, pretty=not silent)
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
if model:
|
if model:
|
||||||
|
@ -42,13 +48,13 @@ def info(
|
||||||
data["Pipelines"] = ", ".join(
|
data["Pipelines"] = ", ".join(
|
||||||
f"{n} ({v})" for n, v in data["Pipelines"].items()
|
f"{n} ({v})" for n, v in data["Pipelines"].items()
|
||||||
)
|
)
|
||||||
markdown_data = get_markdown(data, title=title)
|
markdown_data = get_markdown(data, title=title, exclude=exclude)
|
||||||
if markdown:
|
if markdown:
|
||||||
if not silent:
|
if not silent:
|
||||||
print(markdown_data)
|
print(markdown_data)
|
||||||
return markdown_data
|
return markdown_data
|
||||||
if not silent:
|
if not silent:
|
||||||
table_data = dict(data)
|
table_data = {k: v for k, v in data.items() if k not in exclude}
|
||||||
msg.table(table_data, title=title)
|
msg.table(table_data, title=title)
|
||||||
return raw_data
|
return raw_data
|
||||||
|
|
||||||
|
@ -82,7 +88,7 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
||||||
if util.is_package(model):
|
if util.is_package(model):
|
||||||
model_path = util.get_package_path(model)
|
model_path = util.get_package_path(model)
|
||||||
else:
|
else:
|
||||||
model_path = model
|
model_path = Path(model)
|
||||||
meta_path = model_path / "meta.json"
|
meta_path = model_path / "meta.json"
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||||
|
@ -96,7 +102,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
|
def get_markdown(
|
||||||
|
data: Dict[str, Any], title: Optional[str] = None, exclude: List[str] = None
|
||||||
|
) -> str:
|
||||||
"""Get data in GitHub-flavoured Markdown format for issues etc.
|
"""Get data in GitHub-flavoured Markdown format for issues etc.
|
||||||
|
|
||||||
data (dict or list of tuples): Label/value pairs.
|
data (dict or list of tuples): Label/value pairs.
|
||||||
|
@ -108,7 +116,15 @@ def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
|
||||||
md.add(md.title(2, title))
|
md.add(md.title(2, title))
|
||||||
items = []
|
items = []
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
if isinstance(value, str) and Path(value).exists():
|
if exclude and key in exclude:
|
||||||
|
continue
|
||||||
|
if isinstance(value, str):
|
||||||
|
try:
|
||||||
|
existing_path = Path(value).exists()
|
||||||
|
except Exception:
|
||||||
|
# invalid Path, like a URL string
|
||||||
|
existing_path = False
|
||||||
|
if existing_path:
|
||||||
continue
|
continue
|
||||||
items.append(f"{md.bold(f'{key}:')} {value}")
|
items.append(f"{md.bold(f'{key}:')} {value}")
|
||||||
md.add(md.list(items))
|
md.add(md.list(items))
|
||||||
|
|
|
@ -30,8 +30,9 @@ def init_config_cli(
|
||||||
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||||
|
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -45,14 +46,22 @@ def init_config_cli(
|
||||||
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
||||||
optimize = optimize.value
|
optimize = optimize.value
|
||||||
pipeline = string_to_list(pipeline)
|
pipeline = string_to_list(pipeline)
|
||||||
init_config(
|
is_stdout = str(output_file) == "-"
|
||||||
output_file,
|
if not is_stdout and output_file.exists() and not force_overwrite:
|
||||||
|
msg = Printer()
|
||||||
|
msg.fail(
|
||||||
|
"The provided output file already exists. To force overwriting the config file, set the --force or -F flag.",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
config = init_config(
|
||||||
lang=lang,
|
lang=lang,
|
||||||
pipeline=pipeline,
|
pipeline=pipeline,
|
||||||
optimize=optimize,
|
optimize=optimize,
|
||||||
cpu=cpu,
|
gpu=gpu,
|
||||||
pretraining=pretraining,
|
pretraining=pretraining,
|
||||||
|
silent=is_stdout,
|
||||||
)
|
)
|
||||||
|
save_config(config, output_file, is_stdout=is_stdout)
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("fill-config")
|
@init_cli.command("fill-config")
|
||||||
|
@ -118,16 +127,15 @@ def fill_config(
|
||||||
|
|
||||||
|
|
||||||
def init_config(
|
def init_config(
|
||||||
output_file: Path,
|
|
||||||
*,
|
*,
|
||||||
lang: str,
|
lang: str,
|
||||||
pipeline: List[str],
|
pipeline: List[str],
|
||||||
optimize: str,
|
optimize: str,
|
||||||
cpu: bool,
|
gpu: bool,
|
||||||
pretraining: bool = False,
|
pretraining: bool = False,
|
||||||
) -> None:
|
silent: bool = True,
|
||||||
is_stdout = str(output_file) == "-"
|
) -> Config:
|
||||||
msg = Printer(no_print=is_stdout)
|
msg = Printer(no_print=silent)
|
||||||
with TEMPLATE_PATH.open("r") as f:
|
with TEMPLATE_PATH.open("r") as f:
|
||||||
template = Template(f.read())
|
template = Template(f.read())
|
||||||
# Filter out duplicates since tok2vec and transformer are added by template
|
# Filter out duplicates since tok2vec and transformer are added by template
|
||||||
|
@ -137,7 +145,7 @@ def init_config(
|
||||||
"lang": lang,
|
"lang": lang,
|
||||||
"components": pipeline,
|
"components": pipeline,
|
||||||
"optimize": optimize,
|
"optimize": optimize,
|
||||||
"hardware": "cpu" if cpu else "gpu",
|
"hardware": "gpu" if gpu else "cpu",
|
||||||
"transformer_data": reco["transformer"],
|
"transformer_data": reco["transformer"],
|
||||||
"word_vectors": reco["word_vectors"],
|
"word_vectors": reco["word_vectors"],
|
||||||
"has_letters": reco["has_letters"],
|
"has_letters": reco["has_letters"],
|
||||||
|
@ -161,7 +169,7 @@ def init_config(
|
||||||
"Hardware": variables["hardware"].upper(),
|
"Hardware": variables["hardware"].upper(),
|
||||||
"Transformer": template_vars.transformer.get("name", False),
|
"Transformer": template_vars.transformer.get("name", False),
|
||||||
}
|
}
|
||||||
msg.info("Generated template specific for your use case")
|
msg.info("Generated config template specific for your use case")
|
||||||
for label, value in use_case.items():
|
for label, value in use_case.items():
|
||||||
msg.text(f"- {label}: {value}")
|
msg.text(f"- {label}: {value}")
|
||||||
with show_validation_error(hint_fill=False):
|
with show_validation_error(hint_fill=False):
|
||||||
|
@ -173,7 +181,7 @@ def init_config(
|
||||||
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
|
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
|
||||||
config = pretrain_config.merge(config)
|
config = pretrain_config.merge(config)
|
||||||
msg.good("Auto-filled config with all values")
|
msg.good("Auto-filled config with all values")
|
||||||
save_config(config, output_file, is_stdout=is_stdout)
|
return config
|
||||||
|
|
||||||
|
|
||||||
def save_config(
|
def save_config(
|
||||||
|
|
|
@ -62,7 +62,7 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
|
||||||
def init_pipeline_cli(
|
def init_pipeline_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
output_path: Path = Arg(..., help="Output directory for the prepared data"),
|
output_path: Path = Arg(..., help="Output directory for the prepared data"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
@ -88,7 +88,7 @@ def init_pipeline_cli(
|
||||||
def init_labels_cli(
|
def init_labels_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
output_path: Path = Arg(..., help="Output directory for the labels"),
|
output_path: Path = Arg(..., help="Output directory for the labels"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Union, Any, Dict
|
from typing import Optional, Union, Any, Dict, List
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, get_raw_input
|
from wasabi import Printer, get_raw_input
|
||||||
|
@ -16,6 +16,7 @@ def package_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
||||||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||||
|
code_paths: Optional[str] = Opt(None, "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
|
||||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
|
||||||
|
@ -33,12 +34,22 @@ def package_cli(
|
||||||
After packaging, "python setup.py sdist" is run in the package directory,
|
After packaging, "python setup.py sdist" is run in the package directory,
|
||||||
which will create a .tar.gz archive that can be installed via "pip install".
|
which will create a .tar.gz archive that can be installed via "pip install".
|
||||||
|
|
||||||
|
If additional code files are provided (e.g. Python files containing custom
|
||||||
|
registered functions like pipeline components), they are copied into the
|
||||||
|
package and imported in the __init__.py.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#package
|
DOCS: https://nightly.spacy.io/api/cli#package
|
||||||
"""
|
"""
|
||||||
|
code_paths = (
|
||||||
|
[Path(p.strip()) for p in code_paths.split(",")]
|
||||||
|
if code_paths is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
package(
|
package(
|
||||||
input_dir,
|
input_dir,
|
||||||
output_dir,
|
output_dir,
|
||||||
meta_path=meta_path,
|
meta_path=meta_path,
|
||||||
|
code_paths=code_paths,
|
||||||
name=name,
|
name=name,
|
||||||
version=version,
|
version=version,
|
||||||
create_meta=create_meta,
|
create_meta=create_meta,
|
||||||
|
@ -52,6 +63,7 @@ def package(
|
||||||
input_dir: Path,
|
input_dir: Path,
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
meta_path: Optional[Path] = None,
|
meta_path: Optional[Path] = None,
|
||||||
|
code_paths: List[Path] = [],
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
version: Optional[str] = None,
|
version: Optional[str] = None,
|
||||||
create_meta: bool = False,
|
create_meta: bool = False,
|
||||||
|
@ -67,6 +79,14 @@ def package(
|
||||||
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
msg.fail("Output directory not found", output_path, exits=1)
|
msg.fail("Output directory not found", output_path, exits=1)
|
||||||
|
for code_path in code_paths:
|
||||||
|
if not code_path.exists():
|
||||||
|
msg.fail("Can't find code file", code_path, exits=1)
|
||||||
|
# Import the code here so it's available when model is loaded (via
|
||||||
|
# get_meta helper). Also verifies that everything works
|
||||||
|
util.import_file(code_path.stem, code_path)
|
||||||
|
if code_paths:
|
||||||
|
msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||||
meta_path = meta_path or input_dir / "meta.json"
|
meta_path = meta_path or input_dir / "meta.json"
|
||||||
|
@ -106,10 +126,17 @@ def package(
|
||||||
license_path = package_path / model_name_v / "LICENSE"
|
license_path = package_path / model_name_v / "LICENSE"
|
||||||
if license_path.exists():
|
if license_path.exists():
|
||||||
shutil.move(str(license_path), str(main_path))
|
shutil.move(str(license_path), str(main_path))
|
||||||
|
imports = []
|
||||||
|
for code_path in code_paths:
|
||||||
|
imports.append(code_path.stem)
|
||||||
|
shutil.copy(str(code_path), str(package_path))
|
||||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
init_py = TEMPLATE_INIT.format(
|
||||||
|
imports="\n".join(f"from . import {m}" for m in imports)
|
||||||
|
)
|
||||||
|
create_file(package_path / "__init__.py", init_py)
|
||||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||||
if create_sdist:
|
if create_sdist:
|
||||||
with util.working_dir(main_path):
|
with util.working_dir(main_path):
|
||||||
|
@ -249,6 +276,7 @@ TEMPLATE_INIT = """
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from spacy.util import load_model_from_init_py, get_model_meta
|
from spacy.util import load_model_from_init_py, get_model_meta
|
||||||
|
|
||||||
|
{imports}
|
||||||
|
|
||||||
__version__ = get_model_meta(Path(__file__).parent)['version']
|
__version__ = get_model_meta(Path(__file__).parent)['version']
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ from ..util import load_config
|
||||||
def pretrain_cli(
|
def pretrain_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
|
||||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||||
|
@ -79,7 +79,7 @@ def pretrain_cli(
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||||
if resume_path:
|
if resume_path:
|
||||||
|
|
|
@ -19,6 +19,7 @@ lang = "{{ lang }}"
|
||||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
||||||
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
||||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
||||||
|
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
||||||
|
|
||||||
[components]
|
[components]
|
||||||
|
|
||||||
|
@ -74,7 +75,7 @@ grad_factor = 1.0
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
|
||||||
[components.parser.model]
|
[components.parser.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
state_type = "parser"
|
state_type = "parser"
|
||||||
extra_state_tokens = false
|
extra_state_tokens = false
|
||||||
hidden_width = 128
|
hidden_width = 128
|
||||||
|
@ -95,7 +96,7 @@ grad_factor = 1.0
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
|
|
||||||
[components.ner.model]
|
[components.ner.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
state_type = "ner"
|
state_type = "ner"
|
||||||
extra_state_tokens = false
|
extra_state_tokens = false
|
||||||
hidden_width = 64
|
hidden_width = 64
|
||||||
|
@ -148,13 +149,44 @@ grad_factor = 1.0
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
exclusive_classes = false
|
exclusive_classes = true
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = true
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "textcat_multilabel" in components %}
|
||||||
|
[components.textcat_multilabel]
|
||||||
|
factory = "textcat_multilabel"
|
||||||
|
|
||||||
|
{% if optimize == "accuracy" %}
|
||||||
|
[components.textcat_multilabel.model]
|
||||||
|
@architectures = "spacy.TextCatEnsemble.v2"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.textcat_multilabel.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
|
[components.textcat_multilabel.model.linear_model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
|
||||||
|
{% else -%}
|
||||||
|
[components.textcat_multilabel.model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
@ -173,7 +205,7 @@ no_output_layer = false
|
||||||
factory = "tok2vec"
|
factory = "tok2vec"
|
||||||
|
|
||||||
[components.tok2vec.model]
|
[components.tok2vec.model]
|
||||||
@architectures = "spacy.Tok2Vec.v1"
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
|
||||||
[components.tok2vec.model.embed]
|
[components.tok2vec.model.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
@ -188,7 +220,7 @@ rows = [5000, 2500]
|
||||||
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
[components.tok2vec.model.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||||
width = {{ 96 if optimize == "efficiency" else 256 }}
|
width = {{ 96 if optimize == "efficiency" else 256 }}
|
||||||
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
||||||
window_size = 1
|
window_size = 1
|
||||||
|
@ -225,7 +257,7 @@ width = ${components.tok2vec.model.encode.width}
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
|
||||||
[components.parser.model]
|
[components.parser.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
state_type = "parser"
|
state_type = "parser"
|
||||||
extra_state_tokens = false
|
extra_state_tokens = false
|
||||||
hidden_width = 128
|
hidden_width = 128
|
||||||
|
@ -243,7 +275,7 @@ width = ${components.tok2vec.model.encode.width}
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
|
|
||||||
[components.ner.model]
|
[components.ner.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
state_type = "ner"
|
state_type = "ner"
|
||||||
extra_state_tokens = false
|
extra_state_tokens = false
|
||||||
hidden_width = 64
|
hidden_width = 64
|
||||||
|
@ -287,13 +319,41 @@ width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
exclusive_classes = false
|
exclusive_classes = true
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v1"
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = true
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "textcat_multilabel" in components %}
|
||||||
|
[components.textcat_multilabel]
|
||||||
|
factory = "textcat_multilabel"
|
||||||
|
|
||||||
|
{% if optimize == "accuracy" %}
|
||||||
|
[components.textcat_multilabel.model]
|
||||||
|
@architectures = "spacy.TextCatEnsemble.v2"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.textcat_multilabel.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
|
[components.textcat_multilabel.model.linear_model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
|
||||||
|
{% else -%}
|
||||||
|
[components.textcat_multilabel.model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
@ -302,7 +362,7 @@ no_output_layer = false
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{% for pipe in components %}
|
{% for pipe in components %}
|
||||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
|
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
|
||||||
{# Other components defined by the user: we just assume they're factories #}
|
{# Other components defined by the user: we just assume they're factories #}
|
||||||
[components.{{ pipe }}]
|
[components.{{ pipe }}]
|
||||||
factory = "{{ pipe }}"
|
factory = "{{ pipe }}"
|
||||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
||||||
def train_cli(
|
def train_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
@ -41,7 +41,7 @@ def train_cli(
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
# Make sure all files and paths exists if they are needed
|
# Make sure all files and paths exists if they are needed
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if output_path is not None and not output_path.exists():
|
if output_path is not None and not output_path.exists():
|
||||||
output_path.mkdir(parents=True)
|
output_path.mkdir(parents=True)
|
||||||
|
|
|
@ -20,6 +20,8 @@ disabled = []
|
||||||
before_creation = null
|
before_creation = null
|
||||||
after_creation = null
|
after_creation = null
|
||||||
after_pipeline_creation = null
|
after_pipeline_creation = null
|
||||||
|
# Default batch size to use with nlp.pipe and nlp.evaluate
|
||||||
|
batch_size = 1000
|
||||||
|
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.Tokenizer.v1"
|
@tokenizers = "spacy.Tokenizer.v1"
|
||||||
|
@ -122,3 +124,5 @@ lookups = null
|
||||||
tokenizer = {}
|
tokenizer = {}
|
||||||
# Arguments for initialize methods of the components (keyed by component)
|
# Arguments for initialize methods of the components (keyed by component)
|
||||||
components = {}
|
components = {}
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
|
@ -119,6 +119,12 @@ class Warnings:
|
||||||
"call the {matcher} on each Doc object.")
|
"call the {matcher} on each Doc object.")
|
||||||
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
||||||
"`Doc.has_annotation(\"{attr}\")` instead.")
|
"`Doc.has_annotation(\"{attr}\")` instead.")
|
||||||
|
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
|
||||||
|
"token '{text}'. Check that your pipeline includes components that "
|
||||||
|
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
||||||
|
"'morphologizer'.")
|
||||||
|
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
|
||||||
|
"required user hooks to the doc after processing.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -457,6 +463,21 @@ class Errors:
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
|
||||||
|
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
|
||||||
|
"If you're using a custom function, make sure the code is available. "
|
||||||
|
"If the function is provided by a third-party package, e.g. "
|
||||||
|
"spacy-transformers, make sure the package is installed in your "
|
||||||
|
"environment.\n\nAvailable names: {available}")
|
||||||
|
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
|
||||||
|
"'{lang}'.")
|
||||||
|
E895 = ("The 'textcat' component received gold-standard annotations with "
|
||||||
|
"multiple labels per document. In spaCy 3 you should use the "
|
||||||
|
"'textcat_multilabel' component for this instead. "
|
||||||
|
"Example of an offending annotation: {value}")
|
||||||
|
E896 = ("There was an error using the static vectors. Ensure that the vectors "
|
||||||
|
"of the vocab are properly initialized, or set 'include_static_vectors' "
|
||||||
|
"to False.")
|
||||||
E897 = ("Field '{field}' should be a dot-notation string referring to the "
|
E897 = ("Field '{field}' should be a dot-notation string referring to the "
|
||||||
"relevant section in the config, but found type {type} instead.")
|
"relevant section in the config, but found type {type} instead.")
|
||||||
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
||||||
|
@ -717,6 +738,8 @@ class Errors:
|
||||||
"DocBin (.spacy) format. If your data is in spaCy v2's JSON "
|
"DocBin (.spacy) format. If your data is in spaCy v2's JSON "
|
||||||
"training format, convert it using `python -m spacy convert "
|
"training format, convert it using `python -m spacy convert "
|
||||||
"file.json .`.")
|
"file.json .`.")
|
||||||
|
E1015 = ("Can't initialize model from config: no {value} found. For more "
|
||||||
|
"information, run: python -m spacy debug config config.cfg")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
27
spacy/lang/am/__init__.py
Normal file
27
spacy/lang/am/__init__.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ...language import Language
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
|
class AmharicDefaults(Language.Defaults):
    # Language data bundle for Amharic ("am").

    # Start from the shared getters, layer the Amharic lexical attributes on
    # top, and set the language code last so it wins over any earlier key.
    lex_attr_getters = {
        **Language.Defaults.lex_attr_getters,
        **LEX_ATTRS,
        LANG: lambda text: "am",
    }
    # Base tokenizer exceptions merged with the Amharic-specific ones.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": True}
|
||||||
|
|
||||||
|
|
||||||
|
class Amharic(Language):
    # Language subclass for Amharic; all language data lives in the Defaults.
    Defaults = AmharicDefaults
    lang = "am"
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Amharic"]
|
18
spacy/lang/am/examples.py
Normal file
18
spacy/lang/am/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.am.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።",
|
||||||
|
"የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ",
|
||||||
|
"ሳን ፍራንሲስኮ የእግረኛ መንገድ አቅርቦት ሮቦቶችን ማገድን ይመለከታል",
|
||||||
|
"ለንደን በእንግሊዝ የምትገኝ ትልቅ ከተማ ናት።",
|
||||||
|
"የት ነህ?",
|
||||||
|
"የፈረንሳይ ፕሬዝዳንት ማናቸው?",
|
||||||
|
"የአሜሪካ ዋና ከተማ ምንድነው?",
|
||||||
|
"ባራክ ኦባማ መቼ ተወለደ?",
|
||||||
|
]
|
102
spacy/lang/am/lex_attrs.py
Normal file
102
spacy/lang/am/lex_attrs.py
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"ዜሮ",
|
||||||
|
"አንድ",
|
||||||
|
"ሁለት",
|
||||||
|
"ሶስት",
|
||||||
|
"አራት",
|
||||||
|
"አምስት",
|
||||||
|
"ስድስት",
|
||||||
|
"ሰባት",
|
||||||
|
"ስምት",
|
||||||
|
"ዘጠኝ",
|
||||||
|
"አስር",
|
||||||
|
"አስራ አንድ",
|
||||||
|
"አስራ ሁለት",
|
||||||
|
"አስራ ሶስት",
|
||||||
|
"አስራ አራት",
|
||||||
|
"አስራ አምስት",
|
||||||
|
"አስራ ስድስት",
|
||||||
|
"አስራ ሰባት",
|
||||||
|
"አስራ ስምንት",
|
||||||
|
"አስራ ዘጠኝ",
|
||||||
|
"ሃያ",
|
||||||
|
"ሰላሳ",
|
||||||
|
"አርባ",
|
||||||
|
"ሃምሳ",
|
||||||
|
"ስልሳ",
|
||||||
|
"ሰባ",
|
||||||
|
"ሰማንያ",
|
||||||
|
"ዘጠና",
|
||||||
|
"መቶ",
|
||||||
|
"ሺህ",
|
||||||
|
"ሚሊዮን",
|
||||||
|
"ቢሊዮን",
|
||||||
|
"ትሪሊዮን",
|
||||||
|
"ኳድሪሊዮን",
|
||||||
|
"ገጅሊዮን",
|
||||||
|
"ባዝሊዮን",
|
||||||
|
]
|
||||||
|
|
||||||
|
_ordinal_words = [
|
||||||
|
"አንደኛ",
|
||||||
|
"ሁለተኛ",
|
||||||
|
"ሶስተኛ",
|
||||||
|
"አራተኛ",
|
||||||
|
"አምስተኛ",
|
||||||
|
"ስድስተኛ",
|
||||||
|
"ሰባተኛ",
|
||||||
|
"ስምንተኛ",
|
||||||
|
"ዘጠነኛ",
|
||||||
|
"አስረኛ",
|
||||||
|
"አስራ አንደኛ",
|
||||||
|
"አስራ ሁለተኛ",
|
||||||
|
"አስራ ሶስተኛ",
|
||||||
|
"አስራ አራተኛ",
|
||||||
|
"አስራ አምስተኛ",
|
||||||
|
"አስራ ስድስተኛ",
|
||||||
|
"አስራ ሰባተኛ",
|
||||||
|
"አስራ ስምንተኛ",
|
||||||
|
"አስራ ዘጠነኛ",
|
||||||
|
"ሃያኛ",
|
||||||
|
"ሰላሳኛ" "አርባኛ",
|
||||||
|
"አምሳኛ",
|
||||||
|
"ስድሳኛ",
|
||||||
|
"ሰባኛ",
|
||||||
|
"ሰማንያኛ",
|
||||||
|
"ዘጠናኛ",
|
||||||
|
"መቶኛ",
|
||||||
|
"ሺኛ",
|
||||||
|
"ሚሊዮንኛ",
|
||||||
|
"ቢሊዮንኛ",
|
||||||
|
"ትሪሊዮንኛ",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
|
||||||
|
text_lower = text.lower()
|
||||||
|
if text_lower in _num_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check ordinal number
|
||||||
|
if text_lower in _ordinal_words:
|
||||||
|
return True
|
||||||
|
if text_lower.endswith("ኛ"):
|
||||||
|
if text_lower[:-2].isdigit():
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/am/punctuation.py
Normal file
19
spacy/lang/am/punctuation.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
|
# Shared punctuation extended with the Ethiopic punctuation marks.
_list_punct = LIST_PUNCT + ["፡", "።", "፣", "፤", "፥", "፦", "፧"]

# Suffix patterns: punctuation, ellipses and quotes, followed by rules that
# split "+", currency symbols and units after a digit, and a trailing period
# after two characters from ALPHA_UPPER.
_suffixes = [
    *_list_punct,
    *LIST_ELLIPSES,
    *LIST_QUOTES,
    r"(?<=[0-9])\+",
    # Amharic is written from Left-To-Right
    rf"(?<=[0-9])(?:{CURRENCY})",
    rf"(?<=[0-9])(?:{UNITS})",
    rf"(?<=[{ALPHA_UPPER}][{ALPHA_UPPER}])\.",
]

TOKENIZER_SUFFIXES = _suffixes
|
6
spacy/lang/am/stop_words.py
Normal file
6
spacy/lang/am/stop_words.py
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
# Stop words
|
||||||
|
# Amharic stop words, written out as an explicit set literal.
STOP_WORDS = {
    "ግን",
    "አንቺ",
    "አንተ",
    "እናንተ",
    "ያንተ",
    "ያንቺ",
    "የናንተ",
    "ራስህን",
    "ራስሽን",
    "ራሳችሁን",
}
|
21
spacy/lang/am/tokenizer_exceptions.py
Normal file
21
spacy/lang/am/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
from ...symbols import ORTH, NORM
|
||||||
|
|
||||||
|
|
||||||
|
_exc = {}

# Slash abbreviations kept as a single token; an entry may carry a NORM.
_exc.update(
    {
        entry[ORTH]: [entry]
        for entry in (
            {ORTH: "ት/ቤት"},
            {ORTH: "ወ/ሮ", NORM: "ወይዘሮ"},
        )
    }
)

# Dotted abbreviations kept as a single token.
_exc.update({abbr: [{ORTH: abbr}] for abbr in ("ዓ.ም.", "ኪ.ሜ.")})

TOKENIZER_EXCEPTIONS = _exc
|
|
@ -2,6 +2,8 @@ split_chars = lambda char: list(char.strip().split(" "))
|
||||||
merge_chars = lambda char: char.strip().replace(" ", "|")
|
merge_chars = lambda char: char.strip().replace(" ", "|")
|
||||||
group_chars = lambda char: char.strip().replace(" ", "")
|
group_chars = lambda char: char.strip().replace(" ", "")
|
||||||
|
|
||||||
|
_ethiopic = r"\u1200-\u137F"
|
||||||
|
|
||||||
_bengali = r"\u0980-\u09FF"
|
_bengali = r"\u0980-\u09FF"
|
||||||
|
|
||||||
_hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
|
_hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
|
||||||
|
@ -210,11 +212,30 @@ _ukrainian_lower = r"а-щюяіїєґ"
|
||||||
_ukrainian_upper = r"А-ЩЮЯІЇЄҐ"
|
_ukrainian_upper = r"А-ЩЮЯІЇЄҐ"
|
||||||
_ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
|
_ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
|
||||||
|
|
||||||
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
|
_macedonian_lower = r"ѓѕјљњќѐѝ"
|
||||||
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
|
_macedonian_upper = r"ЃЅЈЉЊЌЀЍ"
|
||||||
|
_macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ"
|
||||||
|
|
||||||
|
_upper = (
|
||||||
|
LATIN_UPPER
|
||||||
|
+ _russian_upper
|
||||||
|
+ _tatar_upper
|
||||||
|
+ _greek_upper
|
||||||
|
+ _ukrainian_upper
|
||||||
|
+ _macedonian_upper
|
||||||
|
)
|
||||||
|
_lower = (
|
||||||
|
LATIN_LOWER
|
||||||
|
+ _russian_lower
|
||||||
|
+ _tatar_lower
|
||||||
|
+ _greek_lower
|
||||||
|
+ _ukrainian_lower
|
||||||
|
+ _macedonian_lower
|
||||||
|
)
|
||||||
|
|
||||||
_uncased = (
|
_uncased = (
|
||||||
_bengali
|
_ethiopic
|
||||||
|
+ _bengali
|
||||||
+ _hebrew
|
+ _hebrew
|
||||||
+ _persian
|
+ _persian
|
||||||
+ _sinhala
|
+ _sinhala
|
||||||
|
@ -226,7 +247,9 @@ _uncased = (
|
||||||
+ _cjk
|
+ _cjk
|
||||||
)
|
)
|
||||||
|
|
||||||
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
|
ALPHA = group_chars(
|
||||||
|
LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased
|
||||||
|
)
|
||||||
ALPHA_LOWER = group_chars(_lower + _uncased)
|
ALPHA_LOWER = group_chars(_lower + _uncased)
|
||||||
ALPHA_UPPER = group_chars(_upper + _uncased)
|
ALPHA_UPPER = group_chars(_upper + _uncased)
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@ from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(Language.Defaults):
|
class CzechDefaults(Language.Defaults):
|
||||||
stop_words = STOP_WORDS
|
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class Czech(Language):
|
class Czech(Language):
|
||||||
|
|
|
@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,6 +12,7 @@ class DanishDefaults(Language.Defaults):
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class Danish(Language):
|
class Danish(Language):
|
||||||
|
|
71
spacy/lang/da/syntax_iterators.py
Normal file
71
spacy/lang/da/syntax_iterators.py
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(doclike):
    """Yield noun-phrase boundaries from a dependency-parsed Doc or Span.

    doclike: Object exposing ``.doc`` and iterating over tokens.
    YIELDS: ``(start, end, label)`` tuples, where ``start``/``end`` are token
        indices (end exclusive) and ``label`` is the string-store ID of "NP".
    RAISES: ValueError (Errors.E029) if the doc has no "DEP" annotation.
    """

    def is_verb_token(tok):
        return tok.pos in [VERB, AUX]

    def get_left_bound(doc, root):
        # The first left child with an NP-internal dependency starts the
        # phrase; without one the phrase starts at the root itself.
        return next((tok for tok in root.lefts if tok.dep in np_left_deps), root)

    def get_right_bound(doc, root):
        right_bound = root
        for child in root.rights:
            if child.dep not in np_right_deps:
                continue
            candidate = get_right_bound(doc, child)
            # Stop extending as soon as a verb or a stop dependency sits
            # between the root and the candidate boundary.
            blocked = any(
                is_verb_token(t) or t.dep in stop_deps
                for t in doc[root.i : candidate.i]
            )
            if blocked:
                break
            right_bound = candidate
        return right_bound

    doc = doclike.doc

    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    if len(doc) == 0:
        return

    left_labels = [
        "det",
        "fixed",
        "nmod:poss",
        "amod",
        "flat",
        "goeswith",
        "nummod",
        "appos",
    ]
    right_labels = ["fixed", "nmod:poss", "amod", "flat", "goeswith", "nummod", "appos"]
    stop_labels = ["punct"]

    # Convert the label strings to string-store IDs once, for comparison
    # against token.dep.
    strings = doc.vocab.strings
    np_label = strings.add("NP")
    np_left_deps = [strings.add(label) for label in left_labels]
    np_right_deps = [strings.add(label) for label in right_labels]
    stop_deps = [strings.add(label) for label in stop_labels]

    prev_right = -1
    for token in doclike:
        if token.pos not in [PROPN, NOUN, PRON]:
            continue
        left = get_left_bound(doc, token)
        right = get_right_bound(doc, token)
        # Skip candidates that start inside the previously yielded chunk.
        if left.i <= prev_right:
            continue
        yield left.i, right.i + 1, np_label
        prev_right = right.i
|
||||||
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -6,10 +6,21 @@ from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""
|
||||||
# fmt: off
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
|
"""
|
||||||
# fmt: on
|
labels = [
|
||||||
|
"oprd",
|
||||||
|
"nsubj",
|
||||||
|
"dobj",
|
||||||
|
"nsubjpass",
|
||||||
|
"pcomp",
|
||||||
|
"pobj",
|
||||||
|
"dative",
|
||||||
|
"appos",
|
||||||
|
"attr",
|
||||||
|
"ROOT",
|
||||||
|
]
|
||||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
if not doc.has_annotation("DEP"):
|
if not doc.has_annotation("DEP"):
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
|
|
@ -330,7 +330,6 @@ for exc_data in [
|
||||||
# Other contractions with leading apostrophe
|
# Other contractions with leading apostrophe
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{ORTH: "cause", NORM: "because"},
|
|
||||||
{ORTH: "em", NORM: "them"},
|
{ORTH: "em", NORM: "them"},
|
||||||
{ORTH: "ll", NORM: "will"},
|
{ORTH: "ll", NORM: "will"},
|
||||||
{ORTH: "nuff", NORM: "enough"},
|
{ORTH: "nuff", NORM: "enough"},
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Union, Iterator, Optional, List, Tuple
|
from typing import Union, Iterator
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
@ -19,34 +19,24 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||||
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
||||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
||||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
||||||
|
|
||||||
|
prev_right = -1
|
||||||
for token in doclike:
|
for token in doclike:
|
||||||
if token.pos in [PROPN, NOUN, PRON]:
|
if token.pos in [PROPN, NOUN, PRON]:
|
||||||
left, right = noun_bounds(
|
left, right = noun_bounds(
|
||||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
)
|
)
|
||||||
|
if left.i <= prev_right:
|
||||||
|
continue
|
||||||
yield left.i, right.i + 1, np_label
|
yield left.i, right.i + 1, np_label
|
||||||
token = right
|
prev_right = right.i
|
||||||
token = next_token(token)
|
|
||||||
|
|
||||||
|
|
||||||
def is_verb_token(token: Token) -> bool:
|
def is_verb_token(token: Token) -> bool:
|
||||||
return token.pos in [VERB, AUX]
|
return token.pos in [VERB, AUX]
|
||||||
|
|
||||||
|
|
||||||
def next_token(token: Token) -> Optional[Token]:
|
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
||||||
try:
|
|
||||||
return token.nbor()
|
|
||||||
except IndexError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def noun_bounds(
|
|
||||||
doc: Doc,
|
|
||||||
root: Token,
|
|
||||||
np_left_deps: List[str],
|
|
||||||
np_right_deps: List[str],
|
|
||||||
stop_deps: List[str],
|
|
||||||
) -> Tuple[Token, Token]:
|
|
||||||
left_bound = root
|
left_bound = root
|
||||||
for token in reversed(list(root.lefts)):
|
for token in reversed(list(root.lefts)):
|
||||||
if token.dep in np_left_deps:
|
if token.dep in np_left_deps:
|
||||||
|
|
|
@ -1,86 +1,80 @@
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
|
a à â abord afin ah ai aie ainsi ait allaient allons
|
||||||
allô alors anterieur anterieure anterieures apres après as assez attendu au
|
alors anterieur anterieure anterieures apres après as assez attendu au
|
||||||
aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
|
aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
|
||||||
aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient
|
aussi autre autrement autres autrui aux auxquelles auxquels avaient
|
||||||
avais avait avant avec avoir avons ayant
|
avais avait avant avec avoir avons ayant
|
||||||
|
|
||||||
bah bas basee bat beau beaucoup bien bigre boum bravo brrr
|
bas basee bat
|
||||||
|
|
||||||
c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui
|
c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui
|
||||||
celui-ci celui-là cent cependant certain certaine certaines certains certes ces
|
celui-ci celui-là cent cependant certain certaine certaines certains certes ces
|
||||||
cet cette ceux ceux-ci ceux-là chacun chacune chaque cher chers chez chiche
|
cet cette ceux ceux-ci ceux-là chacun chacune chaque chez ci cinq cinquantaine cinquante
|
||||||
chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac
|
cinquantième cinquième combien comme comment compris concernant
|
||||||
clic combien comme comment comparable comparables compris concernant contre
|
|
||||||
couic crac
|
|
||||||
|
|
||||||
d' d’ da dans de debout dedans dehors deja delà depuis dernier derniere derriere
|
d' d’ da dans de debout dedans dehors deja delà depuis derriere
|
||||||
derrière des desormais desquelles desquels dessous dessus deux deuxième
|
derrière des desormais desquelles desquels dessous dessus deux deuxième
|
||||||
deuxièmement devant devers devra different differentes differents différent
|
deuxièmement devant devers devra different differentes differents différent
|
||||||
différente différentes différents dire directe directement dit dite dits divers
|
différente différentes différents dire directe directement dit dite dits divers
|
||||||
diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
|
diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
|
||||||
douze douzième dring du duquel durant dès désormais
|
douze douzième du duquel durant dès désormais
|
||||||
|
|
||||||
effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
|
effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
|
||||||
enfin entre envers environ es ès est et etaient étaient etais étais etait était
|
enfin entre envers environ es ès est et etaient étaient etais étais etait était
|
||||||
etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso
|
etant étant etc été etre être eu eux eux-mêmes exactement excepté
|
||||||
exterieur
|
|
||||||
|
|
||||||
fais faisaient faisant fait façon feront fi flac floc font
|
fais faisaient faisant fait façon feront font
|
||||||
|
|
||||||
gens
|
gens
|
||||||
|
|
||||||
ha hein hem hep hi ho holà hop hormis hors hou houp hue hui huit huitième hum
|
ha hem hep hi ho hormis hors hou houp hue hui huit huitième
|
||||||
hurrah hé hélas i il ils importe
|
hé i il ils importe
|
||||||
|
|
||||||
j' j’ je jusqu jusque juste
|
j' j’ je jusqu jusque juste
|
||||||
|
|
||||||
l' l’ la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps
|
l' l’ la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps
|
||||||
lors lorsque lui lui-meme lui-même là lès
|
lors lorsque lui lui-meme lui-même là lès
|
||||||
|
|
||||||
m' m’ ma maint maintenant mais malgre malgré maximale me meme memes merci mes mien
|
m' m’ ma maint maintenant mais malgre me meme memes merci mes mien
|
||||||
mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins
|
mienne miennes miens mille moi moi-meme moi-même moindres moins
|
||||||
mon moyennant même mêmes
|
mon même mêmes
|
||||||
|
|
||||||
n' n’ na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf
|
n' n’ na ne neanmoins neuvième ni nombreuses nombreux nos notamment
|
||||||
neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau
|
notre nous nous-mêmes nouvea nul néanmoins nôtre nôtres
|
||||||
nul néanmoins nôtre nôtres
|
|
||||||
|
|
||||||
o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre
|
o ô on ont onze onzième ore ou ouias oust outre
|
||||||
ouvert ouverte ouverts où
|
ouvert ouverte ouverts où
|
||||||
|
|
||||||
paf pan par parce parfois parle parlent parler parmi parseme partant
|
par parce parfois parle parlent parler parmi parseme partant
|
||||||
particulier particulière particulièrement pas passé pendant pense permet
|
pas pendant pense permet personne peu peut peuvent peux plus
|
||||||
personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus
|
plusieurs plutôt possible possibles pour pourquoi
|
||||||
plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi
|
|
||||||
pourrais pourrait pouvait prealable precisement premier première premièrement
|
pourrais pourrait pouvait prealable precisement premier première premièrement
|
||||||
pres probable probante procedant proche près psitt pu puis puisque pur pure
|
pres procedant proche près pu puis puisque
|
||||||
|
|
||||||
qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
|
qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
|
||||||
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
|
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
|
||||||
quelques quels qui quiconque quinze quoi quoique
|
quelques quels qui quiconque quinze quoi quoique
|
||||||
|
|
||||||
rare rarement rares relative relativement remarquable rend rendre restant reste
|
relative relativement rend rendre restant reste
|
||||||
restent restrictif retour revoici revoilà rien
|
restent retour revoici revoilà
|
||||||
|
|
||||||
s' s’ sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient
|
s' s’ sa sait sans sauf se seize selon semblable semblaient
|
||||||
semble semblent sent sept septième sera seraient serait seront ses seul seule
|
semble semblent sent sept septième sera seraient serait seront ses seul seule
|
||||||
seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
|
seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
|
||||||
soixante son sont sous souvent specifique specifiques speculatif stop
|
soixante son sont sous souvent specifique specifiques stop
|
||||||
strictement subtiles suffisant suffisante suffit suis suit suivant suivante
|
suffisant suffisante suffit suis suit suivant suivante
|
||||||
suivantes suivants suivre superpose sur surtout
|
suivantes suivants suivre sur surtout
|
||||||
|
|
||||||
t' t’ ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente
|
t' t’ ta tant te tel telle tellement telles tels tenant tend tenir tente
|
||||||
tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous
|
tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous
|
||||||
tout toute toutefois toutes treize trente tres trois troisième troisièmement
|
tout toute toutes treize trente tres trois troisième troisièmement
|
||||||
trop très tsoin tsouin tu té
|
tu té
|
||||||
|
|
||||||
un une unes uniformement unique uniques uns
|
un une unes uns
|
||||||
|
|
||||||
va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos
|
va vais vas vers via vingt voici voilà vont vos
|
||||||
votre vous vous-mêmes vu vé vôtre vôtres
|
votre vous vous-mêmes vu vé vôtre vôtres
|
||||||
|
|
||||||
zut
|
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -86,7 +86,7 @@ def like_num(text):
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# CHeck ordinal number
|
# Check ordinal number
|
||||||
if text in _ordinal_words:
|
if text in _ordinal_words:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
48
spacy/lang/mk/__init__.py
Normal file
48
spacy/lang/mk/__init__.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
from typing import Optional
|
||||||
|
from thinc.api import Model
|
||||||
|
from .lemmatizer import MacedonianLemmatizer
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
|
from ...language import Language
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...util import update_exc
|
||||||
|
from ...lookups import Lookups
|
||||||
|
|
||||||
|
|
||||||
|
class MacedonianDefaults(Language.Defaults):
    # Language data bundle for Macedonian ("mk").

    # Shared getters, then the language code, then the custom lexical
    # attribute functions (e.g. like_num), which take precedence.
    lex_attr_getters = {
        **Language.Defaults.lex_attr_getters,
        LANG: lambda text: "mk",
        **LEX_ATTRS,
    }

    # Merge base exceptions and custom tokenizer exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        # NOTE(review): this passes a single positional argument, whereas the
        # "lemmatizer" factory in this module constructs MacedonianLemmatizer
        # with (nlp.vocab, model, name, mode=mode) - confirm this hook is
        # still used and that the signature matches.
        return MacedonianLemmatizer(Lookups() if lookups is None else lookups)
|
||||||
|
|
||||||
|
|
||||||
|
class Macedonian(Language):
    # Language subclass for Macedonian; all language data lives in the Defaults.
    Defaults = MacedonianDefaults
    lang = "mk"
|
||||||
|
|
||||||
|
|
||||||
|
@Macedonian.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule"},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
    """Factory for the Macedonian "lemmatizer" component.

    nlp (Language): The pipeline whose vocab the lemmatizer shares.
    model (Optional[Model]): Model passed through to the lemmatizer.
    name (str): The component name.
    mode (str): The lemmatizer mode ("rule" by default).
    RETURNS (MacedonianLemmatizer): The constructed component.
    """
    lemmatizer = MacedonianLemmatizer(nlp.vocab, model, name, mode=mode)
    return lemmatizer
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Macedonian"]
|
59
spacy/lang/mk/lemmatizer.py
Normal file
59
spacy/lang/mk/lemmatizer.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
from typing import List
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
from ...pipeline import Lemmatizer
|
||||||
|
from ...tokens import Token
|
||||||
|
|
||||||
|
|
||||||
|
class MacedonianLemmatizer(Lemmatizer):
    def rule_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token with the suffix rules, index and exceptions
        found in the lookups tables.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): Candidate lemmas; never empty.
        """
        string = token.text
        univ_pos = token.pos_.lower()
        # NOTE(review): the morphology dict is computed but not used below.
        morphology = token.morph.to_dict()

        if univ_pos in ("", "eol", "space"):
            return [string.lower()]

        # Forms ending in "јќи" are stripped of that ending and treated as verbs.
        if string.endswith("јќи"):
            string = string[:-3]
            univ_pos = "verb"

        lookups = self.lookups
        index_table = lookups.get_table("lemma_index", {})
        exc_table = lookups.get_table("lemma_exc", {})
        rules_table = lookups.get_table("lemma_rules", {})

        per_pos_data = (
            index_table.get(univ_pos),
            exc_table.get(univ_pos),
            rules_table.get(univ_pos),
        )
        if not any(per_pos_data):
            # No data for this part of speech: keep proper nouns as they
            # are, lowercase everything else.
            return [string] if univ_pos == "propn" else [string.lower()]

        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])

        orig = string
        string = string.lower()

        candidates = []
        for old, new in rules:
            if not string.endswith(old):
                continue
            form = string[: len(string) - len(old)] + new
            # Keep non-empty forms that are in the index or are not purely
            # alphabetic.
            if form and (form in index or not form.isalpha()):
                candidates.append(form)

        # De-duplicate while preserving first-seen order.
        forms = list(OrderedDict.fromkeys(candidates))
        for exc_form in exceptions.get(string, []):
            if exc_form not in forms:
                forms.insert(0, exc_form)
        if not forms:
            forms.append(orig)

        return forms
|
138
spacy/lang/mk/lex_attrs.py
Normal file
138
spacy/lang/mk/lex_attrs.py
Normal file
|
@ -0,0 +1,138 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"нула",
|
||||||
|
"еден",
|
||||||
|
"една",
|
||||||
|
"едно",
|
||||||
|
"два",
|
||||||
|
"две",
|
||||||
|
"три",
|
||||||
|
"четири",
|
||||||
|
"пет",
|
||||||
|
"шест",
|
||||||
|
"седум",
|
||||||
|
"осум",
|
||||||
|
"девет",
|
||||||
|
"десет",
|
||||||
|
"единаесет",
|
||||||
|
"дванаесет",
|
||||||
|
"тринаесет",
|
||||||
|
"четиринаесет",
|
||||||
|
"петнаесет",
|
||||||
|
"шеснаесет",
|
||||||
|
"седумнаесет",
|
||||||
|
"осумнаесет",
|
||||||
|
"деветнаесет",
|
||||||
|
"дваесет",
|
||||||
|
"триесет",
|
||||||
|
"четириесет",
|
||||||
|
"педесет",
|
||||||
|
"шеесет",
|
||||||
|
"седумдесет",
|
||||||
|
"осумдесет",
|
||||||
|
"деведесет",
|
||||||
|
"сто",
|
||||||
|
"двесте",
|
||||||
|
"триста",
|
||||||
|
"четиристотини",
|
||||||
|
"петстотини",
|
||||||
|
"шестотини",
|
||||||
|
"седумстотини",
|
||||||
|
"осумстотини",
|
||||||
|
"деветстотини",
|
||||||
|
"илјада",
|
||||||
|
"илјади",
|
||||||
|
"милион",
|
||||||
|
"милиони",
|
||||||
|
"милијарда",
|
||||||
|
"милијарди",
|
||||||
|
"билион",
|
||||||
|
"билиони",
|
||||||
|
"двајца",
|
||||||
|
"тројца",
|
||||||
|
"четворица",
|
||||||
|
"петмина",
|
||||||
|
"шестмина",
|
||||||
|
"седуммина",
|
||||||
|
"осуммина",
|
||||||
|
"деветмина",
|
||||||
|
"обата",
|
||||||
|
"обајцата",
|
||||||
|
"прв",
|
||||||
|
"втор",
|
||||||
|
"трет",
|
||||||
|
"четврт",
|
||||||
|
"седм",
|
||||||
|
"осм",
|
||||||
|
"двестоти",
|
||||||
|
"два-три",
|
||||||
|
"два-триесет",
|
||||||
|
"два-триесетмина",
|
||||||
|
"два-тринаесет",
|
||||||
|
"два-тројца",
|
||||||
|
"две-три",
|
||||||
|
"две-тристотини",
|
||||||
|
"пет-шеесет",
|
||||||
|
"пет-шеесетмина",
|
||||||
|
"пет-шеснаесетмина",
|
||||||
|
"пет-шест",
|
||||||
|
"пет-шестмина",
|
||||||
|
"пет-шестотини",
|
||||||
|
"петина",
|
||||||
|
"осмина",
|
||||||
|
"седум-осум",
|
||||||
|
"седум-осумдесет",
|
||||||
|
"седум-осуммина",
|
||||||
|
"седум-осумнаесет",
|
||||||
|
"седум-осумнаесетмина",
|
||||||
|
"три-четириесет",
|
||||||
|
"три-четиринаесет",
|
||||||
|
"шеесет",
|
||||||
|
"шеесетина",
|
||||||
|
"шеесетмина",
|
||||||
|
"шеснаесет",
|
||||||
|
"шеснаесетмина",
|
||||||
|
"шест-седум",
|
||||||
|
"шест-седумдесет",
|
||||||
|
"шест-седумнаесет",
|
||||||
|
"шест-седумстотини",
|
||||||
|
"шестоти",
|
||||||
|
"шестотини",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
|
||||||
|
text_lower = text.lower()
|
||||||
|
if text_lower in _num_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if text_lower.endswith(("а", "о", "и")):
|
||||||
|
if text_lower[:-1] in _num_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if text_lower.endswith(("ти", "та", "то", "на")):
|
||||||
|
if text_lower[:-2] in _num_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if text_lower.endswith(("ата", "иот", "ите", "ина", "чки")):
|
||||||
|
if text_lower[:-3] in _num_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if text_lower.endswith(("мина", "тина")):
|
||||||
|
if text_lower[:-4] in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
815
spacy/lang/mk/stop_words.py
Normal file
815
spacy/lang/mk/stop_words.py
Normal file
|
@ -0,0 +1,815 @@
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
а
|
||||||
|
абре
|
||||||
|
aв
|
||||||
|
аи
|
||||||
|
ако
|
||||||
|
алало
|
||||||
|
ам
|
||||||
|
ама
|
||||||
|
аман
|
||||||
|
ами
|
||||||
|
амин
|
||||||
|
априли-ли-ли
|
||||||
|
ау
|
||||||
|
аух
|
||||||
|
ауч
|
||||||
|
ах
|
||||||
|
аха
|
||||||
|
аха-ха
|
||||||
|
аш
|
||||||
|
ашколсум
|
||||||
|
ашколсун
|
||||||
|
ај
|
||||||
|
ајде
|
||||||
|
ајс
|
||||||
|
аџаба
|
||||||
|
бавно
|
||||||
|
бам
|
||||||
|
бам-бум
|
||||||
|
бап
|
||||||
|
бар
|
||||||
|
баре
|
||||||
|
барем
|
||||||
|
бау
|
||||||
|
бау-бау
|
||||||
|
баш
|
||||||
|
бај
|
||||||
|
бе
|
||||||
|
беа
|
||||||
|
бев
|
||||||
|
бевме
|
||||||
|
бевте
|
||||||
|
без
|
||||||
|
безбели
|
||||||
|
бездруго
|
||||||
|
белки
|
||||||
|
беше
|
||||||
|
би
|
||||||
|
бидејќи
|
||||||
|
бим
|
||||||
|
бис
|
||||||
|
бла
|
||||||
|
блазе
|
||||||
|
богами
|
||||||
|
божем
|
||||||
|
боц
|
||||||
|
браво
|
||||||
|
бравос
|
||||||
|
бре
|
||||||
|
бреј
|
||||||
|
брзо
|
||||||
|
бришка
|
||||||
|
бррр
|
||||||
|
бу
|
||||||
|
бум
|
||||||
|
буф
|
||||||
|
буц
|
||||||
|
бујрум
|
||||||
|
ваа
|
||||||
|
вам
|
||||||
|
варај
|
||||||
|
варда
|
||||||
|
вас
|
||||||
|
вај
|
||||||
|
ве
|
||||||
|
велат
|
||||||
|
вели
|
||||||
|
версус
|
||||||
|
веќе
|
||||||
|
ви
|
||||||
|
виа
|
||||||
|
види
|
||||||
|
вие
|
||||||
|
вистина
|
||||||
|
витос
|
||||||
|
внатре
|
||||||
|
во
|
||||||
|
воз
|
||||||
|
вон
|
||||||
|
впрочем
|
||||||
|
врв
|
||||||
|
вред
|
||||||
|
време
|
||||||
|
врз
|
||||||
|
всушност
|
||||||
|
втор
|
||||||
|
галиба
|
||||||
|
ги
|
||||||
|
гитла
|
||||||
|
го
|
||||||
|
годе
|
||||||
|
годишник
|
||||||
|
горе
|
||||||
|
гра
|
||||||
|
гуц
|
||||||
|
гљу
|
||||||
|
да
|
||||||
|
даан
|
||||||
|
дава
|
||||||
|
дал
|
||||||
|
дали
|
||||||
|
дан
|
||||||
|
два
|
||||||
|
дваесет
|
||||||
|
дванаесет
|
||||||
|
двајца
|
||||||
|
две
|
||||||
|
двесте
|
||||||
|
движам
|
||||||
|
движат
|
||||||
|
движи
|
||||||
|
движиме
|
||||||
|
движите
|
||||||
|
движиш
|
||||||
|
де
|
||||||
|
деведесет
|
||||||
|
девет
|
||||||
|
деветнаесет
|
||||||
|
деветстотини
|
||||||
|
деветти
|
||||||
|
дека
|
||||||
|
дел
|
||||||
|
делми
|
||||||
|
демек
|
||||||
|
десет
|
||||||
|
десетина
|
||||||
|
десетти
|
||||||
|
деситици
|
||||||
|
дејгиди
|
||||||
|
дејди
|
||||||
|
ди
|
||||||
|
дилми
|
||||||
|
дин
|
||||||
|
дип
|
||||||
|
дно
|
||||||
|
до
|
||||||
|
доволно
|
||||||
|
додека
|
||||||
|
додуша
|
||||||
|
докај
|
||||||
|
доколку
|
||||||
|
доправено
|
||||||
|
доправи
|
||||||
|
досамоти
|
||||||
|
доста
|
||||||
|
држи
|
||||||
|
дрн
|
||||||
|
друг
|
||||||
|
друга
|
||||||
|
другата
|
||||||
|
други
|
||||||
|
другиот
|
||||||
|
другите
|
||||||
|
друго
|
||||||
|
другото
|
||||||
|
дум
|
||||||
|
дур
|
||||||
|
дури
|
||||||
|
е
|
||||||
|
евала
|
||||||
|
еве
|
||||||
|
евет
|
||||||
|
ега
|
||||||
|
егиди
|
||||||
|
еден
|
||||||
|
едикојси
|
||||||
|
единаесет
|
||||||
|
единствено
|
||||||
|
еднаш
|
||||||
|
едно
|
||||||
|
ексик
|
||||||
|
ела
|
||||||
|
елбете
|
||||||
|
елем
|
||||||
|
ели
|
||||||
|
ем
|
||||||
|
еми
|
||||||
|
ене
|
||||||
|
ете
|
||||||
|
еурека
|
||||||
|
ех
|
||||||
|
еј
|
||||||
|
жими
|
||||||
|
жити
|
||||||
|
за
|
||||||
|
завал
|
||||||
|
заврши
|
||||||
|
зад
|
||||||
|
задека
|
||||||
|
задоволна
|
||||||
|
задржи
|
||||||
|
заедно
|
||||||
|
зар
|
||||||
|
зарад
|
||||||
|
заради
|
||||||
|
заре
|
||||||
|
зарем
|
||||||
|
затоа
|
||||||
|
зашто
|
||||||
|
згора
|
||||||
|
зема
|
||||||
|
земе
|
||||||
|
земува
|
||||||
|
зер
|
||||||
|
значи
|
||||||
|
зошто
|
||||||
|
зуј
|
||||||
|
и
|
||||||
|
иако
|
||||||
|
из
|
||||||
|
извезен
|
||||||
|
изгледа
|
||||||
|
измеѓу
|
||||||
|
износ
|
||||||
|
или
|
||||||
|
или-или
|
||||||
|
илјада
|
||||||
|
илјади
|
||||||
|
им
|
||||||
|
има
|
||||||
|
имаа
|
||||||
|
имаат
|
||||||
|
имавме
|
||||||
|
имавте
|
||||||
|
имам
|
||||||
|
имаме
|
||||||
|
имате
|
||||||
|
имаш
|
||||||
|
имаше
|
||||||
|
име
|
||||||
|
имено
|
||||||
|
именува
|
||||||
|
имплицира
|
||||||
|
имплицираат
|
||||||
|
имплицирам
|
||||||
|
имплицираме
|
||||||
|
имплицирате
|
||||||
|
имплицираш
|
||||||
|
инаку
|
||||||
|
индицира
|
||||||
|
исечок
|
||||||
|
исклучен
|
||||||
|
исклучена
|
||||||
|
исклучени
|
||||||
|
исклучено
|
||||||
|
искористен
|
||||||
|
искористена
|
||||||
|
искористени
|
||||||
|
искористено
|
||||||
|
искористи
|
||||||
|
искрај
|
||||||
|
исти
|
||||||
|
исто
|
||||||
|
итака
|
||||||
|
итн
|
||||||
|
их
|
||||||
|
иха
|
||||||
|
ихуу
|
||||||
|
иш
|
||||||
|
ишала
|
||||||
|
иј
|
||||||
|
ка
|
||||||
|
каде
|
||||||
|
кажува
|
||||||
|
како
|
||||||
|
каков
|
||||||
|
камоли
|
||||||
|
кај
|
||||||
|
ква
|
||||||
|
ки
|
||||||
|
кит
|
||||||
|
кло
|
||||||
|
клум
|
||||||
|
кога
|
||||||
|
кого
|
||||||
|
кого-годе
|
||||||
|
кое
|
||||||
|
кои
|
||||||
|
количество
|
||||||
|
количина
|
||||||
|
колку
|
||||||
|
кому
|
||||||
|
кон
|
||||||
|
користена
|
||||||
|
користени
|
||||||
|
користено
|
||||||
|
користи
|
||||||
|
кот
|
||||||
|
котрр
|
||||||
|
кош-кош
|
||||||
|
кој
|
||||||
|
која
|
||||||
|
којзнае
|
||||||
|
којшто
|
||||||
|
кр-кр-кр
|
||||||
|
крај
|
||||||
|
крек
|
||||||
|
крз
|
||||||
|
крк
|
||||||
|
крц
|
||||||
|
куку
|
||||||
|
кукуригу
|
||||||
|
куш
|
||||||
|
ле
|
||||||
|
лебами
|
||||||
|
леле
|
||||||
|
лели
|
||||||
|
ли
|
||||||
|
лиду
|
||||||
|
луп
|
||||||
|
ма
|
||||||
|
макар
|
||||||
|
малку
|
||||||
|
марш
|
||||||
|
мат
|
||||||
|
мац
|
||||||
|
машала
|
||||||
|
ме
|
||||||
|
мене
|
||||||
|
место
|
||||||
|
меѓу
|
||||||
|
меѓувреме
|
||||||
|
меѓутоа
|
||||||
|
ми
|
||||||
|
мое
|
||||||
|
може
|
||||||
|
можеби
|
||||||
|
молам
|
||||||
|
моли
|
||||||
|
мор
|
||||||
|
мора
|
||||||
|
море
|
||||||
|
мори
|
||||||
|
мразец
|
||||||
|
му
|
||||||
|
муклец
|
||||||
|
мутлак
|
||||||
|
муц
|
||||||
|
мјау
|
||||||
|
на
|
||||||
|
навидум
|
||||||
|
навистина
|
||||||
|
над
|
||||||
|
надвор
|
||||||
|
назад
|
||||||
|
накај
|
||||||
|
накрај
|
||||||
|
нали
|
||||||
|
нам
|
||||||
|
наместо
|
||||||
|
наоколу
|
||||||
|
направено
|
||||||
|
направи
|
||||||
|
напред
|
||||||
|
нас
|
||||||
|
наспоред
|
||||||
|
наспрема
|
||||||
|
наспроти
|
||||||
|
насред
|
||||||
|
натаму
|
||||||
|
натема
|
||||||
|
начин
|
||||||
|
наш
|
||||||
|
наша
|
||||||
|
наше
|
||||||
|
наши
|
||||||
|
нај
|
||||||
|
најдоцна
|
||||||
|
најмалку
|
||||||
|
најмногу
|
||||||
|
не
|
||||||
|
неа
|
||||||
|
него
|
||||||
|
негов
|
||||||
|
негова
|
||||||
|
негови
|
||||||
|
негово
|
||||||
|
незе
|
||||||
|
нека
|
||||||
|
некаде
|
||||||
|
некако
|
||||||
|
некаков
|
||||||
|
некого
|
||||||
|
некое
|
||||||
|
некои
|
||||||
|
неколку
|
||||||
|
некому
|
||||||
|
некој
|
||||||
|
некојси
|
||||||
|
нели
|
||||||
|
немој
|
||||||
|
нему
|
||||||
|
неоти
|
||||||
|
нечиј
|
||||||
|
нешто
|
||||||
|
нејзе
|
||||||
|
нејзин
|
||||||
|
нејзини
|
||||||
|
нејзино
|
||||||
|
нејсе
|
||||||
|
ни
|
||||||
|
нив
|
||||||
|
нивен
|
||||||
|
нивна
|
||||||
|
нивни
|
||||||
|
нивно
|
||||||
|
ние
|
||||||
|
низ
|
||||||
|
никаде
|
||||||
|
никако
|
||||||
|
никогаш
|
||||||
|
никого
|
||||||
|
никому
|
||||||
|
никој
|
||||||
|
ним
|
||||||
|
нити
|
||||||
|
нито
|
||||||
|
ниту
|
||||||
|
ничиј
|
||||||
|
ништо
|
||||||
|
но
|
||||||
|
нѐ
|
||||||
|
о
|
||||||
|
обр
|
||||||
|
ова
|
||||||
|
ова-она
|
||||||
|
оваа
|
||||||
|
овај
|
||||||
|
овде
|
||||||
|
овега
|
||||||
|
овие
|
||||||
|
овој
|
||||||
|
од
|
||||||
|
одавде
|
||||||
|
оди
|
||||||
|
однесува
|
||||||
|
односно
|
||||||
|
одошто
|
||||||
|
околу
|
||||||
|
олеле
|
||||||
|
олкацок
|
||||||
|
он
|
||||||
|
она
|
||||||
|
онаа
|
||||||
|
онака
|
||||||
|
онаков
|
||||||
|
онде
|
||||||
|
они
|
||||||
|
оние
|
||||||
|
оно
|
||||||
|
оној
|
||||||
|
оп
|
||||||
|
освем
|
||||||
|
освен
|
||||||
|
осем
|
||||||
|
осми
|
||||||
|
осум
|
||||||
|
осумдесет
|
||||||
|
осумнаесет
|
||||||
|
осумстотитни
|
||||||
|
отаде
|
||||||
|
оти
|
||||||
|
откако
|
||||||
|
откај
|
||||||
|
откога
|
||||||
|
отколку
|
||||||
|
оттаму
|
||||||
|
оттука
|
||||||
|
оф
|
||||||
|
ох
|
||||||
|
ој
|
||||||
|
па
|
||||||
|
пак
|
||||||
|
папа
|
||||||
|
пардон
|
||||||
|
пате-ќуте
|
||||||
|
пати
|
||||||
|
пау
|
||||||
|
паче
|
||||||
|
пеесет
|
||||||
|
пеки
|
||||||
|
пет
|
||||||
|
петнаесет
|
||||||
|
петстотини
|
||||||
|
петти
|
||||||
|
пи
|
||||||
|
пи-пи
|
||||||
|
пис
|
||||||
|
плас
|
||||||
|
плус
|
||||||
|
по
|
||||||
|
побавно
|
||||||
|
поблиску
|
||||||
|
побрзо
|
||||||
|
побуни
|
||||||
|
повеќе
|
||||||
|
повторно
|
||||||
|
под
|
||||||
|
подалеку
|
||||||
|
подолу
|
||||||
|
подоцна
|
||||||
|
подруго
|
||||||
|
позади
|
||||||
|
поинаква
|
||||||
|
поинакви
|
||||||
|
поинакво
|
||||||
|
поинаков
|
||||||
|
поинаку
|
||||||
|
покаже
|
||||||
|
покажува
|
||||||
|
покрај
|
||||||
|
полно
|
||||||
|
помалку
|
||||||
|
помеѓу
|
||||||
|
понатаму
|
||||||
|
понекогаш
|
||||||
|
понекој
|
||||||
|
поради
|
||||||
|
поразличен
|
||||||
|
поразлична
|
||||||
|
поразлични
|
||||||
|
поразлично
|
||||||
|
поседува
|
||||||
|
после
|
||||||
|
последен
|
||||||
|
последна
|
||||||
|
последни
|
||||||
|
последно
|
||||||
|
поспоро
|
||||||
|
потег
|
||||||
|
потоа
|
||||||
|
пошироко
|
||||||
|
прави
|
||||||
|
празно
|
||||||
|
прв
|
||||||
|
пред
|
||||||
|
през
|
||||||
|
преку
|
||||||
|
претежно
|
||||||
|
претходен
|
||||||
|
претходна
|
||||||
|
претходни
|
||||||
|
претходник
|
||||||
|
претходно
|
||||||
|
при
|
||||||
|
присвои
|
||||||
|
притоа
|
||||||
|
причинува
|
||||||
|
пријатно
|
||||||
|
просто
|
||||||
|
против
|
||||||
|
прр
|
||||||
|
пст
|
||||||
|
пук
|
||||||
|
пусто
|
||||||
|
пуф
|
||||||
|
пуј
|
||||||
|
пфуј
|
||||||
|
пшт
|
||||||
|
ради
|
||||||
|
различен
|
||||||
|
различна
|
||||||
|
различни
|
||||||
|
различно
|
||||||
|
разни
|
||||||
|
разоружен
|
||||||
|
разредлив
|
||||||
|
рамките
|
||||||
|
рамнообразно
|
||||||
|
растревожено
|
||||||
|
растреперено
|
||||||
|
расчувствувано
|
||||||
|
ратоборно
|
||||||
|
рече
|
||||||
|
роден
|
||||||
|
с
|
||||||
|
сакан
|
||||||
|
сам
|
||||||
|
сама
|
||||||
|
сами
|
||||||
|
самите
|
||||||
|
само
|
||||||
|
самоти
|
||||||
|
свое
|
||||||
|
свои
|
||||||
|
свој
|
||||||
|
своја
|
||||||
|
се
|
||||||
|
себе
|
||||||
|
себеси
|
||||||
|
сега
|
||||||
|
седми
|
||||||
|
седум
|
||||||
|
седумдесет
|
||||||
|
седумнаесет
|
||||||
|
седумстотини
|
||||||
|
секаде
|
||||||
|
секаков
|
||||||
|
секи
|
||||||
|
секогаш
|
||||||
|
секого
|
||||||
|
секому
|
||||||
|
секој
|
||||||
|
секојдневно
|
||||||
|
сем
|
||||||
|
сенешто
|
||||||
|
сепак
|
||||||
|
сериозен
|
||||||
|
сериозна
|
||||||
|
сериозни
|
||||||
|
сериозно
|
||||||
|
сет
|
||||||
|
сечиј
|
||||||
|
сешто
|
||||||
|
си
|
||||||
|
сиктер
|
||||||
|
сиот
|
||||||
|
сип
|
||||||
|
сиреч
|
||||||
|
сите
|
||||||
|
сичко
|
||||||
|
скок
|
||||||
|
скоро
|
||||||
|
скрц
|
||||||
|
следбеник
|
||||||
|
следбеничка
|
||||||
|
следен
|
||||||
|
следователно
|
||||||
|
следствено
|
||||||
|
сме
|
||||||
|
со
|
||||||
|
соне
|
||||||
|
сопствен
|
||||||
|
сопствена
|
||||||
|
сопствени
|
||||||
|
сопствено
|
||||||
|
сосе
|
||||||
|
сосем
|
||||||
|
сполај
|
||||||
|
според
|
||||||
|
споро
|
||||||
|
спрема
|
||||||
|
спроти
|
||||||
|
спротив
|
||||||
|
сред
|
||||||
|
среде
|
||||||
|
среќно
|
||||||
|
срочен
|
||||||
|
сст
|
||||||
|
става
|
||||||
|
ставаат
|
||||||
|
ставам
|
||||||
|
ставаме
|
||||||
|
ставате
|
||||||
|
ставаш
|
||||||
|
стави
|
||||||
|
сте
|
||||||
|
сто
|
||||||
|
стоп
|
||||||
|
страна
|
||||||
|
сум
|
||||||
|
сума
|
||||||
|
супер
|
||||||
|
сус
|
||||||
|
сѐ
|
||||||
|
та
|
||||||
|
таа
|
||||||
|
така
|
||||||
|
таква
|
||||||
|
такви
|
||||||
|
таков
|
||||||
|
тамам
|
||||||
|
таму
|
||||||
|
тангар-мангар
|
||||||
|
тандар-мандар
|
||||||
|
тап
|
||||||
|
твое
|
||||||
|
те
|
||||||
|
тебе
|
||||||
|
тебека
|
||||||
|
тек
|
||||||
|
текот
|
||||||
|
ти
|
||||||
|
тие
|
||||||
|
тизе
|
||||||
|
тик-так
|
||||||
|
тики
|
||||||
|
тоа
|
||||||
|
тогаш
|
||||||
|
тој
|
||||||
|
трак
|
||||||
|
трака-трука
|
||||||
|
трас
|
||||||
|
треба
|
||||||
|
трет
|
||||||
|
три
|
||||||
|
триесет
|
||||||
|
тринаест
|
||||||
|
триста
|
||||||
|
труп
|
||||||
|
трупа
|
||||||
|
трус
|
||||||
|
ту
|
||||||
|
тука
|
||||||
|
туку
|
||||||
|
тукушто
|
||||||
|
туф
|
||||||
|
у
|
||||||
|
уа
|
||||||
|
убаво
|
||||||
|
уви
|
||||||
|
ужасно
|
||||||
|
уз
|
||||||
|
ура
|
||||||
|
уу
|
||||||
|
уф
|
||||||
|
уха
|
||||||
|
уш
|
||||||
|
уште
|
||||||
|
фазен
|
||||||
|
фала
|
||||||
|
фил
|
||||||
|
филан
|
||||||
|
фис
|
||||||
|
фиу
|
||||||
|
фиљан
|
||||||
|
фоб
|
||||||
|
фон
|
||||||
|
ха
|
||||||
|
ха-ха
|
||||||
|
хе
|
||||||
|
хеј
|
||||||
|
хеј
|
||||||
|
хи
|
||||||
|
хм
|
||||||
|
хо
|
||||||
|
цак
|
||||||
|
цап
|
||||||
|
целина
|
||||||
|
цело
|
||||||
|
цигу-лигу
|
||||||
|
циц
|
||||||
|
чекај
|
||||||
|
често
|
||||||
|
четврт
|
||||||
|
четири
|
||||||
|
четириесет
|
||||||
|
четиринаесет
|
||||||
|
четирстотини
|
||||||
|
чие
|
||||||
|
чии
|
||||||
|
чик
|
||||||
|
чик-чирик
|
||||||
|
чини
|
||||||
|
чиш
|
||||||
|
чиј
|
||||||
|
чија
|
||||||
|
чијшто
|
||||||
|
чкрап
|
||||||
|
чому
|
||||||
|
чук
|
||||||
|
чукш
|
||||||
|
чуму
|
||||||
|
чунки
|
||||||
|
шеесет
|
||||||
|
шеснаесет
|
||||||
|
шест
|
||||||
|
шести
|
||||||
|
шестотини
|
||||||
|
ширум
|
||||||
|
шлак
|
||||||
|
шлап
|
||||||
|
шлапа-шлупа
|
||||||
|
шлуп
|
||||||
|
шмрк
|
||||||
|
што
|
||||||
|
штогоде
|
||||||
|
штом
|
||||||
|
штотуку
|
||||||
|
штрак
|
||||||
|
штрап
|
||||||
|
штрап-штруп
|
||||||
|
шуќур
|
||||||
|
ѓиди
|
||||||
|
ѓоа
|
||||||
|
ѓоамити
|
||||||
|
ѕан
|
||||||
|
ѕе
|
||||||
|
ѕин
|
||||||
|
ја
|
||||||
|
јадец
|
||||||
|
јазе
|
||||||
|
јали
|
||||||
|
јас
|
||||||
|
јаска
|
||||||
|
јок
|
||||||
|
ќе
|
||||||
|
ќешки
|
||||||
|
ѝ
|
||||||
|
џагара-магара
|
||||||
|
џанам
|
||||||
|
џив-џив
|
||||||
|
""".split()
|
||||||
|
)
|
94
spacy/lang/mk/tokenizer_exceptions.py
Normal file
94
spacy/lang/mk/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
from ...symbols import ORTH, NORM
|
||||||
|
|
||||||
|
|
||||||
|
# Tokenizer special cases, keyed by the exact surface form.
_exc = {}

# Unit abbreviations written without a trailing period.
_abbr_exc = [
    {ORTH: "м", NORM: "метар"},
    {ORTH: "мм", NORM: "милиметар"},
    {ORTH: "цм", NORM: "центиметар"},
    {ORTH: "см", NORM: "сантиметар"},
    {ORTH: "дм", NORM: "дециметар"},
    {ORTH: "км", NORM: "километар"},
    {ORTH: "кг", NORM: "килограм"},
    {ORTH: "дкг", NORM: "декаграм"},
    {ORTH: "дг", NORM: "дециграм"},
    {ORTH: "мг", NORM: "милиграм"},
    {ORTH: "г", NORM: "грам"},
    {ORTH: "т", NORM: "тон"},
    {ORTH: "кл", NORM: "килолитар"},
    {ORTH: "хл", NORM: "хектолитар"},
    {ORTH: "дкл", NORM: "декалитар"},
    {ORTH: "л", NORM: "литар"},
    {ORTH: "дл", NORM: "децилитар"},
]
# Each abbreviation maps to a one-token analysis of itself.
_exc.update({entry[ORTH]: [entry] for entry in _abbr_exc})
|
||||||
|
|
||||||
|
# Abbreviations that contain a hyphen; each is kept as a single token.
_abbr_line_exc = [
    {ORTH: "д-р", NORM: "доктор"},
    {ORTH: "м-р", NORM: "магистер"},
    {ORTH: "г-ѓа", NORM: "госпоѓа"},
    {ORTH: "г-ца", NORM: "госпоѓица"},
    {ORTH: "г-дин", NORM: "господин"},
]

_exc.update({entry[ORTH]: [entry] for entry in _abbr_line_exc})
|
||||||
|
|
||||||
|
# Abbreviations that end in (or contain) a period; each is kept as a single
# token and normalised to its expanded form.
_abbr_dot_exc = [
    {ORTH: "в.", NORM: "век"},
    {ORTH: "в.д.", NORM: "вршител на должност"},
    {ORTH: "г.", NORM: "година"},
    {ORTH: "г.г.", NORM: "господин господин"},
    {ORTH: "м.р.", NORM: "машки род"},
    # The feminine-gender abbreviation was previously listed under "год.",
    # which gave "год." the wrong norm and left "ж.р." unhandled.
    {ORTH: "ж.р.", NORM: "женски род"},
    {ORTH: "год.", NORM: "година"},
    {ORTH: "с.р.", NORM: "среден род"},
    {ORTH: "н.е.", NORM: "наша ера"},
    {ORTH: "о.г.", NORM: "оваа година"},
    {ORTH: "о.м.", NORM: "овој месец"},
    # NOTE(review): "с." appears twice in this list. The loop below keeps the
    # later entry, so the effective norm is "страница" and this one is dead.
    {ORTH: "с.", NORM: "село"},
    {ORTH: "т.", NORM: "точка"},
    {ORTH: "т.е.", NORM: "то ест"},
    {ORTH: "т.н.", NORM: "таканаречен"},
    {ORTH: "бр.", NORM: "број"},
    {ORTH: "гр.", NORM: "град"},
    {ORTH: "др.", NORM: "другар"},
    {ORTH: "и др.", NORM: "и друго"},
    {ORTH: "и сл.", NORM: "и слично"},
    {ORTH: "кн.", NORM: "книга"},
    {ORTH: "мн.", NORM: "множина"},
    {ORTH: "на пр.", NORM: "на пример"},
    {ORTH: "св.", NORM: "свети"},
    {ORTH: "сп.", NORM: "списание"},
    {ORTH: "с.", NORM: "страница"},
    {ORTH: "стр.", NORM: "страница"},
    {ORTH: "чл.", NORM: "член"},
    {ORTH: "арх.", NORM: "архитект"},
    {ORTH: "бел.", NORM: "белешка"},
    {ORTH: "гимн.", NORM: "гимназија"},
    {ORTH: "ден.", NORM: "денар"},
    {ORTH: "ул.", NORM: "улица"},
    {ORTH: "инж.", NORM: "инженер"},
    {ORTH: "проф.", NORM: "професор"},
    {ORTH: "студ.", NORM: "студент"},
    {ORTH: "бот.", NORM: "ботаника"},
    {ORTH: "мат.", NORM: "математика"},
    {ORTH: "мед.", NORM: "медицина"},
    {ORTH: "прил.", NORM: "прилог"},
    {ORTH: "прид.", NORM: "придавка"},
    {ORTH: "сврз.", NORM: "сврзник"},
    {ORTH: "физ.", NORM: "физика"},
    {ORTH: "хем.", NORM: "хемија"},
    {ORTH: "пр. н.", NORM: "природни науки"},
    {ORTH: "истор.", NORM: "историја"},
    {ORTH: "геогр.", NORM: "географија"},
    {ORTH: "литер.", NORM: "литература"},
]

# Register every abbreviation as a one-token special case; on a repeated
# surface form the later entry overwrites the earlier one.
for abbr in _abbr_dot_exc:
    _exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = _exc
|
|
@ -1,6 +1,6 @@
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
|
a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
|
||||||
ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo
|
ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo
|
||||||
as assim através atrás até aí
|
as assim através atrás até aí
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ da daquela daquele dar das de debaixo demais dentro depois des desde dessa desse
|
||||||
desta deste deve devem deverá dez dezanove dezasseis dezassete dezoito diante
|
desta deste deve devem deverá dez dezanove dezasseis dezassete dezoito diante
|
||||||
direita disso diz dizem dizer do dois dos doze duas dá dão
|
direita disso diz dizem dizer do dois dos doze duas dá dão
|
||||||
|
|
||||||
é és ela elas ele eles em embora enquanto entre então era essa essas esse esses esta
|
e é és ela elas ele eles em embora enquanto entre então era essa essas esse esses esta
|
||||||
estado estar estará estas estava este estes esteve estive estivemos estiveram
|
estado estar estará estas estava este estes esteve estive estivemos estiveram
|
||||||
estiveste estivestes estou está estás estão eu eventual exemplo
|
estiveste estivestes estou está estás estão eu eventual exemplo
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no nos nossa
|
||||||
nossas nosso nossos nova novas nove novo novos num numa nunca nuns não nível nós
|
nossas nosso nossos nova novas nove novo novos num numa nunca nuns não nível nós
|
||||||
número números
|
número números
|
||||||
|
|
||||||
obrigada obrigado oitava oitavo oito onde ontem onze ora os ou outra outras outros
|
o obrigada obrigado oitava oitavo oito onde ontem onze ora os ou outra outras outros
|
||||||
|
|
||||||
para parece parte partir pegar pela pelas pelo pelos perto pode podem poder poderá
|
para parece parte partir pegar pela pelas pelo pelos perto pode podem poder poderá
|
||||||
podia pois ponto pontos por porquanto porque porquê portanto porém posição
|
podia pois ponto pontos por porquanto porque porquê portanto porém posição
|
||||||
|
|
|
@ -8,11 +8,13 @@ aceasta
|
||||||
această
|
această
|
||||||
aceea
|
aceea
|
||||||
aceeasi
|
aceeasi
|
||||||
|
aceeași
|
||||||
acei
|
acei
|
||||||
aceia
|
aceia
|
||||||
acel
|
acel
|
||||||
acela
|
acela
|
||||||
acelasi
|
acelasi
|
||||||
|
același
|
||||||
acele
|
acele
|
||||||
acelea
|
acelea
|
||||||
acest
|
acest
|
||||||
|
@ -24,12 +26,11 @@ acestia
|
||||||
acestui
|
acestui
|
||||||
aceşti
|
aceşti
|
||||||
aceştia
|
aceştia
|
||||||
acești
|
|
||||||
aceștia
|
|
||||||
acolo
|
acolo
|
||||||
acord
|
acord
|
||||||
acum
|
acum
|
||||||
adica
|
adica
|
||||||
|
adică
|
||||||
ai
|
ai
|
||||||
aia
|
aia
|
||||||
aibă
|
aibă
|
||||||
|
@ -53,6 +54,8 @@ alături
|
||||||
am
|
am
|
||||||
anume
|
anume
|
||||||
apoi
|
apoi
|
||||||
|
apai
|
||||||
|
apăi
|
||||||
ar
|
ar
|
||||||
are
|
are
|
||||||
as
|
as
|
||||||
|
@ -150,7 +153,9 @@ că
|
||||||
căci
|
căci
|
||||||
cărei
|
cărei
|
||||||
căror
|
căror
|
||||||
|
cărora
|
||||||
cărui
|
cărui
|
||||||
|
căruia
|
||||||
către
|
către
|
||||||
d
|
d
|
||||||
da
|
da
|
||||||
|
@ -175,6 +180,8 @@ deşi
|
||||||
deși
|
deși
|
||||||
din
|
din
|
||||||
dinaintea
|
dinaintea
|
||||||
|
dincolo
|
||||||
|
dincoace
|
||||||
dintr
|
dintr
|
||||||
dintr-
|
dintr-
|
||||||
dintre
|
dintre
|
||||||
|
@ -186,6 +193,10 @@ drept
|
||||||
dupa
|
dupa
|
||||||
după
|
după
|
||||||
dă
|
dă
|
||||||
|
deunaseara
|
||||||
|
deunăseară
|
||||||
|
deunazi
|
||||||
|
deunăzi
|
||||||
e
|
e
|
||||||
ea
|
ea
|
||||||
ei
|
ei
|
||||||
|
@ -220,7 +231,6 @@ geaba
|
||||||
graţie
|
graţie
|
||||||
grație
|
grație
|
||||||
h
|
h
|
||||||
halbă
|
|
||||||
i
|
i
|
||||||
ia
|
ia
|
||||||
iar
|
iar
|
||||||
|
@ -232,6 +242,7 @@ in
|
||||||
inainte
|
inainte
|
||||||
inapoi
|
inapoi
|
||||||
inca
|
inca
|
||||||
|
incotro
|
||||||
incit
|
incit
|
||||||
insa
|
insa
|
||||||
intr
|
intr
|
||||||
|
@ -252,6 +263,10 @@ m
|
||||||
ma
|
ma
|
||||||
mai
|
mai
|
||||||
mare
|
mare
|
||||||
|
macar
|
||||||
|
măcar
|
||||||
|
mata
|
||||||
|
matale
|
||||||
mea
|
mea
|
||||||
mei
|
mei
|
||||||
mele
|
mele
|
||||||
|
@ -274,11 +289,18 @@ mâine
|
||||||
mîine
|
mîine
|
||||||
mă
|
mă
|
||||||
n
|
n
|
||||||
|
na
|
||||||
ne
|
ne
|
||||||
|
neincetat
|
||||||
|
neîncetat
|
||||||
nevoie
|
nevoie
|
||||||
ni
|
ni
|
||||||
nici
|
nici
|
||||||
|
nicidecum
|
||||||
|
nicidecat
|
||||||
|
nicidecât
|
||||||
niciodata
|
niciodata
|
||||||
|
niciodată
|
||||||
nicăieri
|
nicăieri
|
||||||
nimeni
|
nimeni
|
||||||
nimeri
|
nimeri
|
||||||
|
@ -300,6 +322,10 @@ noștri
|
||||||
nu
|
nu
|
||||||
numai
|
numai
|
||||||
o
|
o
|
||||||
|
odata
|
||||||
|
odată
|
||||||
|
odinioara
|
||||||
|
odinioară
|
||||||
opt
|
opt
|
||||||
or
|
or
|
||||||
ori
|
ori
|
||||||
|
@ -314,7 +340,9 @@ oricît
|
||||||
oriunde
|
oriunde
|
||||||
p
|
p
|
||||||
pai
|
pai
|
||||||
|
păi
|
||||||
parca
|
parca
|
||||||
|
parcă
|
||||||
patra
|
patra
|
||||||
patru
|
patru
|
||||||
patrulea
|
patrulea
|
||||||
|
@ -331,13 +359,11 @@ prima
|
||||||
primul
|
primul
|
||||||
prin
|
prin
|
||||||
printr-
|
printr-
|
||||||
|
printre
|
||||||
putini
|
putini
|
||||||
puţin
|
puţin
|
||||||
puţina
|
puţina
|
||||||
puţină
|
puţină
|
||||||
puțin
|
|
||||||
puțina
|
|
||||||
puțină
|
|
||||||
până
|
până
|
||||||
pînă
|
pînă
|
||||||
r
|
r
|
||||||
|
@ -415,6 +441,7 @@ unuia
|
||||||
unul
|
unul
|
||||||
v
|
v
|
||||||
va
|
va
|
||||||
|
vai
|
||||||
vi
|
vi
|
||||||
voastre
|
voastre
|
||||||
voastră
|
voastră
|
||||||
|
|
|
@ -30,9 +30,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"The Russian lemmatizer requires the pymorphy2 library: "
|
"The Russian lemmatizer requires the pymorphy2 library: "
|
||||||
'try to fix it with "pip install pymorphy2==0.8" '
|
'try to fix it with "pip install pymorphy2"'
|
||||||
'or "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
|
|
||||||
"if you need Ukrainian too"
|
|
||||||
) from None
|
) from None
|
||||||
if RussianLemmatizer._morph is None:
|
if RussianLemmatizer._morph is None:
|
||||||
RussianLemmatizer._morph = MorphAnalyzer()
|
RussianLemmatizer._morph = MorphAnalyzer()
|
||||||
|
|
27
spacy/lang/ti/__init__.py
Normal file
27
spacy/lang/ti/__init__.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ...language import Language
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
|
class TigrinyaDefaults(Language.Defaults):
    """Default lexical attributes, tokenizer data and stop words for Tigrinya."""

    # Start from the shared getters, layer the Tigrinya-specific ones on top,
    # then pin the language code.
    lex_attr_getters = {
        **Language.Defaults.lex_attr_getters,
        **LEX_ATTRS,
        LANG: lambda text: "ti",
    }
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES
    writing_system = dict(direction="ltr", has_case=False, has_letters=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Tigrinya(Language):
    """Language subclass for Tigrinya, registered under the code "ti"."""

    lang = "ti"
    Defaults = TigrinyaDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Tigrinya"]
|
18
spacy/lang/ti/examples.py
Normal file
18
spacy/lang/ti/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.ti.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።",
|
||||||
|
"ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ",
|
||||||
|
"ቻንስለር ጀርመን ኣንገላ መርከል ዝርግሓ ቫይረስ ኮሮና ንምክልካል ጽኑዕ እገዳ ክግበር ጸዊዓ",
|
||||||
|
"ለንደን ብዓዲ እንግሊዝ ትርከብ ዓባይ ከተማ እያ።",
|
||||||
|
"ናበይ አለኻ፧",
|
||||||
|
"ናይ ፈረንሳይ ፕሬዝዳንት መን እዩ፧",
|
||||||
|
"ናይ አሜሪካ ዋና ከተማ እንታይ እያ፧",
|
||||||
|
"ኦባማ መዓስ ተወሊዱ፧",
|
||||||
|
]
|
102
spacy/lang/ti/lex_attrs.py
Normal file
102
spacy/lang/ti/lex_attrs.py
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
# Cardinal number words recognised by like_num().
_num_words = [
    "ዜሮ",
    "ሐደ",
    "ክልተ",
    "ሰለስተ",
    "ኣርባዕተ",
    "ሓሙሽተ",
    "ሽድሽተ",
    "ሸውዓተ",
    "ሽሞንተ",
    "ትሽዓተ",
    "ኣሰርተ",
    "ኣሰርተ ሐደ",
    "ኣሰርተ ክልተ",
    "ኣሰርተ ሰለስተ",
    "ኣሰርተ ኣርባዕተ",
    "ኣሰርተ ሓሙሽተ",
    "ኣሰርተ ሽድሽተ",
    "ኣሰርተ ሸውዓተ",
    "ኣሰርተ ሽሞንተ",
    "ኣሰርተ ትሽዓተ",
    "ዕስራ",
    "ሰላሳ",
    "ኣርብዓ",
    "ሃምሳ",
    "ስልሳ",
    "ሰብዓ",
    "ሰማንያ",
    "ተስዓ",
    "ሚእቲ",
    "ሺሕ",
    "ሚልዮን",
    "ቢልዮን",
    "ትሪልዮን",
    "ኳድሪልዮን",
    "ገጅልዮን",
    "ባዝልዮን",
]

# Ordinal number words recognised by like_num().
_ordinal_words = [
    "ቀዳማይ",
    "ካልኣይ",
    "ሳልሳይ",
    "ራብኣይ",
    "ሓምሻይ",
    "ሻድሻይ",
    "ሻውዓይ",
    "ሻምናይ",
    "ዘጠነኛ",
    "አስረኛ",
    "ኣሰርተ አንደኛ",
    "ኣሰርተ ሁለተኛ",
    "ኣሰርተ ሶስተኛ",
    "ኣሰርተ አራተኛ",
    "ኣሰርተ አምስተኛ",
    "ኣሰርተ ስድስተኛ",
    "ኣሰርተ ሰባተኛ",
    "ኣሰርተ ስምንተኛ",
    "ኣሰርተ ዘጠነኛ",
    "ሃያኛ",
    # These two were previously fused into one entry by a missing comma
    # (implicit string concatenation), so neither word could match.
    "ሰላሳኛ",
    "አርባኛ",
    "አምሳኛ",
    "ስድሳኛ",
    "ሰባኛ",
    "ሰማንያኛ",
    "ዘጠናኛ",
    "መቶኛ",
    "ሺኛ",
    "ሚሊዮንኛ",
    "ቢሊዮንኛ",
    "ትሪሊዮንኛ",
]


def like_num(text):
    """Return True if ``text`` looks like a number.

    Accepted forms are: a digit string (one leading sign character and any
    "," or "." separators are ignored), a simple fraction such as "1/2",
    a word from ``_num_words`` or ``_ordinal_words``, or digits followed by
    the ordinal suffix "ኛ".
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True

    text_lower = text.lower()
    if text_lower in _num_words:
        return True

    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    # The suffix is a single character, so strip exactly one; stripping two
    # rejected "1ኛ" and accepted strings such as "1aኛ".
    if text_lower.endswith("ኛ") and text_lower[:-1].isdigit():
        return True

    return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/ti/punctuation.py
Normal file
19
spacy/lang/ti/punctuation.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
|
# Shared punctuation plus the Ethiopic punctuation marks.
_list_punct = LIST_PUNCT + ["፡", "።", "፣", "፤", "፥", "፦", "፧"]

# Suffix patterns that only apply after a particular preceding character.
_contextual_suffixes = [
    r"(?<=[0-9])\+",
    # Tigrinya is written from Left-To-Right
    r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
    r"(?<=[0-9])(?:{u})".format(u=UNITS),
    r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]

_suffixes = [*_list_punct, *LIST_ELLIPSES, *LIST_QUOTES, *_contextual_suffixes]

TOKENIZER_SUFFIXES = _suffixes
|
6
spacy/lang/ti/stop_words.py
Normal file
6
spacy/lang/ti/stop_words.py
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
# Stop words
|
||||||
|
STOP_WORDS = {
    "ግን",
    "ግና",
    "ንስኻ",
    "ንስኺ",
    "ንስኻትክን",
    "ንስኻትኩም",
    "ናትካ",
    "ናትኪ",
    "ናትክን",
    "ናትኩም",
}
|
22
spacy/lang/ti/tokenizer_exceptions.py
Normal file
22
spacy/lang/ti/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
from ...symbols import ORTH, NORM
|
||||||
|
|
||||||
|
|
||||||
|
# Special cases given as full token attribute dicts (some carry a NORM).
_attr_exceptions = [
    {ORTH: "ት/ቤት"},
    {ORTH: "ወ/ሮ", NORM: "ወይዘሮ"},
    {ORTH: "ወ/ሪ", NORM: "ወይዘሪት"},
]

# Special cases given only by their surface form.
_plain_exceptions = [
    "ዓ.ም.",
    "ኪ.ሜ.",
]

# Every special case maps its surface form to a one-token analysis.
_exc = {}
_exc.update({entry[ORTH]: [entry] for entry in _attr_exceptions})
_exc.update({form: [{ORTH: form}] for form in _plain_exceptions})

TOKENIZER_EXCEPTIONS = _exc
|
|
@ -1,4 +1,4 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -9,6 +9,7 @@ class TurkishDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
token_match = TOKEN_MATCH
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,119 +1,191 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
import re
|
||||||
|
|
||||||
|
from ..punctuation import ALPHA_LOWER, ALPHA
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
|
||||||
_exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
|
_exc = {}
|
||||||
|
|
||||||
|
|
||||||
for exc_data in [
|
_abbr_period_exc = [
|
||||||
{ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"},
|
{ORTH: "A.B.D.", NORM: "Amerika"},
|
||||||
{ORTH: "Alb.", NORM: "Albay"},
|
{ORTH: "Alb.", NORM: "albay"},
|
||||||
{ORTH: "Ar.Gör.", NORM: "Araştırma Görevlisi"},
|
{ORTH: "Ank.", NORM: "Ankara"},
|
||||||
{ORTH: "Arş.Gör.", NORM: "Araştırma Görevlisi"},
|
{ORTH: "Ar.Gör."},
|
||||||
{ORTH: "Asb.", NORM: "Astsubay"},
|
{ORTH: "Arş.Gör."},
|
||||||
{ORTH: "Astsb.", NORM: "Astsubay"},
|
{ORTH: "Asb.", NORM: "astsubay"},
|
||||||
{ORTH: "As.İz.", NORM: "Askeri İnzibat"},
|
{ORTH: "Astsb.", NORM: "astsubay"},
|
||||||
{ORTH: "Atğm", NORM: "Asteğmen"},
|
{ORTH: "As.İz."},
|
||||||
{ORTH: "Av.", NORM: "Avukat"},
|
{ORTH: "as.iz."},
|
||||||
{ORTH: "Apt.", NORM: "Apartmanı"},
|
{ORTH: "Atğm", NORM: "asteğmen"},
|
||||||
{ORTH: "Bçvş.", NORM: "Başçavuş"},
|
{ORTH: "Av.", NORM: "avukat"},
|
||||||
|
{ORTH: "Apt.", NORM: "apartmanı"},
|
||||||
|
{ORTH: "apt.", NORM: "apartmanı"},
|
||||||
|
{ORTH: "Bçvş.", NORM: "başçavuş"},
|
||||||
|
{ORTH: "bçvş.", NORM: "başçavuş"},
|
||||||
{ORTH: "bk.", NORM: "bakınız"},
|
{ORTH: "bk.", NORM: "bakınız"},
|
||||||
{ORTH: "bknz.", NORM: "bakınız"},
|
{ORTH: "bknz.", NORM: "bakınız"},
|
||||||
{ORTH: "Bnb.", NORM: "Binbaşı"},
|
{ORTH: "Bnb.", NORM: "binbaşı"},
|
||||||
{ORTH: "bnb.", NORM: "binbaşı"},
|
{ORTH: "bnb.", NORM: "binbaşı"},
|
||||||
{ORTH: "Böl.", NORM: "Bölümü"},
|
{ORTH: "Böl.", NORM: "bölümü"},
|
||||||
{ORTH: "Bşk.", NORM: "Başkanlığı"},
|
{ORTH: "böl.", NORM: "bölümü"},
|
||||||
{ORTH: "Bştbp.", NORM: "Baştabip"},
|
{ORTH: "Bşk.", NORM: "başkanlığı"},
|
||||||
{ORTH: "Bul.", NORM: "Bulvarı"},
|
{ORTH: "bşk.", NORM: "başkanlığı"},
|
||||||
{ORTH: "Cad.", NORM: "Caddesi"},
|
{ORTH: "Bştbp.", NORM: "baştabip"},
|
||||||
|
{ORTH: "bştbp.", NORM: "baştabip"},
|
||||||
|
{ORTH: "Bul.", NORM: "bulvarı"},
|
||||||
|
{ORTH: "bul.", NORM: "bulvarı"},
|
||||||
|
{ORTH: "Cad.", NORM: "caddesi"},
|
||||||
|
{ORTH: "cad.", NORM: "caddesi"},
|
||||||
{ORTH: "çev.", NORM: "çeviren"},
|
{ORTH: "çev.", NORM: "çeviren"},
|
||||||
{ORTH: "Çvş.", NORM: "Çavuş"},
|
{ORTH: "Çvş.", NORM: "çavuş"},
|
||||||
|
{ORTH: "çvş.", NORM: "çavuş"},
|
||||||
{ORTH: "dak.", NORM: "dakika"},
|
{ORTH: "dak.", NORM: "dakika"},
|
||||||
{ORTH: "dk.", NORM: "dakika"},
|
{ORTH: "dk.", NORM: "dakika"},
|
||||||
{ORTH: "Doç.", NORM: "Doçent"},
|
{ORTH: "Doç.", NORM: "doçent"},
|
||||||
{ORTH: "doğ.", NORM: "doğum tarihi"},
|
{ORTH: "doğ."},
|
||||||
|
{ORTH: "Dr.", NORM: "doktor"},
|
||||||
|
{ORTH: "dr.", NORM: "doktor"},
|
||||||
{ORTH: "drl.", NORM: "derleyen"},
|
{ORTH: "drl.", NORM: "derleyen"},
|
||||||
{ORTH: "Dz.", NORM: "Deniz"},
|
{ORTH: "Dz.", NORM: "deniz"},
|
||||||
{ORTH: "Dz.K.K.lığı", NORM: "Deniz Kuvvetleri Komutanlığı"},
|
{ORTH: "Dz.K.K.lığı"},
|
||||||
{ORTH: "Dz.Kuv.", NORM: "Deniz Kuvvetleri"},
|
{ORTH: "Dz.Kuv."},
|
||||||
{ORTH: "Dz.Kuv.K.", NORM: "Deniz Kuvvetleri Komutanlığı"},
|
{ORTH: "Dz.Kuv.K."},
|
||||||
{ORTH: "dzl.", NORM: "düzenleyen"},
|
{ORTH: "dzl.", NORM: "düzenleyen"},
|
||||||
{ORTH: "Ecz.", NORM: "Eczanesi"},
|
{ORTH: "Ecz.", NORM: "eczanesi"},
|
||||||
|
{ORTH: "ecz.", NORM: "eczanesi"},
|
||||||
{ORTH: "ekon.", NORM: "ekonomi"},
|
{ORTH: "ekon.", NORM: "ekonomi"},
|
||||||
{ORTH: "Fak.", NORM: "Fakültesi"},
|
{ORTH: "Fak.", NORM: "fakültesi"},
|
||||||
{ORTH: "Gn.", NORM: "Genel"},
|
{ORTH: "Gn.", NORM: "genel"},
|
||||||
{ORTH: "Gnkur.", NORM: "Genelkurmay"},
|
{ORTH: "Gnkur.", NORM: "Genelkurmay"},
|
||||||
{ORTH: "Gn.Kur.", NORM: "Genelkurmay"},
|
{ORTH: "Gn.Kur.", NORM: "Genelkurmay"},
|
||||||
{ORTH: "gr.", NORM: "gram"},
|
{ORTH: "gr.", NORM: "gram"},
|
||||||
{ORTH: "Hst.", NORM: "Hastanesi"},
|
{ORTH: "Hst.", NORM: "hastanesi"},
|
||||||
{ORTH: "Hs.Uzm.", NORM: "Hesap Uzmanı"},
|
{ORTH: "hst.", NORM: "hastanesi"},
|
||||||
|
{ORTH: "Hs.Uzm."},
|
||||||
{ORTH: "huk.", NORM: "hukuk"},
|
{ORTH: "huk.", NORM: "hukuk"},
|
||||||
{ORTH: "Hv.", NORM: "Hava"},
|
{ORTH: "Hv.", NORM: "hava"},
|
||||||
{ORTH: "Hv.K.K.lığı", NORM: "Hava Kuvvetleri Komutanlığı"},
|
{ORTH: "Hv.K.K.lığı"},
|
||||||
{ORTH: "Hv.Kuv.", NORM: "Hava Kuvvetleri"},
|
{ORTH: "Hv.Kuv."},
|
||||||
{ORTH: "Hv.Kuv.K.", NORM: "Hava Kuvvetleri Komutanlığı"},
|
{ORTH: "Hv.Kuv.K."},
|
||||||
{ORTH: "Hz.", NORM: "Hazreti"},
|
{ORTH: "Hz.", NORM: "hazreti"},
|
||||||
{ORTH: "Hz.Öz.", NORM: "Hizmete Özel"},
|
{ORTH: "Hz.Öz."},
|
||||||
{ORTH: "İng.", NORM: "İngilizce"},
|
{ORTH: "İng.", NORM: "ingilizce"},
|
||||||
{ORTH: "Jeol.", NORM: "Jeoloji"},
|
{ORTH: "İst.", NORM: "İstanbul"},
|
||||||
|
{ORTH: "Jeol.", NORM: "jeoloji"},
|
||||||
{ORTH: "jeol.", NORM: "jeoloji"},
|
{ORTH: "jeol.", NORM: "jeoloji"},
|
||||||
{ORTH: "Korg.", NORM: "Korgeneral"},
|
{ORTH: "Korg.", NORM: "korgeneral"},
|
||||||
{ORTH: "Kur.", NORM: "Kurmay"},
|
{ORTH: "Kur.", NORM: "kurmay"},
|
||||||
{ORTH: "Kur.Bşk.", NORM: "Kurmay Başkanı"},
|
{ORTH: "Kur.Bşk."},
|
||||||
{ORTH: "Kuv.", NORM: "Kuvvetleri"},
|
{ORTH: "Kuv.", NORM: "kuvvetleri"},
|
||||||
{ORTH: "Ltd.", NORM: "Limited"},
|
{ORTH: "Ltd.", NORM: "limited"},
|
||||||
{ORTH: "Mah.", NORM: "Mahallesi"},
|
{ORTH: "ltd.", NORM: "limited"},
|
||||||
|
{ORTH: "Mah.", NORM: "mahallesi"},
|
||||||
{ORTH: "mah.", NORM: "mahallesi"},
|
{ORTH: "mah.", NORM: "mahallesi"},
|
||||||
{ORTH: "max.", NORM: "maksimum"},
|
{ORTH: "max.", NORM: "maksimum"},
|
||||||
{ORTH: "min.", NORM: "minimum"},
|
{ORTH: "min.", NORM: "minimum"},
|
||||||
{ORTH: "Müh.", NORM: "Mühendisliği"},
|
{ORTH: "Müh.", NORM: "mühendisliği"},
|
||||||
{ORTH: "müh.", NORM: "mühendisliği"},
|
{ORTH: "müh.", NORM: "mühendisliği"},
|
||||||
{ORTH: "MÖ.", NORM: "Milattan Önce"},
|
{ORTH: "M.Ö."},
|
||||||
{ORTH: "Onb.", NORM: "Onbaşı"},
|
{ORTH: "M.S."},
|
||||||
{ORTH: "Ord.", NORM: "Ordinaryüs"},
|
{ORTH: "Onb.", NORM: "onbaşı"},
|
||||||
{ORTH: "Org.", NORM: "Orgeneral"},
|
{ORTH: "Ord.", NORM: "ordinaryüs"},
|
||||||
{ORTH: "Ped.", NORM: "Pedagoji"},
|
{ORTH: "Org.", NORM: "orgeneral"},
|
||||||
{ORTH: "Prof.", NORM: "Profesör"},
|
{ORTH: "Ped.", NORM: "pedagoji"},
|
||||||
{ORTH: "Sb.", NORM: "Subay"},
|
{ORTH: "Prof.", NORM: "profesör"},
|
||||||
{ORTH: "Sn.", NORM: "Sayın"},
|
{ORTH: "prof.", NORM: "profesör"},
|
||||||
|
{ORTH: "Sb.", NORM: "subay"},
|
||||||
|
{ORTH: "Sn.", NORM: "sayın"},
|
||||||
{ORTH: "sn.", NORM: "saniye"},
|
{ORTH: "sn.", NORM: "saniye"},
|
||||||
{ORTH: "Sok.", NORM: "Sokak"},
|
{ORTH: "Sok.", NORM: "sokak"},
|
||||||
{ORTH: "Şb.", NORM: "Şube"},
|
{ORTH: "sok.", NORM: "sokak"},
|
||||||
{ORTH: "Şti.", NORM: "Şirketi"},
|
{ORTH: "Şb.", NORM: "şube"},
|
||||||
{ORTH: "Tbp.", NORM: "Tabip"},
|
{ORTH: "şb.", NORM: "şube"},
|
||||||
{ORTH: "T.C.", NORM: "Türkiye Cumhuriyeti"},
|
{ORTH: "Şti.", NORM: "şirketi"},
|
||||||
{ORTH: "Tel.", NORM: "Telefon"},
|
{ORTH: "şti.", NORM: "şirketi"},
|
||||||
|
{ORTH: "Tbp.", NORM: "tabip"},
|
||||||
|
{ORTH: "tbp.", NORM: "tabip"},
|
||||||
|
{ORTH: "T.C."},
|
||||||
|
{ORTH: "Tel.", NORM: "telefon"},
|
||||||
{ORTH: "tel.", NORM: "telefon"},
|
{ORTH: "tel.", NORM: "telefon"},
|
||||||
{ORTH: "telg.", NORM: "telgraf"},
|
{ORTH: "telg.", NORM: "telgraf"},
|
||||||
{ORTH: "Tğm.", NORM: "Teğmen"},
|
{ORTH: "Tğm.", NORM: "teğmen"},
|
||||||
{ORTH: "tğm.", NORM: "teğmen"},
|
{ORTH: "tğm.", NORM: "teğmen"},
|
||||||
{ORTH: "tic.", NORM: "ticaret"},
|
{ORTH: "tic.", NORM: "ticaret"},
|
||||||
{ORTH: "Tug.", NORM: "Tugay"},
|
{ORTH: "Tug.", NORM: "tugay"},
|
||||||
{ORTH: "Tuğg.", NORM: "Tuğgeneral"},
|
{ORTH: "Tuğg.", NORM: "tuğgeneral"},
|
||||||
{ORTH: "Tümg.", NORM: "Tümgeneral"},
|
{ORTH: "Tümg.", NORM: "tümgeneral"},
|
||||||
{ORTH: "Uzm.", NORM: "Uzman"},
|
{ORTH: "Uzm.", NORM: "uzman"},
|
||||||
{ORTH: "Üçvş.", NORM: "Üstçavuş"},
|
{ORTH: "Üçvş.", NORM: "üstçavuş"},
|
||||||
{ORTH: "Üni.", NORM: "Üniversitesi"},
|
{ORTH: "Üni.", NORM: "üniversitesi"},
|
||||||
{ORTH: "Ütğm.", NORM: "Üsteğmen"},
|
{ORTH: "Ütğm.", NORM: "üsteğmen"},
|
||||||
{ORTH: "vb.", NORM: "ve benzeri"},
|
{ORTH: "vb."},
|
||||||
{ORTH: "vs.", NORM: "vesaire"},
|
{ORTH: "vs.", NORM: "vesaire"},
|
||||||
{ORTH: "Yard.", NORM: "Yardımcı"},
|
{ORTH: "Yard.", NORM: "yardımcı"},
|
||||||
{ORTH: "Yar.", NORM: "Yardımcı"},
|
{ORTH: "Yar.", NORM: "yardımcı"},
|
||||||
{ORTH: "Yd.Sb.", NORM: "Yedek Subay"},
|
{ORTH: "Yd.Sb."},
|
||||||
{ORTH: "Yard.Doç.", NORM: "Yardımcı Doçent"},
|
{ORTH: "Yard.Doç."},
|
||||||
{ORTH: "Yar.Doç.", NORM: "Yardımcı Doçent"},
|
{ORTH: "Yar.Doç."},
|
||||||
{ORTH: "Yb.", NORM: "Yarbay"},
|
{ORTH: "Yb.", NORM: "yarbay"},
|
||||||
{ORTH: "Yrd.", NORM: "Yardımcı"},
|
{ORTH: "Yrd.", NORM: "yardımcı"},
|
||||||
{ORTH: "Yrd.Doç.", NORM: "Yardımcı Doçent"},
|
{ORTH: "Yrd.Doç."},
|
||||||
{ORTH: "Y.Müh.", NORM: "Yüksek mühendis"},
|
{ORTH: "Y.Müh."},
|
||||||
{ORTH: "Y.Mim.", NORM: "Yüksek mimar"},
|
{ORTH: "Y.Mim."},
|
||||||
]:
|
{ORTH: "yy.", NORM: "yüzyıl"},
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
]
|
||||||
|
|
||||||
|
for abbr in _abbr_period_exc:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# Abbreviations and acronyms written without periods, listed as
# (surface form, norm) pairs. A norm of None means the entry is registered
# with ORTH only and no NORM key is set.
_abbr_exc = [
    {ORTH: orth} if norm is None else {ORTH: orth, NORM: norm}
    for orth, norm in (
        ("AB", "Avrupa Birliği"),
        ("ABD", "Amerika"),
        ("ABS", "fren"),
        ("AOÇ", None),
        ("ASKİ", None),
        ("Bağ-kur", "Bağkur"),
        ("BDDK", None),
        ("BJK", "Beşiktaş"),
        ("ESA", "Avrupa uzay ajansı"),
        ("FB", "Fenerbahçe"),
        ("GATA", None),
        ("GS", "Galatasaray"),
        ("İSKİ", None),
        ("KBB", None),
        ("RTÜK", "radyo ve televizyon üst kurulu"),
        ("TBMM", None),
        ("TC", None),
        ("TÜİK", "Türkiye istatistik kurumu"),
        ("YÖK", None),
    )
]

# Register each entry under its own surface form as a one-element list.
_exc.update({entry[ORTH]: [entry] for entry in _abbr_exc})
|
||||||
|
|
||||||
|
|
||||||
for orth in ["Dr.", "yy."]:
|
_num = r"[+-]?\d+([,.]\d+)*"
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_ord_num = r"(\d+\.)"
|
||||||
|
_date = r"(((\d{1,2}[./-]){2})?(\d{4})|(\d{1,2}[./]\d{1,2}(\.)?))"
|
||||||
|
_dash_num = r"(([{al}\d]+/\d+)|(\d+/[{al}]))".format(al=ALPHA)
|
||||||
|
_roman_num = "M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I[XV]|V?I{0,3})"
|
||||||
|
_roman_ord = r"({rn})\.".format(rn=_roman_num)
|
||||||
|
_time_exp = r"\d+(:\d+)*"
|
||||||
|
|
||||||
|
_inflections = r"'[{al}]+".format(al=ALPHA_LOWER)
|
||||||
|
_abbrev_inflected = r"[{a}]+\.'[{al}]+".format(a=ALPHA, al=ALPHA_LOWER)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
_nums = r"(({d})|({dn})|({te})|({on})|({n})|({ro})|({rn}))({inf})?".format(
|
||||||
|
d=_date,
|
||||||
|
dn=_dash_num,
|
||||||
|
te=_time_exp,
|
||||||
|
on=_ord_num,
|
||||||
|
n=_num,
|
||||||
|
ro=_roman_ord,
|
||||||
|
rn=_roman_num,
|
||||||
|
inf=_inflections,
|
||||||
|
)
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
# Matcher for strings that are, as a whole, either an inflected abbreviation
# (_abbrev_inflected) or one of the numeric forms collected in _nums.
#
# The alternation is wrapped in a non-capturing group so that both "^" and "$"
# apply to each alternative. Unwrapped, "^(a)|(b)$" parses as "(^a)|(b$)":
# the abbreviation branch would then have no end anchor and would accept any
# string that merely *starts* with an inflected abbreviation. The group is
# non-capturing so the numbering of the existing capture groups is unchanged.
TOKEN_MATCH = re.compile(
    r"^(?:({abbr})|({n}))$".format(n=_nums, abbr=_abbrev_inflected)
).match
|
||||||
|
|
|
@ -22,8 +22,8 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"The Ukrainian lemmatizer requires the pymorphy2 library and "
|
"The Ukrainian lemmatizer requires the pymorphy2 library and "
|
||||||
'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
|
"dictionaries: try to fix it with "
|
||||||
'"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
|
'"pip install pymorphy2 pymorphy2-dicts-uk"'
|
||||||
) from None
|
) from None
|
||||||
if UkrainianLemmatizer._morph is None:
|
if UkrainianLemmatizer._morph is None:
|
||||||
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
|
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
|
||||||
|
|
|
@ -121,6 +121,7 @@ class Language:
|
||||||
max_length: int = 10 ** 6,
|
max_length: int = 10 ** 6,
|
||||||
meta: Dict[str, Any] = {},
|
meta: Dict[str, Any] = {},
|
||||||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||||||
|
batch_size: int = 1000,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialise a Language object.
|
"""Initialise a Language object.
|
||||||
|
@ -138,6 +139,7 @@ class Language:
|
||||||
100,000 characters in one text.
|
100,000 characters in one text.
|
||||||
create_tokenizer (Callable): Function that takes the nlp object and
|
create_tokenizer (Callable): Function that takes the nlp object and
|
||||||
returns a tokenizer.
|
returns a tokenizer.
|
||||||
|
batch_size (int): Default batch size for pipe and evaluate.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#init
|
DOCS: https://nightly.spacy.io/api/language#init
|
||||||
"""
|
"""
|
||||||
|
@ -173,6 +175,7 @@ class Language:
|
||||||
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
|
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
|
||||||
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
|
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
|
||||||
self.tokenizer = create_tokenizer(self)
|
self.tokenizer = create_tokenizer(self)
|
||||||
|
self.batch_size = batch_size
|
||||||
|
|
||||||
def __init_subclass__(cls, **kwargs):
|
def __init_subclass__(cls, **kwargs):
|
||||||
super().__init_subclass__(**kwargs)
|
super().__init_subclass__(**kwargs)
|
||||||
|
@ -694,6 +697,8 @@ class Language:
|
||||||
source_config = source.config.interpolate()
|
source_config = source.config.interpolate()
|
||||||
pipe_config = util.copy_config(source_config["components"][source_name])
|
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||||
self._pipe_configs[name] = pipe_config
|
self._pipe_configs[name] = pipe_config
|
||||||
|
for s in source.vocab.strings:
|
||||||
|
self.vocab.strings.add(s)
|
||||||
return pipe, pipe_config["factory"]
|
return pipe, pipe_config["factory"]
|
||||||
|
|
||||||
def add_pipe(
|
def add_pipe(
|
||||||
|
@ -968,10 +973,6 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#call
|
DOCS: https://nightly.spacy.io/api/language#call
|
||||||
"""
|
"""
|
||||||
if len(text) > self.max_length:
|
|
||||||
raise ValueError(
|
|
||||||
Errors.E088.format(length=len(text), max_length=self.max_length)
|
|
||||||
)
|
|
||||||
doc = self.make_doc(text)
|
doc = self.make_doc(text)
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
|
@ -1045,6 +1046,11 @@ class Language:
|
||||||
text (str): The text to process.
|
text (str): The text to process.
|
||||||
RETURNS (Doc): The processed doc.
|
RETURNS (Doc): The processed doc.
|
||||||
"""
|
"""
|
||||||
|
if len(text) > self.max_length:
|
||||||
|
raise ValueError(
|
||||||
|
Errors.E088.format(length=len(text), max_length=self.max_length)
|
||||||
|
)
|
||||||
|
return self.tokenizer(text)
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
|
@ -1205,6 +1211,9 @@ class Language:
|
||||||
config = self.config.interpolate()
|
config = self.config.interpolate()
|
||||||
# These are the settings provided in the [initialize] block in the config
|
# These are the settings provided in the [initialize] block in the config
|
||||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||||
|
before_init = I["before_init"]
|
||||||
|
if before_init is not None:
|
||||||
|
before_init(self)
|
||||||
init_vocab(
|
init_vocab(
|
||||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||||
)
|
)
|
||||||
|
@ -1236,6 +1245,9 @@ class Language:
|
||||||
self._optimizer = sgd
|
self._optimizer = sgd
|
||||||
elif self._optimizer is None:
|
elif self._optimizer is None:
|
||||||
self._optimizer = self.create_optimizer()
|
self._optimizer = self.create_optimizer()
|
||||||
|
after_init = I["after_init"]
|
||||||
|
if after_init is not None:
|
||||||
|
after_init(self)
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||||||
|
@ -1267,7 +1279,7 @@ class Language:
|
||||||
self,
|
self,
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
*,
|
*,
|
||||||
batch_size: int = 256,
|
batch_size: Optional[int] = None,
|
||||||
scorer: Optional[Scorer] = None,
|
scorer: Optional[Scorer] = None,
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||||
scorer_cfg: Optional[Dict[str, Any]] = None,
|
scorer_cfg: Optional[Dict[str, Any]] = None,
|
||||||
|
@ -1275,7 +1287,7 @@ class Language:
|
||||||
"""Evaluate a model's pipeline components.
|
"""Evaluate a model's pipeline components.
|
||||||
|
|
||||||
examples (Iterable[Example]): `Example` objects.
|
examples (Iterable[Example]): `Example` objects.
|
||||||
batch_size (int): Batch size to use.
|
batch_size (Optional[int]): Batch size to use.
|
||||||
scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
|
scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
|
||||||
will be created.
|
will be created.
|
||||||
component_cfg (dict): An optional dictionary with extra keyword
|
component_cfg (dict): An optional dictionary with extra keyword
|
||||||
|
@ -1286,7 +1298,10 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#evaluate
|
DOCS: https://nightly.spacy.io/api/language#evaluate
|
||||||
"""
|
"""
|
||||||
|
examples = list(examples)
|
||||||
validate_examples(examples, "Language.evaluate")
|
validate_examples(examples, "Language.evaluate")
|
||||||
|
if batch_size is None:
|
||||||
|
batch_size = self.batch_size
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
if scorer_cfg is None:
|
if scorer_cfg is None:
|
||||||
|
@ -1295,27 +1310,21 @@ class Language:
|
||||||
kwargs = dict(scorer_cfg)
|
kwargs = dict(scorer_cfg)
|
||||||
kwargs.setdefault("nlp", self)
|
kwargs.setdefault("nlp", self)
|
||||||
scorer = Scorer(**kwargs)
|
scorer = Scorer(**kwargs)
|
||||||
texts = [eg.reference.text for eg in examples]
|
# reset annotation in predicted docs and time tokenization
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
# tokenize the texts only for timing purposes
|
for eg in examples:
|
||||||
if not hasattr(self.tokenizer, "pipe"):
|
eg.predicted = self.make_doc(eg.reference.text)
|
||||||
_ = [self.tokenizer(text) for text in texts] # noqa: F841
|
# apply all pipeline components
|
||||||
else:
|
|
||||||
_ = list(self.tokenizer.pipe(texts)) # noqa: F841
|
|
||||||
for name, pipe in self.pipeline:
|
for name, pipe in self.pipeline:
|
||||||
kwargs = component_cfg.get(name, {})
|
kwargs = component_cfg.get(name, {})
|
||||||
kwargs.setdefault("batch_size", batch_size)
|
kwargs.setdefault("batch_size", batch_size)
|
||||||
docs = _pipe(docs, pipe, kwargs)
|
for doc, eg in zip(
|
||||||
# iterate over the final generator
|
_pipe((eg.predicted for eg in examples), pipe, kwargs), examples
|
||||||
if len(self.pipeline):
|
):
|
||||||
docs = list(docs)
|
|
||||||
end_time = timer()
|
|
||||||
for i, (doc, eg) in enumerate(zip(docs, examples)):
|
|
||||||
util.logger.debug(doc)
|
|
||||||
eg.predicted = doc
|
eg.predicted = doc
|
||||||
|
end_time = timer()
|
||||||
results = scorer.score(examples)
|
results = scorer.score(examples)
|
||||||
n_words = sum(len(doc) for doc in docs)
|
n_words = sum(len(eg.predicted) for eg in examples)
|
||||||
results["speed"] = n_words / (end_time - start_time)
|
results["speed"] = n_words / (end_time - start_time)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@ -1365,7 +1374,7 @@ class Language:
|
||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
*,
|
*,
|
||||||
as_tuples: bool = False,
|
as_tuples: bool = False,
|
||||||
batch_size: int = 1000,
|
batch_size: Optional[int] = None,
|
||||||
disable: Iterable[str] = SimpleFrozenList(),
|
disable: Iterable[str] = SimpleFrozenList(),
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||||
n_process: int = 1,
|
n_process: int = 1,
|
||||||
|
@ -1376,7 +1385,7 @@ class Language:
|
||||||
as_tuples (bool): If set to True, inputs should be a sequence of
|
as_tuples (bool): If set to True, inputs should be a sequence of
|
||||||
(text, context) tuples. Output will then be a sequence of
|
(text, context) tuples. Output will then be a sequence of
|
||||||
(doc, context) tuples. Defaults to False.
|
(doc, context) tuples. Defaults to False.
|
||||||
batch_size (int): The number of texts to buffer.
|
batch_size (Optional[int]): The number of texts to buffer.
|
||||||
disable (List[str]): Names of the pipeline components to disable.
|
disable (List[str]): Names of the pipeline components to disable.
|
||||||
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
|
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
|
||||||
arguments for specific components.
|
arguments for specific components.
|
||||||
|
@ -1403,6 +1412,8 @@ class Language:
|
||||||
return
|
return
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
|
if batch_size is None:
|
||||||
|
batch_size = self.batch_size
|
||||||
|
|
||||||
pipes = (
|
pipes = (
|
||||||
[]
|
[]
|
||||||
|
@ -1610,13 +1621,12 @@ class Language:
|
||||||
if model not in source_nlps:
|
if model not in source_nlps:
|
||||||
# We only need the components here and we need to init
|
# We only need the components here and we need to init
|
||||||
# model with the same vocab as the current nlp object
|
# model with the same vocab as the current nlp object
|
||||||
source_nlps[model] = util.load_model(
|
source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
|
||||||
model, vocab=nlp.vocab, disable=["vocab", "tokenizer"]
|
|
||||||
)
|
|
||||||
source_name = pipe_cfg.get("component", pipe_name)
|
source_name = pipe_cfg.get("component", pipe_name)
|
||||||
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
|
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
|
||||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||||
|
nlp.batch_size = config["nlp"]["batch_size"]
|
||||||
nlp.config = filled if auto_fill else config
|
nlp.config = filled if auto_fill else config
|
||||||
if after_pipeline_creation is not None:
|
if after_pipeline_creation is not None:
|
||||||
nlp = after_pipeline_creation(nlp)
|
nlp = after_pipeline_creation(nlp)
|
||||||
|
|
|
@ -26,6 +26,7 @@ cdef enum quantifier_t:
|
||||||
ZERO_PLUS
|
ZERO_PLUS
|
||||||
ONE
|
ONE
|
||||||
ONE_PLUS
|
ONE_PLUS
|
||||||
|
FINAL_ID
|
||||||
|
|
||||||
|
|
||||||
cdef struct AttrValueC:
|
cdef struct AttrValueC:
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t, int8_t
|
||||||
from libc.string cimport memset, memcmp
|
from libc.string cimport memset, memcmp
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
@ -308,7 +308,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
# avoid any processing or mem alloc if the document is empty
|
# avoid any processing or mem alloc if the document is empty
|
||||||
return output
|
return output
|
||||||
if len(predicates) > 0:
|
if len(predicates) > 0:
|
||||||
predicate_cache = <char*>mem.alloc(length * len(predicates), sizeof(char))
|
predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
|
||||||
if extensions is not None and len(extensions) >= 1:
|
if extensions is not None and len(extensions) >= 1:
|
||||||
nr_extra_attr = max(extensions.values()) + 1
|
nr_extra_attr = max(extensions.values()) + 1
|
||||||
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
||||||
|
@ -349,7 +349,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
|
|
||||||
|
|
||||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||||
char* cached_py_predicates,
|
int8_t* cached_py_predicates,
|
||||||
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
||||||
cdef int q = 0
|
cdef int q = 0
|
||||||
cdef vector[PatternStateC] new_states
|
cdef vector[PatternStateC] new_states
|
||||||
|
@ -421,7 +421,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
states.push_back(new_states[i])
|
states.push_back(new_states[i])
|
||||||
|
|
||||||
|
|
||||||
cdef int update_predicate_cache(char* cache,
|
cdef int update_predicate_cache(int8_t* cache,
|
||||||
const TokenPatternC* pattern, Token token, predicates) except -1:
|
const TokenPatternC* pattern, Token token, predicates) except -1:
|
||||||
# If the state references any extra predicates, check whether they match.
|
# If the state references any extra predicates, check whether they match.
|
||||||
# These are cached, so that we don't call these potentially expensive
|
# These are cached, so that we don't call these potentially expensive
|
||||||
|
@ -459,7 +459,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
|
||||||
|
|
||||||
cdef action_t get_action(PatternStateC state,
|
cdef action_t get_action(PatternStateC state,
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
const TokenC* token, const attr_t* extra_attrs,
|
||||||
const char* predicate_matches) nogil:
|
const int8_t* predicate_matches) nogil:
|
||||||
"""We need to consider:
|
"""We need to consider:
|
||||||
a) Does the token match the specification? [Yes, No]
|
a) Does the token match the specification? [Yes, No]
|
||||||
b) What's the quantifier? [1, 0+, ?]
|
b) What's the quantifier? [1, 0+, ?]
|
||||||
|
@ -517,7 +517,7 @@ cdef action_t get_action(PatternStateC state,
|
||||||
|
|
||||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||||
"""
|
"""
|
||||||
cdef char is_match
|
cdef int8_t is_match
|
||||||
is_match = get_is_match(state, token, extra_attrs, predicate_matches)
|
is_match = get_is_match(state, token, extra_attrs, predicate_matches)
|
||||||
quantifier = get_quantifier(state)
|
quantifier = get_quantifier(state)
|
||||||
is_final = get_is_final(state)
|
is_final = get_is_final(state)
|
||||||
|
@ -569,9 +569,9 @@ cdef action_t get_action(PatternStateC state,
|
||||||
return RETRY
|
return RETRY
|
||||||
|
|
||||||
|
|
||||||
cdef char get_is_match(PatternStateC state,
|
cdef int8_t get_is_match(PatternStateC state,
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
const TokenC* token, const attr_t* extra_attrs,
|
||||||
const char* predicate_matches) nogil:
|
const int8_t* predicate_matches) nogil:
|
||||||
for i in range(state.pattern.nr_py):
|
for i in range(state.pattern.nr_py):
|
||||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||||
return 0
|
return 0
|
||||||
|
@ -586,8 +586,8 @@ cdef char get_is_match(PatternStateC state,
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
cdef char get_is_final(PatternStateC state) nogil:
|
cdef int8_t get_is_final(PatternStateC state) nogil:
|
||||||
if state.pattern[1].nr_attr == 0 and state.pattern[1].attrs != NULL:
|
if state.pattern[1].quantifier == FINAL_ID:
|
||||||
id_attr = state.pattern[1].attrs[0]
|
id_attr = state.pattern[1].attrs[0]
|
||||||
if id_attr.attr != ID:
|
if id_attr.attr != ID:
|
||||||
with gil:
|
with gil:
|
||||||
|
@ -597,7 +597,7 @@ cdef char get_is_final(PatternStateC state) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef char get_quantifier(PatternStateC state) nogil:
|
cdef int8_t get_quantifier(PatternStateC state) nogil:
|
||||||
return state.pattern.quantifier
|
return state.pattern.quantifier
|
||||||
|
|
||||||
|
|
||||||
|
@ -626,36 +626,20 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
||||||
pattern[i].nr_py = len(predicates)
|
pattern[i].nr_py = len(predicates)
|
||||||
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
|
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
|
||||||
i = len(token_specs)
|
i = len(token_specs)
|
||||||
# Even though here, nr_attr == 0, we're storing the ID value in attrs[0] (bug-prone, thread carefully!)
|
# Use quantifier to identify final ID pattern node (rather than previous
|
||||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
# uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
|
||||||
|
pattern[i].quantifier = FINAL_ID
|
||||||
|
pattern[i].attrs = <AttrValueC*>mem.alloc(1, sizeof(AttrValueC))
|
||||||
pattern[i].attrs[0].attr = ID
|
pattern[i].attrs[0].attr = ID
|
||||||
pattern[i].attrs[0].value = entity_id
|
pattern[i].attrs[0].value = entity_id
|
||||||
pattern[i].nr_attr = 0
|
pattern[i].nr_attr = 1
|
||||||
pattern[i].nr_extra_attr = 0
|
pattern[i].nr_extra_attr = 0
|
||||||
pattern[i].nr_py = 0
|
pattern[i].nr_py = 0
|
||||||
return pattern
|
return pattern
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||||
# There have been a few bugs here. We used to have two functions,
|
while pattern.quantifier != FINAL_ID:
|
||||||
# get_ent_id and get_pattern_key that tried to do the same thing. These
|
|
||||||
# are now unified to try to solve the "ghost match" problem.
|
|
||||||
# Below is the previous implementation of get_ent_id and the comment on it,
|
|
||||||
# preserved for reference while we figure out whether the heisenbug in the
|
|
||||||
# matcher is resolved.
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
|
||||||
# # The code was originally designed to always have pattern[1].attrs.value
|
|
||||||
# # be the ent_id when we get to the end of a pattern. However, Issue #2671
|
|
||||||
# # showed this wasn't the case when we had a reject-and-continue before a
|
|
||||||
# # match.
|
|
||||||
# # The patch to #2671 was wrong though, which came up in #3839.
|
|
||||||
# while pattern.attrs.attr != ID:
|
|
||||||
# pattern += 1
|
|
||||||
# return pattern.attrs.value
|
|
||||||
while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \
|
|
||||||
or pattern.quantifier != ZERO:
|
|
||||||
pattern += 1
|
pattern += 1
|
||||||
id_attr = pattern[0].attrs[0]
|
id_attr = pattern[0].attrs[0]
|
||||||
if id_attr.attr != ID:
|
if id_attr.attr != ID:
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import numpy
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from ..attrs import LOWER
|
from ..attrs import LOWER
|
||||||
|
@ -23,10 +22,7 @@ def forward(model: Model, docs, is_train: bool):
|
||||||
keys, vals = model.ops.xp.unique(keys, return_counts=True)
|
keys, vals = model.ops.xp.unique(keys, return_counts=True)
|
||||||
batch_keys.append(keys)
|
batch_keys.append(keys)
|
||||||
batch_vals.append(vals)
|
batch_vals.append(vals)
|
||||||
# The dtype here matches what thinc is expecting -- which differs per
|
lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype="int32")
|
||||||
# platform (by int definition). This should be fixed once the problem
|
|
||||||
# is fixed on Thinc's side.
|
|
||||||
lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
|
|
||||||
batch_keys = model.ops.xp.concatenate(batch_keys)
|
batch_keys = model.ops.xp.concatenate(batch_keys)
|
||||||
batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")
|
batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
||||||
def build_tb_parser_model(
|
def transition_parser_v1(
|
||||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
state_type: Literal["parser", "ner"],
|
state_type: Literal["parser", "ner"],
|
||||||
extra_state_tokens: bool,
|
extra_state_tokens: bool,
|
||||||
|
@ -19,6 +19,47 @@ def build_tb_parser_model(
|
||||||
maxout_pieces: int,
|
maxout_pieces: int,
|
||||||
use_upper: bool = True,
|
use_upper: bool = True,
|
||||||
nO: Optional[int] = None,
|
nO: Optional[int] = None,
|
||||||
|
) -> Model:
|
||||||
|
return build_tb_parser_model(
|
||||||
|
tok2vec,
|
||||||
|
state_type,
|
||||||
|
extra_state_tokens,
|
||||||
|
hidden_width,
|
||||||
|
maxout_pieces,
|
||||||
|
use_upper,
|
||||||
|
nO,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    use_upper: bool,
    nO: Optional[int] = None,
) -> Model:
    """Registered "spacy.TransitionBasedParser.v2" entry point.

    Every argument is handed on unchanged to build_tb_parser_model, and the
    model it builds is returned as-is.
    """
    return build_tb_parser_model(
        tok2vec=tok2vec,
        state_type=state_type,
        extra_state_tokens=extra_state_tokens,
        hidden_width=hidden_width,
        maxout_pieces=maxout_pieces,
        use_upper=use_upper,
        nO=nO,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_tb_parser_model(
|
||||||
|
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
|
state_type: Literal["parser", "ner"],
|
||||||
|
extra_state_tokens: bool,
|
||||||
|
hidden_width: int,
|
||||||
|
maxout_pieces: int,
|
||||||
|
use_upper: bool,
|
||||||
|
nO: Optional[int] = None,
|
||||||
) -> Model:
|
) -> Model:
|
||||||
"""
|
"""
|
||||||
Build a transition-based parser model. Can apply to NER or dependency-parsing.
|
Build a transition-based parser model. Can apply to NER or dependency-parsing.
|
||||||
|
@ -72,16 +113,99 @@ def build_tb_parser_model(
|
||||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
|
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
|
||||||
tok2vec.set_dim("nO", hidden_width)
|
tok2vec.set_dim("nO", hidden_width)
|
||||||
lower = PrecomputableAffine(
|
lower = _define_lower(
|
||||||
nO=hidden_width if use_upper else nO,
|
nO=hidden_width if use_upper else nO,
|
||||||
nF=nr_feature_tokens,
|
nF=nr_feature_tokens,
|
||||||
nI=tok2vec.get_dim("nO"),
|
nI=tok2vec.get_dim("nO"),
|
||||||
nP=maxout_pieces,
|
nP=maxout_pieces,
|
||||||
)
|
)
|
||||||
|
upper = None
|
||||||
if use_upper:
|
if use_upper:
|
||||||
with use_ops("numpy"):
|
with use_ops("numpy"):
|
||||||
# Initialize weights at zero, as it's a classification layer.
|
# Initialize weights at zero, as it's a classification layer.
|
||||||
upper = Linear(nO=nO, init_W=zero_init)
|
upper = _define_upper(nO=nO, nI=None)
|
||||||
else:
|
return TransitionModel(tok2vec, lower, upper, resize_output)
|
||||||
upper = None
|
|
||||||
return TransitionModel(tok2vec, lower, upper)
|
|
||||||
|
def _define_upper(nO, nI):
    """Construct the "upper" output layer.

    nO: Output width forwarded to Linear.
    nI: Input width forwarded to Linear (may be None).
    RETURNS: A Linear layer whose weights use zero_init.
    """
    upper_layer = Linear(nO=nO, nI=nI, init_W=zero_init)
    return upper_layer
|
||||||
|
|
||||||
|
|
||||||
|
def _define_lower(nO, nF, nI, nP):
    """Construct the "lower" layer.

    All four dimensions (nO, nF, nI, nP) are forwarded by keyword to
    PrecomputableAffine.
    RETURNS: The PrecomputableAffine layer.
    """
    dims = {"nO": nO, "nF": nF, "nI": nI, "nP": nP}
    return PrecomputableAffine(**dims)
|
||||||
|
|
||||||
|
|
||||||
|
def resize_output(model, new_nO):
    """Resize the output dimension of a transition model to new_nO.

    Dispatches on model.attrs["has_upper"]: the upper layer is resized when
    it is truthy, otherwise the lower layer is.
    RETURNS: Whatever the selected resize helper returns (the model).
    """
    resize = _resize_upper if model.attrs["has_upper"] else _resize_lower
    return resize(model, new_nO)
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_upper(model, new_nO):
    """Give the model's "upper" layer new_nO outputs.

    If the layer reports None for its "nO" dimension, the dimension is simply
    set. If it already equals new_nO, nothing happens. Otherwise a fresh layer
    is built with _define_upper, any existing "W"/"b" parameters are copied
    into its first rows, the added class indices are recorded in
    model.attrs["unseen_classes"], and the new layer replaces the old one
    both in model._layers[-1] and under the "upper" ref.

    RETURNS: The same model object.
    """
    old_upper = model.get_ref("upper")
    if old_upper.has_dim("nO") is None:
        old_upper.set_dim("nO", new_nO)
        return model
    if new_nO == old_upper.get_dim("nO"):
        return model

    nI = old_upper.maybe_get_dim("nI")
    with use_ops("numpy"):
        new_upper = _define_upper(nO=new_nO, nI=nI)
    # An uninitialized layer has no "W" yet; then there is nothing to copy.
    if old_upper.has_param("W"):
        new_W = new_upper.ops.alloc2f(new_nO, nI)
        new_b = new_upper.ops.alloc1f(new_nO)
        old_W = old_upper.get_param("W")
        old_b = old_upper.get_param("b")
        # W is laid out as (nr_out, nr_in), so growing the output only
        # appends rows after the existing ones.
        if old_upper.has_dim("nO"):
            old_nO = old_upper.get_dim("nO")
            new_W[:old_nO] = old_W
            new_b[:old_nO] = old_b
            for class_id in range(old_nO, new_nO):
                model.attrs["unseen_classes"].add(class_id)

        new_upper.set_param("W", new_W)
        new_upper.set_param("b", new_b)
    model._layers[-1] = new_upper
    model.set_ref("upper", new_upper)
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_lower(model, new_nO):
    """Give the model's "lower" layer new_nO outputs.

    If the layer reports None for its "nO" dimension, the dimension is simply
    set. Otherwise a fresh layer is built with _define_lower using the old
    layer's nI/nF/nP, any existing "W"/"b"/"pad" parameters are copied into
    the leading output slots, the added class indices are recorded in
    model.attrs["unseen_classes"], and the new layer replaces the old one
    both in model._layers[1] and under the "lower" ref.

    RETURNS: The same model object.
    """
    old_lower = model.get_ref("lower")
    if old_lower.has_dim("nO") is None:
        old_lower.set_dim("nO", new_nO)
        return model

    nI = old_lower.maybe_get_dim("nI")
    nF = old_lower.maybe_get_dim("nF")
    nP = old_lower.maybe_get_dim("nP")
    new_lower = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
    # An uninitialized layer has no "W" yet; then there is nothing to copy.
    if old_lower.has_param("W"):
        new_W = new_lower.ops.alloc4f(nF, new_nO, nP, nI)
        new_b = new_lower.ops.alloc2f(new_nO, nP)
        new_pad = new_lower.ops.alloc4f(1, nF, new_nO, nP)
        old_W = old_lower.get_param("W")
        old_b = old_lower.get_param("b")
        old_pad = old_lower.get_param("pad")
        # Carry the existing weights, padding and bias over into the
        # first old_nO output slots of the new arrays.
        if old_lower.has_dim("nO"):
            old_nO = old_lower.get_dim("nO")
            new_W[:, :old_nO, :, :] = old_W
            new_pad[:, :, :old_nO, :] = old_pad
            new_b[:old_nO, :] = old_b
            for class_id in range(old_nO, new_nO):
                model.attrs["unseen_classes"].add(class_id)

        new_lower.set_param("W", new_W)
        new_lower.set_param("b", new_b)
        new_lower.set_param("pad", new_pad)
    model._layers[1] = new_lower
    model.set_ref("lower", new_lower)
    return model
|
||||||
|
|
|
@ -4,15 +4,15 @@ from thinc.types import Floats2d
|
||||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||||
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
from thinc.api import with_cpu, Relu, residual, LayerNorm
|
||||||
from thinc.api import Relu, residual, expand_window
|
from thinc.layers.chain import init as init_chain
|
||||||
|
|
||||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
from ...attrs import ORTH
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
from ..featureextractor import FeatureExtractor
|
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
|
from .tok2vec import get_tok2vec_width
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatCNN.v1")
|
@registry.architectures.register("spacy.TextCatCNN.v1")
|
||||||
|
@ -69,124 +69,45 @@ def build_text_classifier_v2(
|
||||||
exclusive_classes = not linear_model.attrs["multi_label"]
|
exclusive_classes = not linear_model.attrs["multi_label"]
|
||||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||||
width = tok2vec.maybe_get_dim("nO")
|
width = tok2vec.maybe_get_dim("nO")
|
||||||
|
attention_layer = ParametricAttention(
|
||||||
|
width
|
||||||
|
) # TODO: benchmark performance difference of this layer
|
||||||
|
maxout_layer = Maxout(nO=width, nI=width)
|
||||||
|
norm_layer = LayerNorm(nI=width)
|
||||||
cnn_model = (
|
cnn_model = (
|
||||||
tok2vec
|
tok2vec
|
||||||
>> list2ragged()
|
>> list2ragged()
|
||||||
>> ParametricAttention(width) # TODO: benchmark performance difference of this layer
|
>> attention_layer
|
||||||
>> reduce_sum()
|
>> reduce_sum()
|
||||||
>> residual(Maxout(nO=width, nI=width))
|
>> residual(maxout_layer >> norm_layer >> Dropout(0.0))
|
||||||
>> Linear(nO=nO, nI=width)
|
|
||||||
>> Dropout(0.0)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
nO_double = nO * 2 if nO else None
|
nO_double = nO * 2 if nO else None
|
||||||
if exclusive_classes:
|
if exclusive_classes:
|
||||||
output_layer = Softmax(nO=nO, nI=nO_double)
|
output_layer = Softmax(nO=nO, nI=nO_double)
|
||||||
else:
|
else:
|
||||||
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
|
||||||
model = (linear_model | cnn_model) >> output_layer
|
model = (linear_model | cnn_model) >> output_layer
|
||||||
model.set_ref("tok2vec", tok2vec)
|
model.set_ref("tok2vec", tok2vec)
|
||||||
if model.has_dim("nO") is not False:
|
if model.has_dim("nO") is not False:
|
||||||
model.set_dim("nO", nO)
|
model.set_dim("nO", nO)
|
||||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||||
|
model.set_ref("attention_layer", attention_layer)
|
||||||
|
model.set_ref("maxout_layer", maxout_layer)
|
||||||
|
model.set_ref("norm_layer", norm_layer)
|
||||||
model.attrs["multi_label"] = not exclusive_classes
|
model.attrs["multi_label"] = not exclusive_classes
|
||||||
|
|
||||||
|
model.init = init_ensemble_textcat
|
||||||
return model
|
return model
|
||||||
|
|
||||||
# TODO: move to legacy
|
|
||||||
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
|
||||||
def build_text_classifier_v1(
|
|
||||||
width: int,
|
|
||||||
embed_size: int,
|
|
||||||
pretrained_vectors: Optional[bool],
|
|
||||||
exclusive_classes: bool,
|
|
||||||
ngram_size: int,
|
|
||||||
window_size: int,
|
|
||||||
conv_depth: int,
|
|
||||||
dropout: Optional[float],
|
|
||||||
nO: Optional[int] = None,
|
|
||||||
) -> Model:
|
|
||||||
# Don't document this yet, I'm not sure it's right.
|
|
||||||
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
|
||||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
|
||||||
lower = HashEmbed(
|
|
||||||
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
|
|
||||||
)
|
|
||||||
prefix = HashEmbed(
|
|
||||||
nO=width // 2,
|
|
||||||
nV=embed_size,
|
|
||||||
column=cols.index(PREFIX),
|
|
||||||
dropout=dropout,
|
|
||||||
seed=11,
|
|
||||||
)
|
|
||||||
suffix = HashEmbed(
|
|
||||||
nO=width // 2,
|
|
||||||
nV=embed_size,
|
|
||||||
column=cols.index(SUFFIX),
|
|
||||||
dropout=dropout,
|
|
||||||
seed=12,
|
|
||||||
)
|
|
||||||
shape = HashEmbed(
|
|
||||||
nO=width // 2,
|
|
||||||
nV=embed_size,
|
|
||||||
column=cols.index(SHAPE),
|
|
||||||
dropout=dropout,
|
|
||||||
seed=13,
|
|
||||||
)
|
|
||||||
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
|
|
||||||
trained_vectors = FeatureExtractor(cols) >> with_array(
|
|
||||||
uniqued(
|
|
||||||
(lower | prefix | suffix | shape)
|
|
||||||
>> Maxout(nO=width, nI=width_nI, normalize=True),
|
|
||||||
column=cols.index(ORTH),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if pretrained_vectors:
|
|
||||||
static_vectors = StaticVectors(width)
|
|
||||||
vector_layer = trained_vectors | static_vectors
|
|
||||||
vectors_width = width * 2
|
|
||||||
else:
|
|
||||||
vector_layer = trained_vectors
|
|
||||||
vectors_width = width
|
|
||||||
tok2vec = vector_layer >> with_array(
|
|
||||||
Maxout(width, vectors_width, normalize=True)
|
|
||||||
>> residual(
|
|
||||||
(
|
|
||||||
expand_window(window_size=window_size)
|
|
||||||
>> Maxout(
|
|
||||||
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
** conv_depth,
|
|
||||||
pad=conv_depth,
|
|
||||||
)
|
|
||||||
cnn_model = (
|
|
||||||
tok2vec
|
|
||||||
>> list2ragged()
|
|
||||||
>> ParametricAttention(width)
|
|
||||||
>> reduce_sum()
|
|
||||||
>> residual(Maxout(nO=width, nI=width))
|
|
||||||
>> Linear(nO=nO, nI=width)
|
|
||||||
>> Dropout(0.0)
|
|
||||||
)
|
|
||||||
|
|
||||||
linear_model = build_bow_text_classifier(
|
def init_ensemble_textcat(model, X, Y) -> Model:
|
||||||
nO=nO,
|
tok2vec_width = get_tok2vec_width(model)
|
||||||
ngram_size=ngram_size,
|
model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
|
||||||
exclusive_classes=exclusive_classes,
|
model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
|
||||||
no_output_layer=False,
|
model.get_ref("maxout_layer").set_dim("nI", tok2vec_width)
|
||||||
)
|
model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
|
||||||
nO_double = nO * 2 if nO else None
|
init_chain(model, X, Y)
|
||||||
if exclusive_classes:
|
|
||||||
output_layer = Softmax(nO=nO, nI=nO_double)
|
|
||||||
else:
|
|
||||||
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
|
||||||
model = (linear_model | cnn_model) >> output_layer
|
|
||||||
model.set_ref("tok2vec", tok2vec)
|
|
||||||
if model.has_dim("nO") is not False:
|
|
||||||
model.set_dim("nO", nO)
|
|
||||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
|
||||||
model.attrs["multi_label"] = not exclusive_classes
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,17 @@ def tok2vec_listener_v1(width: int, upstream: str = "*"):
|
||||||
return tok2vec
|
return tok2vec
|
||||||
|
|
||||||
|
|
||||||
|
def get_tok2vec_width(model: Model):
    """Look up the output width of the model's "tok2vec" reference.

    model (Model): The model to inspect.
    RETURNS: The tok2vec layer's "nO" dimension if it has one; otherwise the
        "nO" of its "listener" ref if that ref exists; otherwise None
        (also None when the model has no "tok2vec" ref).
    """
    if not model.has_ref("tok2vec"):
        return None
    tok2vec = model.get_ref("tok2vec")
    if tok2vec.has_dim("nO"):
        return tok2vec.get_dim("nO")
    if tok2vec.has_ref("listener"):
        listener = tok2vec.get_ref("listener")
        return listener.get_dim("nO")
    return None
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.HashEmbedCNN.v1")
|
@registry.architectures.register("spacy.HashEmbedCNN.v1")
|
||||||
def build_hash_embed_cnn_tok2vec(
|
def build_hash_embed_cnn_tok2vec(
|
||||||
*,
|
*,
|
||||||
|
@ -76,7 +87,7 @@ def build_hash_embed_cnn_tok2vec(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tok2Vec.v1")
|
@registry.architectures.register("spacy.Tok2Vec.v2")
|
||||||
def build_Tok2Vec_model(
|
def build_Tok2Vec_model(
|
||||||
embed: Model[List[Doc], List[Floats2d]],
|
embed: Model[List[Doc], List[Floats2d]],
|
||||||
encode: Model[List[Floats2d], List[Floats2d]],
|
encode: Model[List[Floats2d], List[Floats2d]],
|
||||||
|
@ -89,8 +100,7 @@ def build_Tok2Vec_model(
|
||||||
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
|
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
|
||||||
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
|
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
|
||||||
"""
|
"""
|
||||||
receptive_field = encode.attrs.get("receptive_field", 0)
|
tok2vec = chain(embed, encode)
|
||||||
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
|
|
||||||
tok2vec.set_dim("nO", encode.get_dim("nO"))
|
tok2vec.set_dim("nO", encode.get_dim("nO"))
|
||||||
tok2vec.set_ref("embed", embed)
|
tok2vec.set_ref("embed", embed)
|
||||||
tok2vec.set_ref("encode", encode)
|
tok2vec.set_ref("encode", encode)
|
||||||
|
@ -244,7 +254,7 @@ def CharacterEmbed(
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
|
||||||
def MaxoutWindowEncoder(
|
def MaxoutWindowEncoder(
|
||||||
width: int, window_size: int, maxout_pieces: int, depth: int
|
width: int, window_size: int, maxout_pieces: int, depth: int
|
||||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||||
|
@ -272,11 +282,11 @@ def MaxoutWindowEncoder(
|
||||||
)
|
)
|
||||||
model = clone(residual(cnn), depth)
|
model = clone(residual(cnn), depth)
|
||||||
model.set_dim("nO", width)
|
model.set_dim("nO", width)
|
||||||
model.attrs["receptive_field"] = window_size * depth
|
receptive_field = window_size * depth
|
||||||
return model
|
return with_array(model, pad=receptive_field)
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
@registry.architectures.register("spacy.MishWindowEncoder.v2")
|
||||||
def MishWindowEncoder(
|
def MishWindowEncoder(
|
||||||
width: int, window_size: int, depth: int
|
width: int, window_size: int, depth: int
|
||||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||||
|
@ -296,7 +306,7 @@ def MishWindowEncoder(
|
||||||
)
|
)
|
||||||
model = clone(residual(cnn), depth)
|
model = clone(residual(cnn), depth)
|
||||||
model.set_dim("nO", width)
|
model.set_dim("nO", width)
|
||||||
return model
|
return with_array(model)
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
||||||
|
@ -308,9 +318,9 @@ def BiLSTMEncoder(
|
||||||
width (int): The input and output width. These are required to be the same,
|
width (int): The input and output width. These are required to be the same,
|
||||||
to allow residual connections. This value will be determined by the
|
to allow residual connections. This value will be determined by the
|
||||||
width of the inputs. Recommended values are between 64 and 300.
|
width of the inputs. Recommended values are between 64 and 300.
|
||||||
window_size (int): The number of words to concatenate around each token
|
depth (int): The number of recurrent layers.
|
||||||
to construct the convolution. Recommended value is 1.
|
dropout (float): Creates a Dropout layer on the outputs of each LSTM layer
|
||||||
depth (int): The number of convolutional layers. Recommended value is 4.
|
except the last layer. Set to 0 to disable this functionality.
|
||||||
"""
|
"""
|
||||||
if depth == 0:
|
if depth == 0:
|
||||||
return noop()
|
return noop()
|
||||||
|
|
|
@ -42,9 +42,12 @@ def forward(
|
||||||
rows = model.ops.flatten(
|
rows = model.ops.flatten(
|
||||||
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
|
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
|
||||||
|
except ValueError:
|
||||||
|
raise RuntimeError(Errors.E896)
|
||||||
output = Ragged(
|
output = Ragged(
|
||||||
model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True),
|
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i")
|
||||||
model.ops.asarray([len(doc) for doc in docs], dtype="i"),
|
|
||||||
)
|
)
|
||||||
mask = None
|
mask = None
|
||||||
if is_train:
|
if is_train:
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
from thinc.api import Model, noop, use_ops, Linear
|
from thinc.api import Model, noop
|
||||||
from .parser_model import ParserStepModel
|
from .parser_model import ParserStepModel
|
||||||
|
|
||||||
|
|
||||||
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
|
def TransitionModel(
|
||||||
|
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
|
||||||
|
):
|
||||||
"""Set up a stepwise transition-based model"""
|
"""Set up a stepwise transition-based model"""
|
||||||
if upper is None:
|
if upper is None:
|
||||||
has_upper = False
|
has_upper = False
|
||||||
|
@ -44,43 +46,3 @@ def init(model, X=None, Y=None):
|
||||||
if model.attrs["has_upper"]:
|
if model.attrs["has_upper"]:
|
||||||
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
||||||
model.get_ref("upper").initialize(X=statevecs)
|
model.get_ref("upper").initialize(X=statevecs)
|
||||||
|
|
||||||
|
|
||||||
def resize_output(model, new_nO):
|
|
||||||
lower = model.get_ref("lower")
|
|
||||||
upper = model.get_ref("upper")
|
|
||||||
if not model.attrs["has_upper"]:
|
|
||||||
if lower.has_dim("nO") is None:
|
|
||||||
lower.set_dim("nO", new_nO)
|
|
||||||
return
|
|
||||||
elif upper.has_dim("nO") is None:
|
|
||||||
upper.set_dim("nO", new_nO)
|
|
||||||
return
|
|
||||||
elif new_nO == upper.get_dim("nO"):
|
|
||||||
return
|
|
||||||
smaller = upper
|
|
||||||
nI = None
|
|
||||||
if smaller.has_dim("nI"):
|
|
||||||
nI = smaller.get_dim("nI")
|
|
||||||
with use_ops("numpy"):
|
|
||||||
larger = Linear(nO=new_nO, nI=nI)
|
|
||||||
larger.init = smaller.init
|
|
||||||
# it could be that the model is not initialized yet, then skip this bit
|
|
||||||
if nI:
|
|
||||||
larger_W = larger.ops.alloc2f(new_nO, nI)
|
|
||||||
larger_b = larger.ops.alloc1f(new_nO)
|
|
||||||
smaller_W = smaller.get_param("W")
|
|
||||||
smaller_b = smaller.get_param("b")
|
|
||||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
|
||||||
# just adding rows here.
|
|
||||||
if smaller.has_dim("nO"):
|
|
||||||
larger_W[: smaller.get_dim("nO")] = smaller_W
|
|
||||||
larger_b[: smaller.get_dim("nO")] = smaller_b
|
|
||||||
for i in range(smaller.get_dim("nO"), new_nO):
|
|
||||||
model.attrs["unseen_classes"].add(i)
|
|
||||||
|
|
||||||
larger.set_param("W", larger_W)
|
|
||||||
larger.set_param("b", larger_b)
|
|
||||||
model._layers[-1] = larger
|
|
||||||
model.set_ref("upper", larger)
|
|
||||||
return model
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ cdef class Morphology:
|
||||||
FEATURE_SEP = "|"
|
FEATURE_SEP = "|"
|
||||||
FIELD_SEP = "="
|
FIELD_SEP = "="
|
||||||
VALUE_SEP = ","
|
VALUE_SEP = ","
|
||||||
# not an empty string so that the PreshMap key is not 0
|
# not an empty string so we can distinguish unset morph from empty morph
|
||||||
EMPTY_MORPH = symbols.NAMES[symbols._]
|
EMPTY_MORPH = symbols.NAMES[symbols._]
|
||||||
|
|
||||||
def __init__(self, StringStore strings):
|
def __init__(self, StringStore strings):
|
||||||
|
@ -50,8 +50,8 @@ cdef class Morphology:
|
||||||
"""
|
"""
|
||||||
cdef MorphAnalysisC* tag_ptr
|
cdef MorphAnalysisC* tag_ptr
|
||||||
if isinstance(features, str):
|
if isinstance(features, str):
|
||||||
if features == self.EMPTY_MORPH:
|
if features == "":
|
||||||
features = ""
|
features = self.EMPTY_MORPH
|
||||||
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
|
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
|
||||||
if tag_ptr != NULL:
|
if tag_ptr != NULL:
|
||||||
return tag_ptr.key
|
return tag_ptr.key
|
||||||
|
@ -71,13 +71,9 @@ cdef class Morphology:
|
||||||
))
|
))
|
||||||
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
|
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
|
||||||
# the hash key for the tag is either the hash of the normalized UFEATS
|
# the hash key for the tag is either the hash of the normalized UFEATS
|
||||||
# string or the hash of an empty placeholder (using the empty string
|
# string or the hash of an empty placeholder
|
||||||
# would give a hash key of 0, which is not good for PreshMap)
|
|
||||||
norm_feats_string = self.normalize_features(features)
|
norm_feats_string = self.normalize_features(features)
|
||||||
if norm_feats_string:
|
|
||||||
tag.key = self.strings.add(norm_feats_string)
|
tag.key = self.strings.add(norm_feats_string)
|
||||||
else:
|
|
||||||
tag.key = self.strings.add(self.EMPTY_MORPH)
|
|
||||||
self.insert(tag)
|
self.insert(tag)
|
||||||
return tag.key
|
return tag.key
|
||||||
|
|
||||||
|
@ -137,6 +133,7 @@ cdef class Morphology:
|
||||||
"""
|
"""
|
||||||
cdef MorphAnalysisC tag
|
cdef MorphAnalysisC tag
|
||||||
tag.length = len(field_feature_pairs)
|
tag.length = len(field_feature_pairs)
|
||||||
|
if tag.length > 0:
|
||||||
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
|
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
|
||||||
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
|
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
|
||||||
for i, (field, feature) in enumerate(field_feature_pairs):
|
for i, (field, feature) in enumerate(field_feature_pairs):
|
||||||
|
|
|
@ -11,6 +11,7 @@ from .senter import SentenceRecognizer
|
||||||
from .sentencizer import Sentencizer
|
from .sentencizer import Sentencizer
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
from .textcat import TextCategorizer
|
from .textcat import TextCategorizer
|
||||||
|
from .textcat_multilabel import MultiLabel_TextCategorizer
|
||||||
from .tok2vec import Tok2Vec
|
from .tok2vec import Tok2Vec
|
||||||
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
|
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
|
||||||
|
|
||||||
|
@ -22,13 +23,14 @@ __all__ = [
|
||||||
"EntityRuler",
|
"EntityRuler",
|
||||||
"Morphologizer",
|
"Morphologizer",
|
||||||
"Lemmatizer",
|
"Lemmatizer",
|
||||||
"TrainablePipe",
|
"MultiLabel_TextCategorizer",
|
||||||
"Pipe",
|
"Pipe",
|
||||||
"SentenceRecognizer",
|
"SentenceRecognizer",
|
||||||
"Sentencizer",
|
"Sentencizer",
|
||||||
"Tagger",
|
"Tagger",
|
||||||
"TextCategorizer",
|
"TextCategorizer",
|
||||||
"Tok2Vec",
|
"Tok2Vec",
|
||||||
|
"TrainablePipe",
|
||||||
"merge_entities",
|
"merge_entities",
|
||||||
"merge_noun_chunks",
|
"merge_noun_chunks",
|
||||||
"merge_subtokens",
|
"merge_subtokens",
|
||||||
|
|
0
spacy/pipeline/_parser_internals/__init__.pxd
Normal file
0
spacy/pipeline/_parser_internals/__init__.pxd
Normal file
6
spacy/pipeline/_parser_internals/_beam_utils.pxd
Normal file
6
spacy/pipeline/_parser_internals/_beam_utils.pxd
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
from ...typedefs cimport class_t, hash_t
|
||||||
|
|
||||||
|
# These are passed as callbacks to thinc.extra.search.Beam
|
||||||
|
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
|
||||||
|
|
||||||
|
cdef int check_final_state(void* _state, void* extra_args) except -1
|
296
spacy/pipeline/_parser_internals/_beam_utils.pyx
Normal file
296
spacy/pipeline/_parser_internals/_beam_utils.pyx
Normal file
|
@ -0,0 +1,296 @@
|
||||||
|
# cython: infer_types=True
|
||||||
|
# cython: profile=True
|
||||||
|
cimport numpy as np
|
||||||
|
import numpy
|
||||||
|
from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
from thinc.extra.search import MaxViolation
|
||||||
|
from thinc.extra.search cimport MaxViolation
|
||||||
|
|
||||||
|
from ...typedefs cimport hash_t, class_t
|
||||||
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
from ...errors import Errors
|
||||||
|
from .stateclass cimport StateC, StateClass
|
||||||
|
|
||||||
|
|
||||||
|
# These are passed as callbacks to thinc.extra.search.Beam
|
||||||
|
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    # Beam callback: copy the source parse state into the destination slot,
    # then apply transition number `clas` (with its own label) to the copy.
    # The void* arguments are cast back to the concrete types used here.
    cdef StateC* dest = <StateC*>_dest
    cdef StateC* src = <StateC*>_src
    cdef const Transition* moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest, moves[clas].label)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef int check_final_state(void* _state, void* extra_args) except -1:
    # Beam callback: report whether the given parse state is final.
    # `extra_args` is not used.
    return (<StateC*>_state).is_final()
|
||||||
|
|
||||||
|
|
||||||
|
cdef class BeamBatch(object):
|
||||||
|
cdef public TransitionSystem moves
|
||||||
|
cdef public object states
|
||||||
|
cdef public object docs
|
||||||
|
cdef public object golds
|
||||||
|
cdef public object beams
|
||||||
|
|
||||||
|
def __init__(self, TransitionSystem moves, states, golds,
             int width, float density=0.):
    """Create one Beam per input state.

    moves (TransitionSystem): Supplies n_moves and the beam state
        init/delete callbacks.
    states: The StateClass objects to start from; their .doc attributes
        are collected into self.docs.
    golds: Stored as self.golds.
    width (int): Width passed to each Beam.
    density (float): Passed to each Beam as min_density.
    """
    cdef StateClass state
    cdef Beam beam
    cdef StateC* beam_state
    self.moves = moves
    self.states = states
    self.docs = [state.doc for state in states]
    self.golds = golds
    self.beams = []
    for state in states:
        beam = Beam(self.moves.n_moves, width, min_density=density)
        beam.initialize(
            self.moves.init_beam_state,
            self.moves.del_beam_state,
            state.c.length,
            <void*>state.c._sent
        )
        # Every slot of the new beam inherits the offset of the input state.
        for slot in range(beam.width):
            beam_state = <StateC*>beam.at(slot)
            beam_state.offset = state.c.offset
        beam.check_done(check_final_state, NULL)
        self.beams.append(beam)
|
||||||
|
|
||||||
|
@property
def is_done(self):
    """True when every beam in the batch reports is_done."""
    for beam in self.beams:
        if not beam.is_done:
            return False
    return True
|
||||||
|
|
||||||
|
def __getitem__(self, i):
    """Return the entry of self.beams at index i."""
    beams = self.beams
    return beams[i]
|
||||||
|
|
||||||
|
def __len__(self):
    """Return the number of beams in the batch."""
    n_beams = len(self.beams)
    return n_beams
|
||||||
|
|
||||||
|
def get_states(self):
    """Collect a StateClass for every occupied slot of every beam.

    Each StateClass is created with StateClass.borrow from the beam's
    underlying state and the doc paired with that beam.
    RETURNS (list): The states, beam by beam, slot by slot.
    """
    cdef Beam beam
    cdef StateC* state
    borrowed = []
    for beam, doc in zip(self, self.docs):
        for slot in range(beam.size):
            state = <StateC*>beam.at(slot)
            borrowed.append(StateClass.borrow(state, doc))
    return borrowed
|
||||||
|
|
||||||
|
def get_unfinished_states(self):
    """Return the states from get_states() that are not final."""
    unfinished = []
    for st in self.get_states():
        if st.is_final():
            continue
        unfinished.append(st)
    return unfinished
|
||||||
|
|
||||||
|
def advance(self, float[:, ::1] scores, follow_gold=False):
|
||||||
|
cdef Beam beam
|
||||||
|
cdef int nr_class = scores.shape[1]
|
||||||
|
cdef const float* c_scores = &scores[0, 0]
|
||||||
|
docs = self.docs
|
||||||
|
for i, beam in enumerate(self):
|
||||||
|
if not beam.is_done:
|
||||||
|
nr_state = self._set_scores(beam, c_scores, nr_class)
|
||||||
|
assert nr_state
|
||||||
|
if self.golds is not None:
|
||||||
|
self._set_costs(
|
||||||
|
beam,
|
||||||
|
docs[i],
|
||||||
|
self.golds[i],
|
||||||
|
follow_gold=follow_gold
|
||||||
|
)
|
||||||
|
c_scores += nr_state * nr_class
|
||||||
|
beam.advance(transition_state, NULL, <void*>self.moves.c)
|
||||||
|
beam.check_done(check_final_state, NULL)
|
||||||
|
|
||||||
|
cdef int _set_scores(self, Beam beam, const float* scores, int nr_class) except -1:
|
||||||
|
cdef int nr_state = 0
|
||||||
|
for i in range(beam.size):
|
||||||
|
state = <StateC*>beam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
for j in range(nr_class):
|
||||||
|
beam.scores[i][j] = scores[nr_state * nr_class + j]
|
||||||
|
self.moves.set_valid(beam.is_valid[i], state)
|
||||||
|
nr_state += 1
|
||||||
|
else:
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
beam.scores[i][j] = 0
|
||||||
|
beam.costs[i][j] = 0
|
||||||
|
return nr_state
|
||||||
|
|
||||||
|
def _set_costs(self, Beam beam, doc, gold, int follow_gold=False):
|
||||||
|
cdef const StateC* state
|
||||||
|
for i in range(beam.size):
|
||||||
|
state = <const StateC*>beam.at(i)
|
||||||
|
if state.is_final():
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
beam.is_valid[i][j] = 0
|
||||||
|
beam.costs[i][j] = 9000
|
||||||
|
else:
|
||||||
|
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||||
|
state, gold)
|
||||||
|
if follow_gold:
|
||||||
|
min_cost = 0
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
if beam.is_valid[i][j] and beam.costs[i][j] < min_cost:
|
||||||
|
min_cost = beam.costs[i][j]
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
if beam.costs[i][j] > min_cost:
|
||||||
|
beam.is_valid[i][j] = 0
|
||||||
|
|
||||||
|
|
||||||
|
def update_beam(TransitionSystem moves, states, golds, model, int width, beam_density=0.0):
    """Run a predicted beam and a gold-following beam side by side and
    backpropagate the resulting max-violation gradient through `model`.

    The predicted batch uses `beam_density`; the gold batch uses density
    0.0 and is advanced with `follow_gold=True`. Returns the sum over
    steps of the mean squared gradient.
    """
    cdef MaxViolation violn
    pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
    gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    dones = [False] * len(states)
    while not (pbeam.is_done and gbeam.is_done):
        # Each step gets its own beam map. It maps a state, identified by
        # (example id, history), to its row in that step's flat scores
        # array, which later lets the per-state losses be turned into a
        # gradient for each (step, state, class).
        # States from both batches are gathered into one list. Some states
        # may occur in both, so p_indices and g_indices record which rows
        # belong to which batch.
        states, p_indices, g_indices, beam_map = get_unique_states(pbeam, gbeam)
        beam_maps.append(beam_map)
        if not states:
            break
        # Score the flat list of states and keep the backward callback.
        scores, bp_scores = model.begin_update(states)
        assert scores.size != 0
        backprops.append(bp_scores)
        # Hand each batch its own rows of scores and advance it. The gold
        # batch is constrained to follow only gold analyses.
        if not pbeam.is_done:
            pbeam.advance(model.ops.as_contig(scores[p_indices]))
        if not gbeam.is_done:
            gbeam.advance(model.ops.as_contig(scores[g_indices]), follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            if dones[i]:
                continue
            violn.check_crf(pbeam[i], gbeam[i])
            if pbeam[i].is_done and gbeam[i].is_done:
                dones[i] = True
    histories = []
    grads = []
    for violn in violns:
        if not violn.p_hist:
            histories.append([])
            grads.append([])
            continue
        histories.append(violn.p_hist + violn.g_hist)
        grads.append([d_l * violn.cost for d_l in violn.p_probs + violn.g_probs])
    loss = 0.0
    states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, grads)
    for d_scores, backprop in zip(states_d_scores, backprops):
        loss += (d_scores**2).mean()
        backprop(d_scores)
    return loss
|
||||||
|
|
||||||
|
|
||||||
|
def collect_states(beams, docs):
    """Return one StateClass per entry of `beams`.

    Entries that already are StateClass instances are passed through
    unchanged. Any other entry is treated as a Beam, and its slot 0 is
    wrapped with `StateClass.borrow` using the doc at the same position.
    """
    cdef StateClass state
    cdef Beam beam
    collected = []
    for item, doc in zip(beams, docs):
        if not isinstance(item, StateClass):
            beam = item
            state = StateClass.borrow(<StateC*>beam.at(0), doc)
            collected.append(state)
        else:
            collected.append(item)
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def get_unique_states(pbeams, gbeams):
    """Gather the non-final states of two beam batches into one flat list.

    A state is keyed by (example id, *history). States from `pbeams` are
    added first; a state from `gbeams` whose key was already seen reuses
    that row instead of being added again.

    Returns (states, p_indices, g_indices, beam_map): the flat state list,
    the rows belonging to each batch as integer arrays, and the mapping
    from key to row.

    Raises ValueError if the batches differ in length (E079) or if a key
    occurs twice within `pbeams` (E080).
    """
    cdef Beam pbeam, gbeam
    row_of_key = {}
    states = []
    p_indices = []
    g_indices = []
    beam_map = {}
    docs = pbeams.docs
    if len(pbeams) != len(gbeams):
        raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
    for eg_id, (pbeam, gbeam, doc) in enumerate(zip(pbeams, gbeams, docs)):
        if not pbeam.is_done:
            for i in range(pbeam.size):
                state = StateClass.borrow(<StateC*>pbeam.at(i), doc)
                if state.is_final():
                    continue
                key = tuple([eg_id] + pbeam.histories[i])
                if key in row_of_key:
                    raise ValueError(Errors.E080.format(key=key))
                row = len(states)
                row_of_key[key] = row
                p_indices.append(row)
                states.append(state)
            beam_map.update(row_of_key)
        if not gbeam.is_done:
            for i in range(gbeam.size):
                state = StateClass.borrow(<StateC*>gbeam.at(i), doc)
                if state.is_final():
                    continue
                key = tuple([eg_id] + gbeam.histories[i])
                if key in row_of_key:
                    # Already scored as part of the predicted batch.
                    g_indices.append(row_of_key[key])
                else:
                    row = len(states)
                    g_indices.append(row)
                    beam_map[key] = row
                    states.append(state)
    p_indices = numpy.asarray(p_indices, dtype='i')
    g_indices = numpy.asarray(g_indices, dtype='i')
    return states, p_indices, g_indices, beam_map
|
||||||
|
|
||||||
|
|
||||||
|
def get_gradient(nr_class, beam_maps, histories, losses):
    """Turn per-parse losses into per-step gradients of the action scores.

    The global model assigns a loss to each parse. The beam scores are
    additive, so the same gradient is applied to each action in the
    history. This gives the gradient of a single *action* for a beam
    state -- "the gradient of loss for taking action i given history H."

    nr_class: Number of actions (columns of every gradient array).
    beam_maps: One dict per step, mapping (example id, *history prefix)
        to the row of that state in the step's flat scores array.
    histories: Per example, a list of candidate histories; each history
        is a list of action ids (so: list of lists of lists of ints).
    losses: Per example, a list of losses aligned with `histories`.

    Returns a list with one float array of shape (n_rows, nr_class) per
    step, where n_rows is the largest row index in that step's beam map
    plus one. An empty batch yields an empty list.

    Raises ValueError (E081) if `histories` and `losses` differ in length.
    """
    # Validate before indexing, so that a mismatch is reported with the
    # intended error rather than as an IndexError from the loop below.
    if len(histories) != len(losses):
        raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
    # For each example, the longest history that carries a non-zero loss.
    nr_steps = []
    for hists, eg_losses in zip(histories, losses):
        nr_step = 0
        for loss, hist in zip(eg_losses, hists):
            assert not numpy.isnan(loss)
            if loss != 0.0:
                nr_step = max(nr_step, len(hist))
        nr_steps.append(nr_step)
    # max() of an empty list raises, so treat an empty batch as zero steps.
    max_steps = max(nr_steps) if nr_steps else 0
    grads = [
        numpy.zeros((max(beam_maps[i].values()) + 1, nr_class), dtype='f')
        for i in range(max_steps)
    ]
    for eg_id, (hists, eg_losses) in enumerate(zip(histories, losses)):
        for loss, hist in zip(eg_losses, hists):
            # NaN losses were already rejected in the first pass.
            if loss == 0.0:
                continue
            key = tuple([eg_id])
            # Adjust loss for length
            # We need to do this because each state in a short path is scored
            # multiple times, as we add in the average cost when we run out
            # of actions.
            avg_loss = loss / len(hist)
            loss += avg_loss * (nr_steps[eg_id] - len(hist))
            for step, clas in enumerate(hist):
                i = beam_maps[step][key]
                # In this step, at state row i, action clas resulted in loss.
                grads[step][i, clas] += loss
                key = key + tuple([clas])
    return grads
|
|
@ -1,6 +1,9 @@
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
|
cimport libcpp
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from libcpp.set cimport set
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
|
@ -14,89 +17,48 @@ from ...typedefs cimport attr_t
|
||||||
cdef inline bint is_space_token(const TokenC* token) nogil:
|
cdef inline bint is_space_token(const TokenC* token) nogil:
|
||||||
return Lexeme.c_check_flag(token.lex, IS_SPACE)
|
return Lexeme.c_check_flag(token.lex, IS_SPACE)
|
||||||
|
|
||||||
cdef struct RingBufferC:
|
cdef struct ArcC:
|
||||||
int[8] data
|
int head
|
||||||
int i
|
int child
|
||||||
int default
|
attr_t label
|
||||||
|
|
||||||
cdef inline int ring_push(RingBufferC* ring, int value) nogil:
|
|
||||||
ring.data[ring.i] = value
|
|
||||||
ring.i += 1
|
|
||||||
if ring.i >= 8:
|
|
||||||
ring.i = 0
|
|
||||||
|
|
||||||
cdef inline int ring_get(RingBufferC* ring, int i) nogil:
|
|
||||||
if i >= ring.i:
|
|
||||||
return ring.default
|
|
||||||
else:
|
|
||||||
return ring.data[ring.i-i]
|
|
||||||
|
|
||||||
|
|
||||||
cdef cppclass StateC:
|
cdef cppclass StateC:
|
||||||
int* _stack
|
int* _heads
|
||||||
int* _buffer
|
const TokenC* _sent
|
||||||
bint* shifted
|
vector[int] _stack
|
||||||
TokenC* _sent
|
vector[int] _rebuffer
|
||||||
SpanC* _ents
|
vector[SpanC] _ents
|
||||||
|
vector[ArcC] _left_arcs
|
||||||
|
vector[ArcC] _right_arcs
|
||||||
|
vector[libcpp.bool] _unshiftable
|
||||||
|
set[int] _sent_starts
|
||||||
TokenC _empty_token
|
TokenC _empty_token
|
||||||
RingBufferC _hist
|
|
||||||
int length
|
int length
|
||||||
int offset
|
int offset
|
||||||
int _s_i
|
|
||||||
int _b_i
|
int _b_i
|
||||||
int _e_i
|
|
||||||
int _break
|
|
||||||
|
|
||||||
__init__(const TokenC* sent, int length) nogil:
|
__init__(const TokenC* sent, int length) nogil:
|
||||||
cdef int PADDING = 5
|
this._sent = sent
|
||||||
this._buffer = <int*>calloc(length + (PADDING * 2), sizeof(int))
|
this._heads = <int*>calloc(length, sizeof(int))
|
||||||
this._stack = <int*>calloc(length + (PADDING * 2), sizeof(int))
|
if not (this._sent and this._heads):
|
||||||
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
|
||||||
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
|
|
||||||
this._ents = <SpanC*>calloc(length + (PADDING * 2), sizeof(SpanC))
|
|
||||||
if not (this._buffer and this._stack and this.shifted
|
|
||||||
and this._sent and this._ents):
|
|
||||||
with gil:
|
with gil:
|
||||||
PyErr_SetFromErrno(MemoryError)
|
PyErr_SetFromErrno(MemoryError)
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
memset(&this._hist, 0, sizeof(this._hist))
|
|
||||||
this.offset = 0
|
this.offset = 0
|
||||||
cdef int i
|
|
||||||
for i in range(length + (PADDING * 2)):
|
|
||||||
this._ents[i].end = -1
|
|
||||||
this._sent[i].l_edge = i
|
|
||||||
this._sent[i].r_edge = i
|
|
||||||
for i in range(PADDING):
|
|
||||||
this._sent[i].lex = &EMPTY_LEXEME
|
|
||||||
this._sent += PADDING
|
|
||||||
this._ents += PADDING
|
|
||||||
this._buffer += PADDING
|
|
||||||
this._stack += PADDING
|
|
||||||
this.shifted += PADDING
|
|
||||||
this.length = length
|
this.length = length
|
||||||
this._break = -1
|
|
||||||
this._s_i = 0
|
|
||||||
this._b_i = 0
|
this._b_i = 0
|
||||||
this._e_i = 0
|
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
this._buffer[i] = i
|
this._heads[i] = -1
|
||||||
|
this._unshiftable.push_back(0)
|
||||||
memset(&this._empty_token, 0, sizeof(TokenC))
|
memset(&this._empty_token, 0, sizeof(TokenC))
|
||||||
this._empty_token.lex = &EMPTY_LEXEME
|
this._empty_token.lex = &EMPTY_LEXEME
|
||||||
for i in range(length):
|
|
||||||
this._sent[i] = sent[i]
|
|
||||||
this._buffer[i] = i
|
|
||||||
for i in range(length, length+PADDING):
|
|
||||||
this._sent[i].lex = &EMPTY_LEXEME
|
|
||||||
|
|
||||||
__dealloc__():
|
__dealloc__():
|
||||||
cdef int PADDING = 5
|
free(this._heads)
|
||||||
free(this._sent - PADDING)
|
|
||||||
free(this._ents - PADDING)
|
|
||||||
free(this._buffer - PADDING)
|
|
||||||
free(this._stack - PADDING)
|
|
||||||
free(this.shifted - PADDING)
|
|
||||||
|
|
||||||
void set_context_tokens(int* ids, int n) nogil:
|
void set_context_tokens(int* ids, int n) nogil:
|
||||||
|
cdef int i, j
|
||||||
if n == 1:
|
if n == 1:
|
||||||
if this.B(0) >= 0:
|
if this.B(0) >= 0:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
|
@ -145,22 +107,18 @@ cdef cppclass StateC:
|
||||||
ids[11] = this.R(this.S(1), 1)
|
ids[11] = this.R(this.S(1), 1)
|
||||||
ids[12] = this.R(this.S(1), 2)
|
ids[12] = this.R(this.S(1), 2)
|
||||||
elif n == 6:
|
elif n == 6:
|
||||||
|
for i in range(6):
|
||||||
|
ids[i] = -1
|
||||||
if this.B(0) >= 0:
|
if this.B(0) >= 0:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
ids[1] = this.B(0)-1
|
if this.entity_is_open():
|
||||||
else:
|
ent = this.get_ent()
|
||||||
ids[0] = -1
|
j = 1
|
||||||
ids[1] = -1
|
for i in range(ent.start, this.B(0)):
|
||||||
ids[2] = this.B(1)
|
ids[j] = i
|
||||||
ids[3] = this.E(0)
|
j += 1
|
||||||
if ids[3] >= 1:
|
if j >= 6:
|
||||||
ids[4] = this.E(0)-1
|
break
|
||||||
else:
|
|
||||||
ids[4] = -1
|
|
||||||
if (ids[3]+1) < this.length:
|
|
||||||
ids[5] = this.E(0)+1
|
|
||||||
else:
|
|
||||||
ids[5] = -1
|
|
||||||
else:
|
else:
|
||||||
# TODO error =/
|
# TODO error =/
|
||||||
pass
|
pass
|
||||||
|
@ -171,329 +129,256 @@ cdef cppclass StateC:
|
||||||
ids[i] = -1
|
ids[i] = -1
|
||||||
|
|
||||||
int S(int i) nogil const:
|
int S(int i) nogil const:
|
||||||
if i >= this._s_i:
|
if i >= this._stack.size():
|
||||||
return -1
|
return -1
|
||||||
return this._stack[this._s_i - (i+1)]
|
elif i < 0:
|
||||||
|
return -1
|
||||||
|
return this._stack.at(this._stack.size() - (i+1))
|
||||||
|
|
||||||
int B(int i) nogil const:
|
int B(int i) nogil const:
|
||||||
if (i + this._b_i) >= this.length:
|
if i < 0:
|
||||||
return -1
|
return -1
|
||||||
return this._buffer[this._b_i + i]
|
elif i < this._rebuffer.size():
|
||||||
|
return this._rebuffer.at(this._rebuffer.size() - (i+1))
|
||||||
const TokenC* S_(int i) nogil const:
|
else:
|
||||||
return this.safe_get(this.S(i))
|
b_i = this._b_i + (i - this._rebuffer.size())
|
||||||
|
if b_i >= this.length:
|
||||||
|
return -1
|
||||||
|
else:
|
||||||
|
return b_i
|
||||||
|
|
||||||
const TokenC* B_(int i) nogil const:
|
const TokenC* B_(int i) nogil const:
|
||||||
return this.safe_get(this.B(i))
|
return this.safe_get(this.B(i))
|
||||||
|
|
||||||
const TokenC* H_(int i) nogil const:
|
|
||||||
return this.safe_get(this.H(i))
|
|
||||||
|
|
||||||
const TokenC* E_(int i) nogil const:
|
const TokenC* E_(int i) nogil const:
|
||||||
return this.safe_get(this.E(i))
|
return this.safe_get(this.E(i))
|
||||||
|
|
||||||
const TokenC* L_(int i, int idx) nogil const:
|
|
||||||
return this.safe_get(this.L(i, idx))
|
|
||||||
|
|
||||||
const TokenC* R_(int i, int idx) nogil const:
|
|
||||||
return this.safe_get(this.R(i, idx))
|
|
||||||
|
|
||||||
const TokenC* safe_get(int i) nogil const:
|
const TokenC* safe_get(int i) nogil const:
|
||||||
if i < 0 or i >= this.length:
|
if i < 0 or i >= this.length:
|
||||||
return &this._empty_token
|
return &this._empty_token
|
||||||
else:
|
else:
|
||||||
return &this._sent[i]
|
return &this._sent[i]
|
||||||
|
|
||||||
int H(int i) nogil const:
|
void get_arcs(vector[ArcC]* arcs) nogil const:
|
||||||
if i < 0 or i >= this.length:
|
for i in range(this._left_arcs.size()):
|
||||||
|
arc = this._left_arcs.at(i)
|
||||||
|
if arc.head != -1 and arc.child != -1:
|
||||||
|
arcs.push_back(arc)
|
||||||
|
for i in range(this._right_arcs.size()):
|
||||||
|
arc = this._right_arcs.at(i)
|
||||||
|
if arc.head != -1 and arc.child != -1:
|
||||||
|
arcs.push_back(arc)
|
||||||
|
|
||||||
|
int H(int child) nogil const:
|
||||||
|
if child >= this.length or child < 0:
|
||||||
return -1
|
return -1
|
||||||
return this._sent[i].head + i
|
else:
|
||||||
|
return this._heads[child]
|
||||||
|
|
||||||
int E(int i) nogil const:
|
int E(int i) nogil const:
|
||||||
if this._e_i <= 0 or this._e_i >= this.length:
|
if this._ents.size() == 0:
|
||||||
return -1
|
return -1
|
||||||
if i < 0 or i >= this._e_i:
|
|
||||||
return -1
|
|
||||||
return this._ents[this._e_i - (i+1)].start
|
|
||||||
|
|
||||||
int L(int i, int idx) nogil const:
|
|
||||||
if idx < 1:
|
|
||||||
return -1
|
|
||||||
if i < 0 or i >= this.length:
|
|
||||||
return -1
|
|
||||||
cdef const TokenC* target = &this._sent[i]
|
|
||||||
if target.l_kids < <uint32_t>idx:
|
|
||||||
return -1
|
|
||||||
cdef const TokenC* ptr = &this._sent[target.l_edge]
|
|
||||||
|
|
||||||
while ptr < target:
|
|
||||||
# If this head is still to the right of us, we can skip to it
|
|
||||||
# No token that's between this token and this head could be our
|
|
||||||
# child.
|
|
||||||
if (ptr.head >= 1) and (ptr + ptr.head) < target:
|
|
||||||
ptr += ptr.head
|
|
||||||
|
|
||||||
elif ptr + ptr.head == target:
|
|
||||||
idx -= 1
|
|
||||||
if idx == 0:
|
|
||||||
return ptr - this._sent
|
|
||||||
ptr += 1
|
|
||||||
else:
|
else:
|
||||||
ptr += 1
|
return this._ents.back().start
|
||||||
return -1
|
|
||||||
|
|
||||||
int R(int i, int idx) nogil const:
|
int L(int head, int idx) nogil const:
|
||||||
if idx < 1:
|
if idx < 1 or this._left_arcs.size() == 0:
|
||||||
return -1
|
return -1
|
||||||
if i < 0 or i >= this.length:
|
cdef vector[int] lefts
|
||||||
|
for i in range(this._left_arcs.size()):
|
||||||
|
arc = this._left_arcs.at(i)
|
||||||
|
if arc.head == head and arc.child != -1 and arc.child < head:
|
||||||
|
lefts.push_back(arc.child)
|
||||||
|
idx = (<int>lefts.size()) - idx
|
||||||
|
if idx < 0:
|
||||||
return -1
|
return -1
|
||||||
cdef const TokenC* target = &this._sent[i]
|
|
||||||
if target.r_kids < <uint32_t>idx:
|
|
||||||
return -1
|
|
||||||
cdef const TokenC* ptr = &this._sent[target.r_edge]
|
|
||||||
while ptr > target:
|
|
||||||
# If this head is still to the right of us, we can skip to it
|
|
||||||
# No token that's between this token and this head could be our
|
|
||||||
# child.
|
|
||||||
if (ptr.head < 0) and ((ptr + ptr.head) > target):
|
|
||||||
ptr += ptr.head
|
|
||||||
elif ptr + ptr.head == target:
|
|
||||||
idx -= 1
|
|
||||||
if idx == 0:
|
|
||||||
return ptr - this._sent
|
|
||||||
ptr -= 1
|
|
||||||
else:
|
else:
|
||||||
ptr -= 1
|
return lefts.at(idx)
|
||||||
|
|
||||||
|
int R(int head, int idx) nogil const:
|
||||||
|
if idx < 1 or this._right_arcs.size() == 0:
|
||||||
return -1
|
return -1
|
||||||
|
cdef vector[int] rights
|
||||||
|
for i in range(this._right_arcs.size()):
|
||||||
|
arc = this._right_arcs.at(i)
|
||||||
|
if arc.head == head and arc.child != -1 and arc.child > head:
|
||||||
|
rights.push_back(arc.child)
|
||||||
|
idx = (<int>rights.size()) - idx
|
||||||
|
if idx < 0:
|
||||||
|
return -1
|
||||||
|
else:
|
||||||
|
return rights.at(idx)
|
||||||
|
|
||||||
bint empty() nogil const:
|
bint empty() nogil const:
|
||||||
return this._s_i <= 0
|
return this._stack.size() == 0
|
||||||
|
|
||||||
bint eol() nogil const:
|
bint eol() nogil const:
|
||||||
return this.buffer_length() == 0
|
return this.buffer_length() == 0
|
||||||
|
|
||||||
bint at_break() nogil const:
|
|
||||||
return this._break != -1
|
|
||||||
|
|
||||||
bint is_final() nogil const:
|
bint is_final() nogil const:
|
||||||
return this.stack_depth() <= 0 and this._b_i >= this.length
|
return this.stack_depth() <= 0 and this.eol()
|
||||||
|
|
||||||
bint has_head(int i) nogil const:
|
int cannot_sent_start(int word) nogil const:
|
||||||
return this.safe_get(i).head != 0
|
if word < 0 or word >= this.length:
|
||||||
|
return 0
|
||||||
|
elif this._sent[word].sent_start == -1:
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
int n_L(int i) nogil const:
|
int is_sent_start(int word) nogil const:
|
||||||
return this.safe_get(i).l_kids
|
if word < 0 or word >= this.length:
|
||||||
|
return 0
|
||||||
|
elif this._sent[word].sent_start == 1:
|
||||||
|
return 1
|
||||||
|
elif this._sent_starts.count(word) >= 1:
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
int n_R(int i) nogil const:
|
void set_sent_start(int word, int value) nogil:
|
||||||
return this.safe_get(i).r_kids
|
if value >= 1:
|
||||||
|
this._sent_starts.insert(word)
|
||||||
|
|
||||||
|
bint has_head(int child) nogil const:
|
||||||
|
return this._heads[child] >= 0
|
||||||
|
|
||||||
|
int l_edge(int word) nogil const:
|
||||||
|
return word
|
||||||
|
|
||||||
|
int r_edge(int word) nogil const:
|
||||||
|
return word
|
||||||
|
|
||||||
|
int n_L(int head) nogil const:
|
||||||
|
cdef int n = 0
|
||||||
|
for i in range(this._left_arcs.size()):
|
||||||
|
arc = this._left_arcs.at(i)
|
||||||
|
if arc.head == head and arc.child != -1 and arc.child < arc.head:
|
||||||
|
n += 1
|
||||||
|
return n
|
||||||
|
|
||||||
|
int n_R(int head) nogil const:
|
||||||
|
cdef int n = 0
|
||||||
|
for i in range(this._right_arcs.size()):
|
||||||
|
arc = this._right_arcs.at(i)
|
||||||
|
if arc.head == head and arc.child != -1 and arc.child > arc.head:
|
||||||
|
n += 1
|
||||||
|
return n
|
||||||
|
|
||||||
bint stack_is_connected() nogil const:
|
bint stack_is_connected() nogil const:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
bint entity_is_open() nogil const:
|
bint entity_is_open() nogil const:
|
||||||
if this._e_i < 1:
|
if this._ents.size() == 0:
|
||||||
return False
|
return False
|
||||||
return this._ents[this._e_i-1].end == -1
|
else:
|
||||||
|
return this._ents.back().end == -1
|
||||||
|
|
||||||
int stack_depth() nogil const:
|
int stack_depth() nogil const:
|
||||||
return this._s_i
|
return this._stack.size()
|
||||||
|
|
||||||
int buffer_length() nogil const:
|
int buffer_length() nogil const:
|
||||||
if this._break != -1:
|
|
||||||
return this._break - this._b_i
|
|
||||||
else:
|
|
||||||
return this.length - this._b_i
|
return this.length - this._b_i
|
||||||
|
|
||||||
uint64_t hash() nogil const:
|
|
||||||
cdef TokenC[11] sig
|
|
||||||
sig[0] = this.S_(2)[0]
|
|
||||||
sig[1] = this.S_(1)[0]
|
|
||||||
sig[2] = this.R_(this.S(1), 1)[0]
|
|
||||||
sig[3] = this.L_(this.S(0), 1)[0]
|
|
||||||
sig[4] = this.L_(this.S(0), 2)[0]
|
|
||||||
sig[5] = this.S_(0)[0]
|
|
||||||
sig[6] = this.R_(this.S(0), 2)[0]
|
|
||||||
sig[7] = this.R_(this.S(0), 1)[0]
|
|
||||||
sig[8] = this.B_(0)[0]
|
|
||||||
sig[9] = this.E_(0)[0]
|
|
||||||
sig[10] = this.E_(1)[0]
|
|
||||||
return hash64(sig, sizeof(sig), this._s_i) \
|
|
||||||
+ hash64(<void*>&this._hist, sizeof(RingBufferC), 1)
|
|
||||||
|
|
||||||
void push_hist(int act) nogil:
|
|
||||||
ring_push(&this._hist, act+1)
|
|
||||||
|
|
||||||
int get_hist(int i) nogil:
|
|
||||||
return ring_get(&this._hist, i)
|
|
||||||
|
|
||||||
void push() nogil:
|
void push() nogil:
|
||||||
if this.B(0) != -1:
|
b0 = this.B(0)
|
||||||
this._stack[this._s_i] = this.B(0)
|
if this._rebuffer.size():
|
||||||
this._s_i += 1
|
b0 = this._rebuffer.back()
|
||||||
|
this._rebuffer.pop_back()
|
||||||
|
else:
|
||||||
|
b0 = this._b_i
|
||||||
this._b_i += 1
|
this._b_i += 1
|
||||||
if this.safe_get(this.B_(0).l_edge).sent_start == 1:
|
this._stack.push_back(b0)
|
||||||
this.set_break(this.B_(0).l_edge)
|
|
||||||
if this._b_i > this._break:
|
|
||||||
this._break = -1
|
|
||||||
|
|
||||||
void pop() nogil:
|
void pop() nogil:
|
||||||
if this._s_i >= 1:
|
this._stack.pop_back()
|
||||||
this._s_i -= 1
|
|
||||||
|
|
||||||
void force_final() nogil:
|
void force_final() nogil:
|
||||||
# This should only be used in desperate situations, as it may leave
|
# This should only be used in desperate situations, as it may leave
|
||||||
# the analysis in an unexpected state.
|
# the analysis in an unexpected state.
|
||||||
this._s_i = 0
|
this._stack.clear()
|
||||||
this._b_i = this.length
|
this._b_i = this.length
|
||||||
|
|
||||||
void unshift() nogil:
|
void unshift() nogil:
|
||||||
this._b_i -= 1
|
s0 = this._stack.back()
|
||||||
this._buffer[this._b_i] = this.S(0)
|
this._unshiftable[s0] = 1
|
||||||
this._s_i -= 1
|
this._rebuffer.push_back(s0)
|
||||||
this.shifted[this.B(0)] = True
|
this._stack.pop_back()
|
||||||
|
|
||||||
|
int is_unshiftable(int item) nogil const:
|
||||||
|
if item >= this._unshiftable.size():
|
||||||
|
return 0
|
||||||
|
else:
|
||||||
|
return this._unshiftable.at(item)
|
||||||
|
|
||||||
|
void set_reshiftable(int item) nogil:
|
||||||
|
if item < this._unshiftable.size():
|
||||||
|
this._unshiftable[item] = 0
|
||||||
|
|
||||||
void add_arc(int head, int child, attr_t label) nogil:
|
void add_arc(int head, int child, attr_t label) nogil:
|
||||||
if this.has_head(child):
|
if this.has_head(child):
|
||||||
this.del_arc(this.H(child), child)
|
this.del_arc(this.H(child), child)
|
||||||
|
cdef ArcC arc
|
||||||
cdef int dist = head - child
|
arc.head = head
|
||||||
this._sent[child].head = dist
|
arc.child = child
|
||||||
this._sent[child].dep = label
|
arc.label = label
|
||||||
cdef int i
|
if head > child:
|
||||||
if child > head:
|
this._left_arcs.push_back(arc)
|
||||||
this._sent[head].r_kids += 1
|
|
||||||
# Some transition systems can have a word in the buffer have a
|
|
||||||
# rightward child, e.g. from Unshift.
|
|
||||||
this._sent[head].r_edge = this._sent[child].r_edge
|
|
||||||
i = 0
|
|
||||||
while this.has_head(head) and i < this.length:
|
|
||||||
head = this.H(head)
|
|
||||||
this._sent[head].r_edge = this._sent[child].r_edge
|
|
||||||
i += 1 # Guard against infinite loops
|
|
||||||
else:
|
else:
|
||||||
this._sent[head].l_kids += 1
|
this._right_arcs.push_back(arc)
|
||||||
this._sent[head].l_edge = this._sent[child].l_edge
|
this._heads[child] = head
|
||||||
|
|
||||||
void del_arc(int h_i, int c_i) nogil:
|
void del_arc(int h_i, int c_i) nogil:
|
||||||
cdef int dist = h_i - c_i
|
cdef vector[ArcC]* arcs
|
||||||
cdef TokenC* h = &this._sent[h_i]
|
if h_i > c_i:
|
||||||
cdef int i = 0
|
arcs = &this._left_arcs
|
||||||
if c_i > h_i:
|
|
||||||
# this.R_(h_i, 2) returns the second-rightmost child token of h_i
|
|
||||||
# If we have more than 2 rightmost children, our 2nd rightmost child's
|
|
||||||
# rightmost edge is going to be our new rightmost edge.
|
|
||||||
h.r_edge = this.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
|
|
||||||
h.r_kids -= 1
|
|
||||||
new_edge = h.r_edge
|
|
||||||
# Correct upwards in the tree --- see Issue #251
|
|
||||||
while h.head < 0 and i < this.length: # Guard infinite loop
|
|
||||||
h += h.head
|
|
||||||
h.r_edge = new_edge
|
|
||||||
i += 1
|
|
||||||
else:
|
else:
|
||||||
# Same logic applies for left edge, but we don't need to walk up
|
arcs = &this._right_arcs
|
||||||
# the tree, as the head is off the stack.
|
if arcs.size() == 0:
|
||||||
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
return
|
||||||
h.l_kids -= 1
|
arc = arcs.back()
|
||||||
|
if arc.head == h_i and arc.child == c_i:
|
||||||
|
arcs.pop_back()
|
||||||
|
else:
|
||||||
|
for i in range(arcs.size()-1):
|
||||||
|
arc = arcs.at(i)
|
||||||
|
if arc.head == h_i and arc.child == c_i:
|
||||||
|
arc.head = -1
|
||||||
|
arc.child = -1
|
||||||
|
arc.label = 0
|
||||||
|
break
|
||||||
|
|
||||||
|
SpanC get_ent() nogil const:
|
||||||
|
cdef SpanC ent
|
||||||
|
if this._ents.size() == 0:
|
||||||
|
ent.start = 0
|
||||||
|
ent.end = 0
|
||||||
|
ent.label = 0
|
||||||
|
return ent
|
||||||
|
else:
|
||||||
|
return this._ents.back()
|
||||||
|
|
||||||
void open_ent(attr_t label) nogil:
|
void open_ent(attr_t label) nogil:
|
||||||
this._ents[this._e_i].start = this.B(0)
|
cdef SpanC ent
|
||||||
this._ents[this._e_i].label = label
|
ent.start = this.B(0)
|
||||||
this._ents[this._e_i].end = -1
|
ent.label = label
|
||||||
this._e_i += 1
|
ent.end = -1
|
||||||
|
this._ents.push_back(ent)
|
||||||
|
|
||||||
void close_ent() nogil:
|
void close_ent() nogil:
|
||||||
# Note that we don't decrement _e_i here! We want to maintain all
|
this._ents.back().end = this.B(0)+1
|
||||||
# entities, not over-write them...
|
|
||||||
this._ents[this._e_i-1].end = this.B(0)+1
|
|
||||||
this._sent[this.B(0)].ent_iob = 1
|
|
||||||
|
|
||||||
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
|
|
||||||
if 0 <= i < this.length:
|
|
||||||
this._sent[i].ent_iob = ent_iob
|
|
||||||
this._sent[i].ent_type = ent_type
|
|
||||||
|
|
||||||
void set_break(int i) nogil:
|
|
||||||
if 0 <= i < this.length:
|
|
||||||
this._sent[i].sent_start = 1
|
|
||||||
this._break = this._b_i
|
|
||||||
|
|
||||||
void clone(const StateC* src) nogil:
|
void clone(const StateC* src) nogil:
|
||||||
this.length = src.length
|
this.length = src.length
|
||||||
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
|
this._sent = src._sent
|
||||||
memcpy(this._stack, src._stack, this.length * sizeof(int))
|
this._stack = src._stack
|
||||||
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
|
this._rebuffer = src._rebuffer
|
||||||
memcpy(this._ents, src._ents, this.length * sizeof(SpanC))
|
this._sent_starts = src._sent_starts
|
||||||
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
|
this._unshiftable = src._unshiftable
|
||||||
|
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
|
||||||
|
this._ents = src._ents
|
||||||
|
this._left_arcs = src._left_arcs
|
||||||
|
this._right_arcs = src._right_arcs
|
||||||
this._b_i = src._b_i
|
this._b_i = src._b_i
|
||||||
this._s_i = src._s_i
|
|
||||||
this._e_i = src._e_i
|
|
||||||
this._break = src._break
|
|
||||||
this.offset = src.offset
|
this.offset = src.offset
|
||||||
this._empty_token = src._empty_token
|
this._empty_token = src._empty_token
|
||||||
|
|
||||||
void fast_forward() nogil:
|
|
||||||
# space token attachement policy:
|
|
||||||
# - attach space tokens always to the last preceding real token
|
|
||||||
# - except if it's the beginning of a sentence, then attach to the first following
|
|
||||||
# - boundary case: a document containing multiple space tokens but nothing else,
|
|
||||||
# then make the last space token the head of all others
|
|
||||||
|
|
||||||
while is_space_token(this.B_(0)) \
|
|
||||||
or this.buffer_length() == 0 \
|
|
||||||
or this.stack_depth() == 0:
|
|
||||||
if this.buffer_length() == 0:
|
|
||||||
# remove the last sentence's root from the stack
|
|
||||||
if this.stack_depth() == 1:
|
|
||||||
this.pop()
|
|
||||||
# parser got stuck: reduce stack or unshift
|
|
||||||
elif this.stack_depth() > 1:
|
|
||||||
if this.has_head(this.S(0)):
|
|
||||||
this.pop()
|
|
||||||
else:
|
|
||||||
this.unshift()
|
|
||||||
# stack is empty but there is another sentence on the buffer
|
|
||||||
elif (this.length - this._b_i) >= 1:
|
|
||||||
this.push()
|
|
||||||
else: # stack empty and nothing else coming
|
|
||||||
break
|
|
||||||
|
|
||||||
elif is_space_token(this.B_(0)):
|
|
||||||
# the normal case: we're somewhere inside a sentence
|
|
||||||
if this.stack_depth() > 0:
|
|
||||||
# assert not is_space_token(this.S_(0))
|
|
||||||
# attach all coming space tokens to their last preceding
|
|
||||||
# real token (which should be on the top of the stack)
|
|
||||||
while is_space_token(this.B_(0)):
|
|
||||||
this.add_arc(this.S(0),this.B(0),0)
|
|
||||||
this.push()
|
|
||||||
this.pop()
|
|
||||||
# the rare case: we're at the beginning of a document:
|
|
||||||
# space tokens are attached to the first real token on the buffer
|
|
||||||
elif this.stack_depth() == 0:
|
|
||||||
# store all space tokens on the stack until a real token shows up
|
|
||||||
# or the last token on the buffer is reached
|
|
||||||
while is_space_token(this.B_(0)) and this.buffer_length() > 1:
|
|
||||||
this.push()
|
|
||||||
# empty the stack by attaching all space tokens to the
|
|
||||||
# first token on the buffer
|
|
||||||
# boundary case: if all tokens are space tokens, the last one
|
|
||||||
# becomes the head of all others
|
|
||||||
while this.stack_depth() > 0:
|
|
||||||
this.add_arc(this.B(0),this.S(0),0)
|
|
||||||
this.pop()
|
|
||||||
# move the first token onto the stack
|
|
||||||
this.push()
|
|
||||||
|
|
||||||
elif this.stack_depth() == 0:
|
|
||||||
# for one token sentences (?)
|
|
||||||
if this.buffer_length() == 1:
|
|
||||||
this.push()
|
|
||||||
this.pop()
|
|
||||||
# with an empty stack and a non-empty buffer
|
|
||||||
# only shift is valid anyway
|
|
||||||
elif (this.length - this._b_i) >= 1:
|
|
||||||
this.push()
|
|
||||||
|
|
||||||
else: # can this even happen?
|
|
||||||
break
|
|
||||||
|
|
|
@ -1,11 +1,7 @@
|
||||||
from .stateclass cimport StateClass
|
from ._state cimport StateC
|
||||||
from ...typedefs cimport weight_t, attr_t
|
from ...typedefs cimport weight_t, attr_t
|
||||||
from .transition_system cimport Transition, TransitionSystem
|
from .transition_system cimport Transition, TransitionSystem
|
||||||
|
|
||||||
|
|
||||||
cdef class ArcEager(TransitionSystem):
|
cdef class ArcEager(TransitionSystem):
|
||||||
pass
|
cdef get_arcs(self, StateC* state)
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil
|
|
||||||
cdef weight_t arc_cost(StateClass stcls, const void* _gold, int head, int child) nogil
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user