Merge pull request #8813 from adrianeboyd/chore/develop-v3.2

Update develop for v3.2
Adriane Boyd 2021-07-27 11:26:18 +02:00 committed by GitHub
commit 4f28190afe
244 changed files with 8332 additions and 1530 deletions


@ -1,18 +0,0 @@
<!--- Please provide a summary in the title and describe your issue here.
Is this a bug or feature request? If a bug, include all the steps that led to the issue.
If you're looking for help with your code, consider posting a question here:
- GitHub Discussions: https://github.com/explosion/spaCy/discussions
- Stack Overflow: http://stackoverflow.com/questions/tagged/spacy
-->
## Your Environment
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type
`python -m spacy info --markdown` and copy-paste the result here.-->
- Operating System:
- Python Version Used:
- spaCy Version Used:
- Environment Information:


@ -1,6 +1,6 @@
---
name: "\U0001F6A8 Bug Report"
about: Did you come across a bug or unexpected behaviour differing from the docs?
name: "\U0001F6A8 Submit a Bug Report"
about: Use this template if you came across a bug or unexpected behaviour differing from the docs.
---


@ -1,5 +1,5 @@
---
name: "\U0001F4DA Documentation"
name: "\U0001F4DA Submit a Documentation Report"
about: Did you spot a mistake in the docs, is anything unclear or do you have a
suggestion?


@ -1,19 +0,0 @@
---
name: "\U0001F4AC Anything else?"
about: For feature and project ideas, general usage questions or help with your code, please post on the GitHub Discussions board instead.
---
<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and enhancements. If you're looking for help with your code, consider posting a question here:
- GitHub Discussions: https://github.com/explosion/spaCy/discussions
- Stack Overflow: http://stackoverflow.com/questions/tagged/spacy
-->
## Your Environment
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
- Operating System:
- Python Version Used:
- spaCy Version Used:
- Environment Information:

14
.github/ISSUE_TEMPLATE/config.yml vendored Normal file

@ -0,0 +1,14 @@
blank_issues_enabled: false
contact_links:
- name: 🗯 Discussions Forum
url: https://github.com/explosion/spaCy/discussions
about: Usage questions, general discussion and anything else that isn't a bug report.
- name: 📖 spaCy FAQ & Troubleshooting
url: https://github.com/explosion/spaCy/discussions/8226
about: Before you post, check out the FAQ for answers to common community questions!
- name: 💫 spaCy Usage Guides & API reference
url: https://spacy.io/usage
about: Everything you need to know about spaCy and how to use it.
- name: 🛠 Submit a Pull Request
url: https://github.com/explosion/spaCy/pulls
about: Did you spot a mistake and know how to fix it? Feel free to submit a PR straight away!


@ -11,6 +11,10 @@ steps:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
displayName: 'Set variables'
- script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
@ -41,7 +45,7 @@ steps:
displayName: "Install test requirements"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
@ -55,3 +59,44 @@ steps:
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
- script: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
displayName: 'Test debug config CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
displayName: 'Test debug data CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
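For quick local debugging, the same CLI smoke tests can be run outside the pipeline. A minimal sketch, assuming a spaCy source checkout (so that `extra/example_data/ner_example_data/ner-token-per-line-conll2003.json` exists) and a CPU-only environment; every command below is taken from the steps above:

# minimal local run of the CLI smoke tests (CPU only, from a spaCy source checkout)
python -m spacy download ca_core_news_sm
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1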


@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [X] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------------- |
| Name | Kenneth Enevoldsen |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021-07-13 |
| GitHub username | KennethEnevoldsen |
| Website (optional) | www.kennethenevoldsen.com |
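As the agreement above explains, signing it comes down to adding a single markdown file named after your GitHub username under `.github/contributors/`. A hypothetical sketch for the user example_user; the template path `.github/CONTRIBUTOR_AGREEMENT.md` is an assumption:

# hypothetical example for the user example_user (template path is an assumption)
cp .github/CONTRIBUTOR_AGREEMENT.md .github/contributors/example_user.md
# fill in the Contributor Details table and mark exactly one of the two statements, then:
git add .github/contributors/example_user.md
git commit -m "Sign spaCy contributor agreement (example_user)"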

106
.github/contributors/ZeeD.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Vito De Tullio |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021-06-01 |
| GitHub username | ZeeD |
| Website (optional) | |

106
.github/contributors/bodak.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Kristian Boda |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 18.05.2021 |
| GitHub username | bodak |
| Website (optional) | |

106
.github/contributors/gtoffoli.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | ------------------------ |
| Name | Giovanni Toffoli |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021-05-12 |
| GitHub username | gtoffoli |
| Website (optional) | |

106
.github/contributors/jklaise.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name |Janis Klaise |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date |26/04/2021 |
| GitHub username |jklaise |
| Website (optional) |janisklaise.com |

106
.github/contributors/jmyerston.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
| ----------------------------- | ----------------------------------- |
| Name | Jacobo Myerston |
| Company name (if applicable) | University of California, San Diego |
| Title or role (if applicable) | Academic |
| Date | 07/05/2021 |
| GitHub username | jmyerston |
| Website (optional) | diogenet.ucsd.edu |

106
.github/contributors/julien-talkair.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Julien Rossi |
| Company name (if applicable) | TalkAir BV |
| Title or role (if applicable) | CTO, Partner |
| Date | June 28 2021 |
| GitHub username | julien-talkair |
| Website (optional) | |

106
.github/contributors/juliensalinas.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [X] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
| ----------------------------- | ------------------- |
| Name | Julien Salinas |
| Company name (if applicable) | NLP Cloud |
| Title or role (if applicable) | Founder and CTO |
| Date                          | May 14th 2021       |
| GitHub username | juliensalinas |
| Website (optional) | https://nlpcloud.io |

106
.github/contributors/mariosasko.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Mario Šaško |
| Company name (if applicable) | TakeLab FER |
| Title or role (if applicable) | R&D Intern |
| Date | 2021-07-12 |
| GitHub username | mariosasko |
| Website (optional) | |

106
.github/contributors/narayanacharya6.md vendored Normal file

@ -0,0 +1,106 @@
# spaCy contributor agreement
(The agreement text is identical to the SCA above.)
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Narayan Acharya |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 29 APR 2021 |
| GitHub username | narayanacharya6 |
| Website (optional) | narayanacharya.com |

106
.github/contributors/sevdimali.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Sevdimali |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 10/4/2021 |
| GitHub username | sevdimali |
| Website (optional) | https://sevdimali.me |

106
.github/contributors/thomashacker.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Edward Schmuhl |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09.07.2021 |
| GitHub username | thomashacker |
| Website (optional) | |

106
.github/contributors/xadrianzetx.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name |Adrian Zuber |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date |20-06-2021 |
| GitHub username |xadrianzetx |
| Website (optional) | |

106
.github/contributors/yohasebe.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Yoichiro Hasebe |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | July 4th, 2021 |
| GitHub username | yohasebe |
| Website (optional) | https://yohasebe.com |

44
.github/workflows/autoblack.yml vendored Normal file
View File

@ -0,0 +1,44 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
name: autoblack
on:
workflow_dispatch: # allow manual trigger
schedule:
- cron: '0 8 * * 5' # every Friday at 8am UTC
jobs:
autoblack:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- run: pip install black
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero exit
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v3
with:
title: Auto-format code with black
labels: meta
commit-message: Auto-format code with black
committer: GitHub <noreply@github.com>
author: explosion-bot <explosion-bot@users.noreply.github.com>
body: _This PR is auto-generated._
branch: autoblack
delete-branch: true
draft: false
- name: Check outputs
if: steps.git-check.outputs.modified == 'true'
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

12
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,12 @@
repos:
- repo: https://github.com/ambv/black
rev: 21.6b0
hooks:
- id: black
language_version: python3.7
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
hooks:
- id: flake8
args:
- "--config=setup.cfg"

View File

@ -2,11 +2,7 @@
# Contribute to spaCy
Thanks for your interest in contributing to spaCy 🎉 The project is maintained
by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
**[@adrianeboyd](https://github.com/adrianeboyd)**,
and we'll do our best to help you get started. This page will give you a quick
Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
overview of how things are organized and most importantly, how to get involved.
## Table of contents
@ -181,6 +177,15 @@ tools installed.
**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
### Pre-Commit Hooks
After cloning the repo and installing the packages from `requirements.txt`, enter the repo folder and run `pre-commit install`.
Each time a `git commit` is initiated, `black` and `flake8` will then run automatically, but only on the modified files.
If either tool reports an error, or if `black` reformats a file, the affected file needs to be staged with `git add` again and a
new `git commit` issued.
### Code formatting
[`black`](https://github.com/ambv/black) is an opinionated Python code

View File

@ -1,7 +1,7 @@
SHELL := /bin/bash
ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==1.0.0 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
endif
ifndef PYVER

View File

@ -61,11 +61,11 @@ open-source software, released under the MIT license.
## 💬 Where to ask questions
The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
be able to provide individual support via email. We also believe that help is
much more valuable if it's shared publicly, so that more people can benefit from
it.
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
Please understand that we won't be able to provide individual support via email.
We also believe that help is much more valuable if it's shared publicly, so that
more people can benefit from it.
| Type | Platforms |
| ------------------------------- | --------------------------------------- |

View File

@ -22,13 +22,13 @@ jobs:
# defined in .flake8 and overwrites the selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-16.04"
vmImage: "ubuntu-18.04"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==3.5.0
pip install flake8==3.9.2
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: "flake8"
@ -38,7 +38,7 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-16.04"
imageName: "ubuntu-18.04"
python.version: "3.6"
# Python36Windows:
# imageName: "vs2017-win2016"
@ -47,7 +47,7 @@ jobs:
# imageName: "macos-10.14"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-16.04"
# imageName: "ubuntu-18.04"
# python.version: "3.7"
Python37Windows:
imageName: "vs2017-win2016"
@ -56,7 +56,7 @@ jobs:
# imageName: "macos-10.14"
# python.version: "3.7"
# Python38Linux:
# imageName: "ubuntu-16.04"
# imageName: "ubuntu-18.04"
# python.version: "3.8"
# Python38Windows:
# imageName: "vs2017-win2016"
@ -65,7 +65,7 @@ jobs:
imageName: "macos-10.14"
python.version: "3.8"
Python39Linux:
imageName: "ubuntu-16.04"
imageName: "ubuntu-18.04"
python.version: "3.9"
Python39Windows:
imageName: "vs2017-win2016"

View File

@ -1,11 +1,11 @@
[build-system]
requires = [
"setuptools",
"cython>=0.25",
"cython>=0.25,<3.0",
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.3,<8.1.0",
"thinc>=8.0.8,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pathy",
"numpy>=1.15.0",

View File

@ -1,30 +1,31 @@
# Our libraries
spacy-legacy>=3.0.5,<3.1.0
spacy-legacy>=3.0.7,<3.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.3,<8.1.0
thinc>=8.0.8,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.3,<2.1.0
catalogue>=2.0.4,<2.1.0
typer>=0.3.0,<0.4.0
pathy>=0.3.5
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.1,<1.8.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
jinja2
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
# Development dependencies
cython>=0.25
pre-commit>=2.13.0
cython>=0.25,<3.0
pytest>=5.2.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0

View File

@ -22,37 +22,40 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
Source = https://github.com/explosion/spaCy
[options]
zip_safe = false
include_package_data = true
python_requires = >=3.6
setup_requires =
cython>=0.25
cython>=0.25,<3.0
numpy>=1.15.0
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.3,<8.1.0
thinc>=8.0.8,<8.1.0
install_requires =
# Our libraries
spacy-legacy>=3.0.5,<3.1.0
spacy-legacy>=3.0.7,<3.1.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.3,<8.1.0
thinc>=8.0.8,<8.1.0
blis>=0.4.0,<0.8.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.3,<2.1.0
catalogue>=2.0.4,<2.1.0
typer>=0.3.0,<0.4.0
pathy>=0.3.5
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.7.1,<1.8.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
jinja2
# Official Python utilities
setuptools
@ -61,37 +64,37 @@ install_requires =
[options.entry_points]
console_scripts =
spacy = spacy.cli:app
spacy = spacy.cli:setup_cli
[options.extras_require]
lookups =
spacy_lookups_data>=1.0.1,<1.1.0
spacy_lookups_data>=1.0.2,<1.1.0
transformers =
spacy_transformers>=1.0.1,<1.1.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<9.0.0
cupy>=5.0.0b4,<10.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<9.0.0
cupy-cuda80>=5.0.0b4,<10.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<9.0.0
cupy-cuda90>=5.0.0b4,<10.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<9.0.0
cupy-cuda91>=5.0.0b4,<10.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<9.0.0
cupy-cuda92>=5.0.0b4,<10.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<9.0.0
cupy-cuda100>=5.0.0b4,<10.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<9.0.0
cupy-cuda101>=5.0.0b4,<10.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<9.0.0
cupy-cuda102>=5.0.0b4,<10.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<9.0.0
cupy-cuda110>=5.0.0b4,<10.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<9.0.0
cupy-cuda111>=5.0.0b4,<10.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<9.0.0
cupy-cuda112>=5.0.0b4,<10.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9
@ -108,7 +111,7 @@ universal = false
formats = gztar
[flake8]
ignore = E203, E266, E501, E731, W503, E741
ignore = E203, E266, E501, E731, W503, E741, F541
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =

View File

@ -4,7 +4,8 @@ import sys
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
setup_default_warnings()
setup_default_warnings() # noqa: E402
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.1.0.dev0"
__version__ = "3.1.1"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -74,7 +74,6 @@ IDS = {
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"CLUSTER": CLUSTER,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
@ -85,9 +84,7 @@ IDS = {
"ENT_KB_ID": ENT_KB_ID,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SENT_END": SENT_END,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX

View File

@ -2,7 +2,7 @@ from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECK
import sys
import shutil
from pathlib import Path
from wasabi import msg
from wasabi import msg, Printer
import srsly
import hashlib
import typer
@ -504,12 +504,16 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
return result
def setup_gpu(use_gpu: int) -> None:
def setup_gpu(use_gpu: int, silent=None) -> None:
"""Configure the GPU and log info."""
if silent is None:
local_msg = Printer()
else:
local_msg = Printer(no_print=silent, pretty=not silent)
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
local_msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
local_msg.info("Using CPU")
if has_cupy and gpu_is_available():
msg.info("To switch to GPU 0, use the option: --gpu-id 0")
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")

View File

@ -6,7 +6,6 @@ import logging
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..training.initialize import init_nlp
from .. import util
from ..util import get_sourced_components, load_model_from_config

View File

@ -101,13 +101,14 @@ def debug_data(
# Create the gold corpus to be able to better analyze data
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
nlp.initialize(lambda: train_corpus(nlp))
msg.good("Pipeline can be initialized with data")
train_dataset = list(train_corpus(nlp))
dev_dataset = list(dev_corpus(nlp))
msg.good("Corpus is loadable")
nlp.initialize(lambda: train_dataset)
msg.good("Pipeline can be initialized with data")
# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
gold_train_unpreprocessed_data = _compile_gold(
@ -173,8 +174,9 @@ def debug_data(
)
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:0.2f}%)".format(
n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
"{} words in training data without vectors ({:.0f}%)".format(
n_missing_vectors,
100 * (n_missing_vectors / gold_train_data["n_words"]),
),
)
msg.text(
@ -282,42 +284,7 @@ def debug_data(
labels = _get_labels_from_model(nlp, "textcat")
msg.info(f"Text Classification: {len(labels)} label(s)")
msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
labels_with_counts = _format_labels(
gold_train_data["cats"].most_common(), counts=True
)
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
missing_labels = labels - set(gold_train_data["cats"].keys())
if missing_labels:
msg.warn(
"Some model labels are not present in the train data. The "
"model performance may be degraded for these labels after "
f"training: {_format_labels(missing_labels)}."
)
if gold_train_data["n_cats_multilabel"] > 0:
# Note: you should never get here because you run into E895 on
# initialization first.
msg.warn(
"The train data contains instances without "
"mutually-exclusive classes. Use the component "
"'textcat_multilabel' instead of 'textcat'."
)
if gold_dev_data["n_cats_multilabel"] > 0:
msg.fail(
"Train/dev mismatch: the dev data contains instances "
"without mutually-exclusive classes while the train data "
"contains only instances with mutually-exclusive classes."
)
if "textcat_multilabel" in factory_names:
msg.divider("Text Classification (Multilabel)")
labels = _get_labels_from_model(nlp, "textcat_multilabel")
msg.info(f"Text Classification: {len(labels)} label(s)")
msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
labels_with_counts = _format_labels(
gold_train_data["cats"].most_common(), counts=True
)
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
missing_labels = labels - set(gold_train_data["cats"].keys())
missing_labels = labels - set(gold_train_data["cats"])
if missing_labels:
msg.warn(
"Some model labels are not present in the train data. The "
@ -325,17 +292,76 @@ def debug_data(
f"training: {_format_labels(missing_labels)}."
)
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
msg.fail(
f"The train and dev labels are not the same. "
msg.warn(
"Potential train/dev mismatch: the train and dev labels are "
"not the same. "
f"Train labels: {_format_labels(gold_train_data['cats'])}. "
f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
)
if len(labels) < 2:
msg.fail(
"The model does not have enough labels. 'textcat' requires at "
"least two labels due to mutually-exclusive classes, e.g. "
"LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
"classification task."
)
if (
gold_train_data["n_cats_bad_values"] > 0
or gold_dev_data["n_cats_bad_values"] > 0
):
msg.fail(
"Unsupported values for cats: the supported values are "
"1.0/True and 0.0/False."
)
if gold_train_data["n_cats_multilabel"] > 0:
# Note: you should never get here because you run into E895 on
# initialization first.
msg.fail(
"The train data contains instances without mutually-exclusive "
"classes. Use the component 'textcat_multilabel' instead of "
"'textcat'."
)
if gold_dev_data["n_cats_multilabel"] > 0:
msg.fail(
"The dev data contains instances without mutually-exclusive "
"classes. Use the component 'textcat_multilabel' instead of "
"'textcat'."
)
if "textcat_multilabel" in factory_names:
msg.divider("Text Classification (Multilabel)")
labels = _get_labels_from_model(nlp, "textcat_multilabel")
msg.info(f"Text Classification: {len(labels)} label(s)")
msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
missing_labels = labels - set(gold_train_data["cats"])
if missing_labels:
msg.warn(
"Some model labels are not present in the train data. The "
"model performance may be degraded for these labels after "
f"training: {_format_labels(missing_labels)}."
)
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
msg.warn(
"Potential train/dev mismatch: the train and dev labels are "
"not the same. "
f"Train labels: {_format_labels(gold_train_data['cats'])}. "
f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
)
if (
gold_train_data["n_cats_bad_values"] > 0
or gold_dev_data["n_cats_bad_values"] > 0
):
msg.fail(
"Unsupported values for cats: the supported values are "
"1.0/True and 0.0/False."
)
if gold_train_data["n_cats_multilabel"] > 0:
if gold_dev_data["n_cats_multilabel"] == 0:
msg.warn(
"Potential train/dev mismatch: the train data contains "
"instances without mutually-exclusive classes while the "
"dev data does not."
"dev data contains only instances with mutually-exclusive "
"classes."
)
else:
msg.warn(
@ -556,6 +582,7 @@ def _compile_gold(
"n_nonproj": 0,
"n_cycles": 0,
"n_cats_multilabel": 0,
"n_cats_bad_values": 0,
"texts": set(),
}
for eg in examples:
@ -599,7 +626,9 @@ def _compile_gold(
data["ner"]["-"] += 1
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
data["cats"].update(gold.cats)
if list(gold.cats.values()).count(1.0) != 1:
if any(val not in (0, 1) for val in gold.cats.values()):
data["n_cats_bad_values"] += 1
if list(gold.cats.values()).count(1) != 1:
data["n_cats_multilabel"] += 1
if "tagger" in factory_names:
tags = eg.get_aligned("TAG", as_string=True)
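To make the new `n_cats_bad_values` check concrete, here is a small hedged sketch of the category format it accepts; `Example.from_dict` is the standard training API, while the text and labels are made up for illustration:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("the film was great")

# Accepted: cat values are exactly 0/1 (or True/False); for the exclusive "textcat"
# component, exactly one label should be 1.0 per example
good = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})

# Counted under n_cats_bad_values: soft scores such as 0.7 are not supported
bad = Example.from_dict(doc, {"cats": {"POSITIVE": 0.7, "NEGATIVE": 0.3}})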

View File

@ -1,10 +1,11 @@
from typing import Dict, Any, Optional, Iterable
from typing import Dict, Any, Optional
from pathlib import Path
import itertools
from spacy.training import Example
from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate, Adam
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
@ -73,23 +74,24 @@ def debug_model_cli(
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
pipe = nlp.get_pipe(component)
if not hasattr(pipe, "model"):
msg.fail(
f"The component '{component}' does not specify an object that holds a Model.",
exits=1,
)
model = pipe.model
debug_model(config, T, nlp, model, print_settings=print_settings)
debug_model(config, T, nlp, pipe, print_settings=print_settings)
def debug_model(
config,
resolved_train_config,
nlp,
model: Model,
pipe,
*,
print_settings: Optional[Dict[str, Any]] = None,
):
if not hasattr(pipe, "model"):
msg.fail(
f"The component '{pipe}' does not specify an object that holds a Model.",
exits=1,
)
model = pipe.model
if not isinstance(model, Model):
msg.fail(
f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@ -105,8 +107,6 @@ def debug_model(
_print_model(model, print_settings)
# STEP 1: Initializing the model and printing again
X = _get_docs()
# The output vector might differ from the official type of the output layer
with data_validation(False):
try:
dot_names = [resolved_train_config["train_corpus"]]
@ -114,15 +114,17 @@ def debug_model(
(train_corpus,) = resolve_dot_names(config, dot_names)
nlp.initialize(lambda: train_corpus(nlp))
msg.info("Initialized the model with the training corpus.")
examples = list(itertools.islice(train_corpus(nlp), 5))
except ValueError:
try:
_set_output_dim(nO=7, model=model)
with show_validation_error():
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
examples = [Example.from_dict(x, {}) for x in _get_docs()]
nlp.initialize(lambda: examples)
msg.info("Initialized the model with dummy data.")
except Exception:
msg.fail(
"Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
"Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
exits=1,
)
@ -131,28 +133,26 @@ def debug_model(
_print_model(model, print_settings)
# STEP 2: Updating the model and printing again
optimizer = Adam(0.001)
set_dropout_rate(model, 0.2)
# ugly hack to deal with Tok2Vec listeners
tok2vec = None
if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
tok2vec = nlp.get_pipe("tok2vec")
goldY = None
# ugly hack to deal with Tok2Vec/Transformer listeners
upstream_component = None
if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
upstream_component = nlp.get_pipe("tok2vec")
if (
model.has_ref("tok2vec")
and "transformer-listener" in model.get_ref("tok2vec").name
):
upstream_component = nlp.get_pipe("transformer")
for e in range(3):
if tok2vec:
tok2vec.update([Example.from_dict(x, {}) for x in X])
Y, get_dX = model.begin_update(X)
if goldY is None:
goldY = _simulate_gold(Y)
dY = get_gradient(goldY, Y, model.ops)
get_dX(dY)
model.finish_update(optimizer)
if upstream_component:
upstream_component.update(examples)
pipe.update(examples)
if print_settings.get("print_after_training"):
msg.divider(f"STEP 2 - after training")
_print_model(model, print_settings)
# STEP 3: the final prediction
prediction = model.predict(X)
prediction = model.predict([ex.predicted for ex in examples])
if print_settings.get("print_prediction"):
msg.divider(f"STEP 3 - prediction")
msg.info(str(prediction))
@ -160,19 +160,6 @@ def debug_model(
msg.good(f"Succesfully ended analysis - model looks good.")
def get_gradient(goldY, Y, ops):
return ops.asarray(Y) - ops.asarray(goldY)
def _simulate_gold(element, counter=1):
if isinstance(element, Iterable):
for i in range(len(element)):
element[i] = _simulate_gold(element[i], counter + i)
return element
else:
return 1 / counter
def _sentences():
return [
"Apple is looking at buying U.K. startup for $1 billion",
@ -209,11 +196,7 @@ def _print_model(model, print_settings):
if dimensions:
for name in node.dim_names:
if node.has_dim(name):
msg.info(f" - dim {name}: {node.get_dim(name)}")
else:
msg.info(f" - dim {name}: {node.has_dim(name)}")
msg.info(f" - dim {name}: {node.maybe_get_dim(name)}")
if parameters:
for name in node.param_names:
if node.has_param(name):

View File

@ -6,7 +6,7 @@ import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_base_version, run_command
from ..util import is_package, get_minor_version, run_command
from ..errors import OLD_MODEL_SHORTCUTS
@ -74,7 +74,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
version = get_minor_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(

View File

@ -1,4 +1,4 @@
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Any, Union
from wasabi import Printer
from pathlib import Path
import re
@ -60,10 +60,11 @@ def evaluate(
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
silent: bool = True,
) -> Scorer:
spans_key: str = "sc",
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
setup_gpu(use_gpu)
setup_gpu(use_gpu, silent=silent)
data_path = util.ensure_path(data_path)
output_path = util.ensure_path(output)
displacy_path = util.ensure_path(displacy_path)
@ -90,6 +91,9 @@ def evaluate(
"SENT P": "sents_p",
"SENT R": "sents_r",
"SENT F": "sents_f",
"SPAN P": f"spans_{spans_key}_p",
"SPAN R": f"spans_{spans_key}_r",
"SPAN F": f"spans_{spans_key}_f",
"SPEED": "speed",
}
results = {}
@ -108,27 +112,7 @@ def evaluate(
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
msg.table(results, title="Results")
if "morph_per_feat" in scores:
if scores["morph_per_feat"]:
print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
data["morph_per_feat"] = scores["morph_per_feat"]
if "dep_las_per_type" in scores:
if scores["dep_las_per_type"]:
print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
data["dep_las_per_type"] = scores["dep_las_per_type"]
if "ents_per_type" in scores:
if scores["ents_per_type"]:
print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
data["ents_per_type"] = scores["ents_per_type"]
if "cats_f_per_type" in scores:
if scores["cats_f_per_type"]:
print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
data["cats_f_per_type"] = scores["cats_f_per_type"]
if "cats_auc_per_type" in scores:
if scores["cats_auc_per_type"]:
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
data["cats_auc_per_type"] = scores["cats_auc_per_type"]
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@ -151,6 +135,43 @@ def evaluate(
return data
def handle_scores_per_type(
scores: Union[Scorer, Dict[str, Any]],
data: Dict[str, Any] = {},
*,
spans_key: str = "sc",
silent: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
if "morph_per_feat" in scores:
if scores["morph_per_feat"]:
print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
data["morph_per_feat"] = scores["morph_per_feat"]
if "dep_las_per_type" in scores:
if scores["dep_las_per_type"]:
print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
data["dep_las_per_type"] = scores["dep_las_per_type"]
if "ents_per_type" in scores:
if scores["ents_per_type"]:
print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
data["ents_per_type"] = scores["ents_per_type"]
if f"spans_{spans_key}_per_type" in scores:
if scores[f"spans_{spans_key}_per_type"]:
print_prf_per_type(
msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type"
)
data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"]
if "cats_f_per_type" in scores:
if scores["cats_f_per_type"]:
print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
data["cats_f_per_type"] = scores["cats_f_per_type"]
if "cats_auc_per_type" in scores:
if scores["cats_auc_per_type"]:
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
data["cats_auc_per_type"] = scores["cats_auc_per_type"]
return scores
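A rough usage sketch of the new helper, with made-up per-type scores in the `p`/`r`/`f` shape that spaCy's Scorer produces; the import path assumes the function stays in `spacy.cli.evaluate`:

from spacy.cli.evaluate import handle_scores_per_type

scores = {
    "ents_per_type": {"PERSON": {"p": 0.91, "r": 0.88, "f": 0.89}},
    "spans_sc_per_type": {"NP": {"p": 0.72, "r": 0.64, "f": 0.68}},
}
data = {}
handle_scores_per_type(scores, data, spans_key="sc", silent=False)
# prints a per-type table for NER and for the "sc" span group, and copies both
# entries into `data`, which evaluate() ultimately returns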
def render_parses(
docs: List[Doc],
output_path: Path,

View File

@ -108,6 +108,10 @@ def init_labels_cli(
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
_init_labels(nlp, output_path)
def _init_labels(nlp, output_path):
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
output_file = output_path / f"{name}.json"

View File

@ -1,7 +1,7 @@
from typing import Optional, Union, Any, Dict, List, Tuple
import shutil
from pathlib import Path
from wasabi import Printer, get_raw_input
from wasabi import Printer, MarkdownRenderer, get_raw_input
import srsly
import sys
@ -18,7 +18,7 @@ def package_cli(
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
@ -133,7 +133,15 @@ def package(
for file_name in FILENAMES_DOCS:
file_path = package_path / model_name_v / file_name
if file_path.exists():
shutil.move(str(file_path), str(main_path))
shutil.copy(str(file_path), str(main_path))
readme_path = main_path / "README.md"
if not readme_path.exists():
readme = generate_readme(meta)
create_file(readme_path, readme)
create_file(package_path / model_name_v / "README.md", readme)
msg.good("Generated README.md from meta.json")
else:
msg.info("Using existing README.md from pipeline directory")
imports = []
for code_path in code_paths:
imports.append(code_path.stem)
@ -197,8 +205,9 @@ def get_meta(
"url": "",
"license": "MIT",
}
meta.update(existing_meta)
nlp = util.load_model_from_path(Path(model_path))
meta.update(nlp.meta)
meta.update(existing_meta)
meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
@ -234,6 +243,113 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
return meta
def generate_readme(meta: Dict[str, Any]) -> str:
"""
Generate a Markdown-formatted README text from a model meta.json. Used
within the GitHub release notes and as content for README.md file added
to model packages.
"""
md = MarkdownRenderer()
lang = meta["lang"]
name = f"{lang}_{meta['name']}"
version = meta["version"]
pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])])
components = ", ".join([md.code(p) for p in meta.get("components", [])])
vecs = meta.get("vectors", {})
vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)"
author = meta.get("author") or "n/a"
notes = meta.get("notes", "")
license_name = meta.get("license")
sources = _format_sources(meta.get("sources"))
description = meta.get("description")
label_scheme = _format_label_scheme(meta.get("labels"))
accuracy = _format_accuracy(meta.get("performance"))
table_data = [
(md.bold("Name"), md.code(name)),
(md.bold("Version"), md.code(version)),
(md.bold("spaCy"), md.code(meta["spacy_version"])),
(md.bold("Default Pipeline"), pipeline),
(md.bold("Components"), components),
(md.bold("Vectors"), vectors),
(md.bold("Sources"), sources or "n/a"),
(md.bold("License"), md.code(license_name) if license_name else "n/a"),
(md.bold("Author"), md.link(author, meta["url"]) if "url" in meta else author),
]
# Put together Markdown body
if description:
md.add(description)
md.add(md.table(table_data, ["Feature", "Description"]))
if label_scheme:
md.add(md.title(3, "Label Scheme"))
md.add(label_scheme)
if accuracy:
md.add(md.title(3, "Accuracy"))
md.add(accuracy)
if notes:
md.add(notes)
return md.text
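To show what the generator expects, here is a minimal hedged sketch with a hand-written meta dict; all field values are invented and only cover the keys `generate_readme` reads directly:

from spacy.cli.package import generate_readme

meta = {
    "lang": "en",
    "name": "example_pipeline",          # hypothetical package name
    "version": "0.0.1",
    "spacy_version": ">=3.1.0,<3.2.0",
    "pipeline": ["tok2vec", "ner"],
    "components": ["tok2vec", "ner"],
    "description": "Toy pipeline used to illustrate README generation.",
}
readme = generate_readme(meta)           # Markdown string with the feature table
print(readme.splitlines()[0])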
def _format_sources(data: Any) -> str:
if not data or not isinstance(data, list):
return "n/a"
sources = []
for source in data:
if not isinstance(source, dict):
source = {"name": source}
name = source.get("name")
if not name:
continue
url = source.get("url")
author = source.get("author")
result = name if not url else "[{}]({})".format(name, url)
if author:
result += " ({})".format(author)
sources.append(result)
return "<br />".join(sources)
def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str:
if not data:
return ""
md = MarkdownRenderer()
scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))]
scores = [
(md.code(acc.upper()), f"{score*100:.2f}")
for acc, score in scalars
if acc not in exclude
]
md.add(md.table(scores, ["Type", "Score"]))
return md.text
def _format_label_scheme(data: Dict[str, Any]) -> str:
if not data:
return ""
md = MarkdownRenderer()
n_labels = 0
n_pipes = 0
label_data = []
for pipe, labels in data.items():
if not labels:
continue
col1 = md.bold(md.code(pipe))
col2 = ", ".join(
[md.code(label.replace("|", "\\|")) for label in labels]
) # noqa: W605
label_data.append((col1, col2))
n_labels += len(labels)
n_pipes += 1
if not label_data:
return ""
label_info = f"View label scheme ({n_labels} labels for {n_pipes} components)"
md.add("<details>")
md.add(f"<summary>{label_info}</summary>")
md.add(md.table(label_data, ["Component", "Labels"]))
md.add("</details>")
return md.text
TEMPLATE_SETUP = """
#!/usr/bin/env python
import io
@ -248,6 +364,13 @@ def load_meta(fp):
return json.load(f)
def load_readme(fp):
if path.exists(fp):
with io.open(fp, encoding='utf8') as f:
return f.read()
return ""
def list_files(data_dir):
output = []
for root, _, filenames in walk(data_dir):
@ -273,6 +396,8 @@ def setup_package():
root = path.abspath(path.dirname(__file__))
meta_path = path.join(root, 'meta.json')
meta = load_meta(meta_path)
readme_path = path.join(root, 'README.md')
readme = load_readme(readme_path)
model_name = str(meta['lang'] + '_' + meta['name'])
model_dir = path.join(model_name, model_name + '-' + meta['version'])
@ -282,6 +407,7 @@ def setup_package():
setup(
name=model_name,
description=meta.get('description'),
long_description=readme,
author=meta.get('author'),
author_email=meta.get('email'),
url=meta.get('url'),
@ -303,6 +429,8 @@ if __name__ == '__main__':
TEMPLATE_MANIFEST = """
include meta.json
include LICENSE
include LICENSES_SOURCES
include README.md
""".strip()

View File

@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
"then the new directory will be created for you.",
)
if resume_path is not None:
if resume_path.is_dir():
# This is necessary because Windows gives a Permission Denied when we
# try to open the directory later, which is confusing. See #7878
msg.fail(
"--resume-path should be a weights file, but {resume_path} is a directory.",
exits=True,
)
model_name = re.search(r"model\d+\.bin", str(resume_path))
if not model_name and not epoch_resume:
msg.fail(

View File

@ -212,6 +212,9 @@ def check_rerun(
strict_version (bool):
RETURNS (bool): Whether to re-run the command.
"""
# Always rerun if no-skip is set
if command.get("no_skip", False):
return True
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True

View File

@ -151,14 +151,14 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
@ -182,14 +182,14 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
@ -316,14 +316,14 @@ nO = null
width = ${components.tok2vec.model.encode.width}
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
@ -344,14 +344,14 @@ nO = null
width = ${components.tok2vec.model.encode.width}
[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
@ -418,7 +418,7 @@ compound = 1.001
[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}

View File

@ -28,7 +28,7 @@ def train_cli(
"""
Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
convert data from other formats, use the `spacy convert` command. The
config file includes all settings and hyperparameters used during traing.
config file includes all settings and hyperparameters used during training.
To override settings in the config, e.g. settings that point to local
paths or that you want to experiment with, you can override them as
command line options. For instance, --training.batch_size 128 overrides

View File

@ -3,10 +3,11 @@ from pathlib import Path
import sys
import requests
from wasabi import msg, Printer
import warnings
from ._util import app
from .. import about
from ..util import get_package_version, get_installed_models, get_base_version
from ..util import get_package_version, get_installed_models, get_minor_version
from ..util import get_package_path, get_model_meta, is_compatible_version
@ -24,7 +25,7 @@ def validate_cli():
def validate() -> None:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_base_version(about.__version__)
spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})
if not current_compat:
msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
@ -44,8 +45,8 @@ def validate() -> None:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
version = msg.text(data["version"], color="yellow", no_print=True)
comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header)
else:
@ -78,7 +79,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
all_models = set()
installed_models = get_installed_models()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W09[45]")
installed_models = get_installed_models()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
@ -92,7 +95,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
spacy_version = about.__version__
else:
model_path = get_package_path(package)
model_meta = get_model_meta(model_path)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W09[45]")
model_meta = get_model_meta(model_path)
spacy_version = model_meta.get("spacy_version", "n/a")
is_compat = is_compatible_version(about.__version__, spacy_version)
pkgs[pkg_name] = {
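Both call sites wrap model inspection in warnings.catch_warnings so that the W094/W095 compatibility warnings stay quiet while validate is itself the command reporting compatibility. The filter is plain standard-library behaviour: the message argument is a regex matched against the start of the warning text. A self-contained sketch with a made-up warning message:

import warnings

def noisy():
    warnings.warn("[W095] model may not be 100% compatible")  # hypothetical message

with warnings.catch_warnings():
    # Ignore only warnings whose message starts with [W094] or [W095].
    warnings.filterwarnings("ignore", message="\\[W09[45]")
    noisy()  # silenced

noisy()  # outside the context manager the warning is shown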

View File

@ -120,7 +120,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
doc (Doc): Document to parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"]))
doc = Doc(orig_doc.vocab).from_bytes(
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
)
if not doc.has_annotation("DEP"):
warnings.warn(Warnings.W005)
if options.get("collapse_phrases", False):
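parse_deps now renders from a byte-level copy of the document, so displaCy's phrase and punctuation collapsing can no longer mutate the caller's Doc; user_data and user_hooks are excluded because they may not be serializable. A small sketch of the same copy idiom, assuming only that spaCy is installed (the text is arbitrary):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Round-trip through bytes to get an independent copy of the Doc,
# leaving out user_data and user_hooks just as parse_deps does.
doc_copy = Doc(doc.vocab).from_bytes(
    doc.to_bytes(exclude=["user_data", "user_hooks"])
)
print(len(doc_copy) == len(doc))  # True; retokenizing doc_copy leaves doc untouched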

View File

@ -150,12 +150,12 @@ class Warnings:
"released, because the model may say it's compatible when it's "
'not. Consider changing the "spacy_version" in your meta.json to a '
"version range, with a lower and upper pin. For example: {example}")
W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
"incompatible with the current version ({current}). This may lead "
"to unexpected results or runtime errors. To resolve this, "
"download a newer compatible model or retrain your custom model "
"with the current spaCy version. For more details and available "
"updates, run: python -m spacy validate")
W095 = ("Model '{model}' ({model_version}) was trained with spaCy "
"{version} and may not be 100% compatible with the current version "
"({current}). If you see errors or degraded performance, download "
"a newer compatible model or retrain your custom model with the "
"current spaCy version. For more details and available updates, "
"run: python -m spacy validate")
W096 = ("The method `nlp.disable_pipes` is now deprecated - use "
"`nlp.select_pipes` instead.")
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
@ -406,21 +406,10 @@ class Errors:
E125 = ("Unexpected value: {value}")
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
"This is likely a bug in spaCy, so feel free to open an issue.")
E129 = ("Cannot write the label of an existing Span object because a Span "
"is a read-only view of the underlying Token objects stored in the "
"Doc. Instead, create a new Span object and specify the `label` "
"keyword argument, for example:\nfrom spacy.tokens import Span\n"
"span = Span(doc, start={start}, end={end}, label='{label}')")
E130 = ("You are running a narrow unicode build, which is incompatible "
"with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
"unicode build instead. You can also rebuild Python and set the "
"`--enable-unicode=ucs4 flag`.")
E131 = ("Cannot write the kb_id of an existing Span object because a Span "
"is a read-only view of the underlying Token objects stored in "
"the Doc. Instead, create a new Span object and specify the "
"`kb_id` keyword argument, for example:\nfrom spacy.tokens "
"import Span\nspan = Span(doc, start={start}, end={end}, "
"label='{label}', kb_id='{kb_id}')")
E132 = ("The vectors for entities and probabilities for alias '{alias}' "
"should have equal length, but found {entities_length} and "
"{probabilities_length} respectively.")
@ -532,6 +521,24 @@ class Errors:
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
"labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
"can use the 'textcat_multilabel' component with one label.")
E868 = ("Found a conflicting gold annotation in a reference document, "
"with the following char-based span occurring both in the gold ents "
"as well as in the negative spans: {span}.")
E869 = ("The notation '{label}' is not supported anymore. To annotate "
"negative NER samples, use `doc.spans[key]` instead, and "
"specify the key as 'incorrect_spans_key' when constructing "
"the NER component.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "
"*.spacy files if you provide a directory instead of a filename as "
"the 'path'.")
E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
E872 = ("Unable to copy tokenizer from base model due to different "
'tokenizer settings: current tokenizer config "{curr_config}" '
'vs. base model "{base_config}"')
@ -851,6 +858,15 @@ class Errors:
"DependencyMatcher token patterns. The token pattern in "
"RIGHT_ATTR should return matches that are each exactly one token "
"long. Invalid pattern:\n{node}")
E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
"parses. If no dependency labels are available, provide "
"placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
E1018 = ("Knowledge base for component '{name}' is not set. "
"Make sure either `nel.initialize` or `nel.set_kb` "
"is called with a `kb_loader` function.")
E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
"statistical model to be installed and loaded. For more info, see "
"the documentation:\nhttps://spacy.io/usage/models")
# Deprecated model shortcuts, only used in errors and warnings
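E1017 spells out that a Doc constructed with heads also needs deps, and that placeholder labels are acceptable when only the tree structure is known. A short sketch of the suggested pattern, assuming spaCy v3.x (the words and head indices are invented):

from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["I", "like", "trees"]
heads = [1, 1, 1]  # absolute index of each token's head; the root points to itself
# Passing heads without deps would raise E1017, so supply placeholder labels.
doc = Doc(Vocab(), words=words, heads=heads, deps=["dep"] * len(heads))
print([(t.text, t.head.text, t.dep_) for t in doc])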

View File

@ -58,7 +58,7 @@ GLOSSARY = {
"FW": "foreign word",
"HYPH": "punctuation mark, hyphen",
"IN": "conjunction, subordinating or preposition",
"JJ": "adjective",
"JJ": "adjective (English), other noun-modifier (Chinese)",
"JJR": "adjective, comparative",
"JJS": "adjective, superlative",
"LS": "list item marker",
@ -88,7 +88,7 @@ GLOSSARY = {
"WP": "wh-pronoun, personal",
"WP$": "wh-pronoun, possessive",
"WRB": "wh-adverb",
"SP": "space",
"SP": "space (English), sentence-final particle (Chinese)",
"ADD": "email",
"NFP": "superfluous punctuation",
"GW": "additional word in multi-word expression",
@ -152,6 +152,40 @@ GLOSSARY = {
"VVIZU": 'infinitive with "zu", full',
"VVPP": "perfect participle, full",
"XY": "non-word containing non-letter",
# POS Tags (Chinese)
# OntoNotes / Chinese Penn Treebank
# https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
"AD": "adverb",
"AS": "aspect marker",
"BA": "把 in ba-construction",
# "CD": "cardinal number",
"CS": "subordinating conjunction",
"DEC": "的 in a relative clause",
"DEG": "associative 的",
"DER": "得 in V-de const. and V-de-R",
"DEV": "地 before VP",
"ETC": "for words 等, 等等",
# "FW": "foreign words"
"IJ": "interjection",
# "JJ": "other noun-modifier",
"LB": "被 in long bei-const",
"LC": "localizer",
"M": "measure word",
"MSP": "other particle",
# "NN": "common noun",
"NR": "proper noun",
"NT": "temporal noun",
"OD": "ordinal number",
"ON": "onomatopoeia",
"P": "preposition excluding 把 and 被",
"PN": "pronoun",
"PU": "punctuation",
"SB": "被 in short bei-const",
# "SP": "sentence-final particle",
"VA": "predicative adjective",
"VC": "是 (copula)",
"VE": "有 as the main verb",
"VV": "other verb",
# Noun chunks
"NP": "noun phrase",
"PP": "prepositional phrase",

View File

@ -93,6 +93,15 @@ cdef class KnowledgeBase:
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1)
self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
@ -144,8 +153,7 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E140)
nr_entities = len(set(entity_list))
self._entry_index = PreshMap(nr_entities+1)
self._entries = entry_vec(nr_entities+1)
self.initialize_entities(nr_entities)
i = 0
cdef KBEntryC entry
@ -325,6 +333,102 @@ cdef class KnowledgeBase:
return 0.0
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
"""
def serialize_header():
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
return srsly.json_dumps(header)
def serialize_entries():
i = 1
tuples = []
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
tuples.append((entry.entity_hash, entry.freq, entry.vector_index))
i = i + 1
return srsly.json_dumps(tuples)
def serialize_aliases():
i = 1
headers = []
indices_lists = []
probs_lists = []
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
alias = self._aliases_table[alias_index]
assert alias_index == i
candidate_length = len(alias.entry_indices)
headers.append((alias_hash, candidate_length))
indices_lists.append(alias.entry_indices)
probs_lists.append(alias.probs)
i = i + 1
headers_dump = srsly.json_dumps(headers)
indices_dump = srsly.json_dumps(indices_lists)
probs_dump = srsly.json_dumps(probs_lists)
return srsly.json_dumps((headers_dump, indices_dump, probs_dump))
serializers = {
"header": serialize_header,
"entity_vectors": lambda: srsly.json_dumps(self._vectors_table),
"entries": serialize_entries,
"aliases": serialize_aliases,
}
return util.to_bytes(serializers, [])
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
"""
def deserialize_header(b):
header = srsly.json_loads(b)
nr_entities = header[0]
nr_aliases = header[1]
entity_vector_length = header[2]
self.initialize_entities(nr_entities)
self.initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length
def deserialize_vectors(b):
self._vectors_table = srsly.json_loads(b)
def deserialize_entries(b):
cdef KBEntryC entry
tuples = srsly.json_loads(b)
i = 1
for (entity_hash, freq, vector_index) in tuples:
entry.entity_hash = entity_hash
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i] = entry
self._entry_index[entity_hash] = i
i += 1
def deserialize_aliases(b):
cdef AliasC alias
i = 1
all_data = srsly.json_loads(b)
headers = srsly.json_loads(all_data[0])
indices = srsly.json_loads(all_data[1])
probs = srsly.json_loads(all_data[2])
for header, indices, probs in zip(headers, indices, probs):
alias_hash, candidate_length = header
alias.entry_indices = indices
alias.probs = probs
self._aliases_table[i] = alias
self._alias_index[alias_hash] = i
i += 1
setters = {
"header": deserialize_header,
"entity_vectors": deserialize_vectors,
"entries": deserialize_entries,
"aliases": deserialize_aliases,
}
util.from_bytes(bytes_data, setters, exclude)
return self
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
path = ensure_path(path)
if not path.exists():
@ -404,10 +508,8 @@ cdef class KnowledgeBase:
cdef int64_t entity_vector_length
reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities)
self.entity_vector_length = entity_vector_length
self._entry_index = PreshMap(nr_entities+1)
self._entries = entry_vec(nr_entities+1)
self._vectors_table = float_matrix(nr_entities+1)
# STEP 1: load entity vectors
cdef int i = 0
@ -445,8 +547,7 @@ cdef class KnowledgeBase:
# STEP 3: load aliases
cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases)
self._alias_index = PreshMap(nr_aliases+1)
self._aliases_table = alias_vec(nr_aliases+1)
self.initialize_aliases(nr_aliases)
cdef int64_t nr_candidates
cdef vector[int64_t] entry_indices
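With initialize_entities and initialize_aliases factored out, the new to_bytes/from_bytes methods let a knowledge base round-trip entirely in memory. A hedged sketch of that round trip, assuming the v3.1-style KnowledgeBase API (the entity ID, frequency and vector values are invented):

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab()
kb = KnowledgeBase(vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])

data = kb.to_bytes()  # header, entity vectors, entries and aliases as one byte string

kb2 = KnowledgeBase(vocab, entity_vector_length=3)
kb2.from_bytes(data)
print(kb2.get_size_entities(), kb2.get_size_aliases())  # 1 1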

16
spacy/lang/az/__init__.py Normal file
View File

@ -0,0 +1,16 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
class AzerbaijaniDefaults(Language.Defaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Azerbaijani(Language):
lang = "az"
Defaults = AzerbaijaniDefaults
__all__ = ["Azerbaijani"]

18
spacy/lang/az/examples.py Normal file
View File

@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.az.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Bu bir cümlədir.",
"Necəsən?",
"Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.",
"Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.",
"Atılan növbəti mərmilər lap yaxınlıqda partladı.",
"Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.",
"Marsda ilk sınaq uçuşu həyata keçirilib.",
"SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.",
"Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.",
]

View File

@ -0,0 +1,89 @@
from ...attrs import LIKE_NUM
# Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [
"bir",
"iki",
"üç",
"dörd",
"beş",
"altı",
"yeddi",
"səkkiz",
"doqquz",
"on",
"iyirmi",
"otuz",
"qırx",
"əlli",
"altmış",
"yetmiş",
"səksən",
"doxsan",
"yüz",
"min",
"milyon",
"milyard",
"trilyon",
"kvadrilyon",
"kentilyon",
]
_ordinal_words = [
"birinci",
"ikinci",
"üçüncü",
"dördüncü",
"beşinci",
"altıncı",
"yedinci",
"səkkizinci",
"doqquzuncu",
"onuncu",
"iyirminci",
"otuzuncu",
"qırxıncı",
"əllinci",
"altmışıncı",
"yetmişinci",
"səksəninci",
"doxsanıncı",
"yüzüncü",
"mininci",
"milyonuncu",
"milyardıncı",
"trilyonuncu",
"kvadrilyonuncu",
"kentilyonuncu",
]
_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
# Check cardinal number
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith(_ordinal_endings):
if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}
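Once the language is registered, the LIKE_NUM attribute defined here is available on a blank Azerbaijani pipeline: cardinals, the listed ordinal words and digit strings should flag as number-like. A hedged check, assuming this spaCy build registers "az" (the sample words come from the lists above):

import spacy

nlp = spacy.blank("az")
doc = nlp("iki yeddi onuncu qapı")
print([(t.text, t.like_num) for t in doc])
# expected: True for the three numerals, False for "qapı"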

145
spacy/lang/az/stop_words.py Normal file
View File

@ -0,0 +1,145 @@
# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py
STOP_WORDS = set(
"""
amma
arasında
artıq
ay
az
bax
belə
beş
bilər
bir
biraz
biri
birşey
biz
bizim
bizlər
bu
buna
bundan
bunların
bunu
bunun
buradan
bütün
bəli
bəlkə
bəy
bəzi
bəzən
daha
dedi
deyil
dir
düz
dək
dən
dəqiqə
edir
edən
elə
et
etdi
etmə
etmək
faiz
gilə
görə
ha
haqqında
harada
heç
həm
həmin
həmişə
hər
idi
il
ildə
ilk
ilə
in
indi
istifadə
isə
ki
kim
kimi
kimə
lakin
lap
mirşey
məhz
mən
mənə
niyə
nəhayət
o
obirisi
of
olan
olar
olaraq
oldu
olduğu
olmadı
olmaz
olmuşdur
olsun
olur
on
ona
ondan
onlar
onlardan
onların
onsuzda
onu
onun
oradan
qarşı
qədər
saat
sadəcə
saniyə
siz
sizin
sizlər
sonra
səhv
sən
sənin
sənə
təəssüf
var
xan
xanım
xeyr
ya
yalnız
yaxşı
yeddi
yenə
yox
yoxdur
yoxsa
yəni
zaman
çox
çünki
öz
özü
üçün
əgər
əlbəttə
ən
əslində
""".split()
)

View File

@ -22,13 +22,13 @@ _num_words = [
"тринадесет",
"тринайсет",
"четиринадесет",
"четиринайсет"
"четиринайсет",
"петнадесет",
"петнайсет"
"петнайсет",
"шестнадесет",
"шестнайсет",
"седемнадесет",
"седемнайсет"
"седемнайсет",
"осемнадесет",
"осемнайсет",
"деветнадесет",
@ -36,7 +36,7 @@ _num_words = [
"двадесет",
"двайсет",
"тридесет",
"трийсет"
"трийсет",
"четиридесет",
"четиресет",
"петдесет",

View File

@ -58,7 +58,6 @@ _abbr_dot_exc = [
{ORTH: "стр.", NORM: "страница"},
{ORTH: "ул.", NORM: "улица"},
{ORTH: "чл.", NORM: "член"},
]
for abbr in _abbr_dot_exc:

View File

@ -260,7 +260,10 @@ _units = (
"кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
"كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
)
_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
_currency = (
r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ "
r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿"
)
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) unless there are cross-language

View File

@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
forms.extend(oov_forms)
if not forms:
forms.append(string)
forms = list(set(forms))
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
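Switching from set() to dict.fromkeys keeps the lemmatizer deterministic: both deduplicate, but dict.fromkeys preserves insertion order (guaranteed since Python 3.7), so the first candidate form stays first. For instance:

forms = ["γάτα", "γάτες", "γάτα", "γατών"]
print(list(dict.fromkeys(forms)))  # ['γάτα', 'γάτες', 'γατών'], stable order
print(list(set(forms)))            # same items, arbitrary order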

View File

@ -35,7 +35,7 @@ def like_num(text: str) -> bool:
# Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith("th"):
if text_lower.endswith(("st", "nd", "rd", "th")):
if text_lower[:-2].isdigit():
return True
return False
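Previously only the "-th" suffix was recognised, so "1st", "2nd" and "3rd" were not treated as number-like. A quick check of the widened behaviour on a blank English pipeline (expected output shown as a comment):

import spacy

nlp = spacy.blank("en")
doc = nlp("1st 22nd 3rd 4th 5t")
print([(t.text, t.like_num) for t in doc])
# [('1st', True), ('22nd', True), ('3rd', True), ('4th', True), ('5t', False)]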

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc
@ -79,5 +79,34 @@ for exc_data in [
]:
_exc[exc_data[ORTH]] = [exc_data]
# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
conj_contraction_bases = [
("ett", "että"),
("jott", "jotta"),
("kosk", "koska"),
("mutt", "mutta"),
("vaikk", "vaikka"),
("ehk", "ehkä"),
("miks", "miksi"),
("siks", "siksi"),
("joll", "jos"),
("ell", "jos"),
]
conj_contraction_negations = [
("en", "en"),
("et", "et"),
("ei", "ei"),
("emme", "emme"),
("ette", "ette"),
("eivat", "eivät"),
("eivät", "eivät"),
]
for (base_lower, base_norm) in conj_contraction_bases:
for base in [base_lower, base_lower.title()]:
for (suffix, suffix_norm) in conj_contraction_negations:
_exc[base + suffix] = [
{ORTH: base, NORM: base_norm},
{ORTH: suffix, NORM: suffix_norm},
]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
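The generated exceptions split negated conjunction contractions into the conjunction stem plus the negation verb, with NORM holding the uncontracted forms. A hedged check, assuming a spaCy build that contains these exceptions:

import spacy

nlp = spacy.blank("fi")
for word in ["ettei", "mutten", "Vaikkemme"]:
    print([(t.text, t.norm_) for t in nlp(word)])
# e.g. "ettei" -> [('ett', 'että'), ('ei', 'ei')]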

View File

@ -1,30 +1,31 @@
STOP_WORDS = set(
"""
a à â abord afin ah ai aie ainsi ait allaient allons
alors anterieur anterieure anterieures apres après as assez attendu au
aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
alors anterieur anterieure anterieures antérieur antérieure antérieures
apres après as assez attendu au
aupres auquel aura auraient aurait auront
aussi autre autrement autres autrui aux auxquelles auxquels avaient
avais avait avant avec avoir avons ayant
bas basee bat
c' c ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui
celui-ci celui- cent cependant certain certaine certaines certains certes ces
c' c ça car ce ceci cela celle celle-ci celle-la celle-là celles celles-ci celles-la celles-là
celui celui-ci celui-la celui- cent cependant certain certaine certaines certains certes ces
cet cette ceux ceux-ci ceux- chacun chacune chaque chez ci cinq cinquantaine cinquante
cinquantième cinquième combien comme comment compris concernant
d' d da dans de debout dedans dehors deja delà depuis derriere
d' d da dans de debout dedans dehors deja dejà delà depuis derriere
derrière des desormais desquelles desquels dessous dessus deux deuxième
deuxièmement devant devers devra different differentes differents différent
deuxièmement devant devers devra different differente differentes differents différent
différente différentes différents dire directe directement dit dite dits divers
diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
douze douzième du duquel durant dès sormais
douze douzième du duquel durant dès ja déjà sormais
effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
effet egalement eh elle elle-meme elle-même elles elles-memes elles-mêmes en encore
enfin entre envers environ es ès est et etaient étaient etais étais etait était
etant étant etc été etre être eu eux eux-mêmes exactement excepté
etant étant etc etre être eu eux eux-mêmes exactement excepté également
fais faisaient faisant fait façon feront font
fais faisaient faisant fait facon façon feront font
gens
@ -36,45 +37,48 @@ j' j je jusqu jusque juste
l' l la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps
lors lorsque lui lui-meme lui-même lès
m' m ma maint maintenant mais malgre me meme memes merci mes mien
m' m ma maint maintenant mais malgre malgré me meme memes merci mes mien
mienne miennes miens mille moi moi-meme moi-même moindres moins
mon même mêmes
n' n na ne neanmoins neuvième ni nombreuses nombreux nos notamment
notre nous nous-mêmes nouvea nul néanmoins nôtre nôtres
notre nous nous-mêmes nouveau nul néanmoins nôtre nôtres
o ô on ont onze onzième ore ou ouias oust outre
o ô on ont onze onzième or ou ouias ouste outre
ouvert ouverte ouverts
par parce parfois parle parlent parler parmi parseme partant
par parce parfois parle parlent parler parmi partant
pas pendant pense permet personne peu peut peuvent peux plus
plusieurs plutôt possible possibles pour pourquoi
pourrais pourrait pouvait prealable precisement premier première premièrement
pres procedant proche près pu puis puisque
plusieurs plutot plutôt possible possibles pour pourquoi
pourrais pourrait pouvait prealable precisement
premier première premièrement
pres procedant proche près préalable précisement pu puis puisque
qu' qu quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
qu' qu quand quant quant-à-soi quarante quatorze quatre quatre-vingt
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
quelques quels qui quiconque quinze quoi quoique
relative relativement rend rendre restant reste
restent retour revoici revoilà
restent retour revoici revoila revoilà
s' s sa sait sans sauf se seize selon semblable semblaient
semble semblent sent sept septième sera seraient serait seront ses seul seule
seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
soixante son sont sous souvent specifique specifiques stop
seulement seuls seules si sien sienne siennes siens sinon six sixième soi soi-meme soi-même soit
soixante son sont sous souvent specifique specifiques spécifique spécifiques stop
suffisant suffisante suffit suis suit suivant suivante
suivantes suivants suivre sur surtout
t' t ta tant te tel telle tellement telles tels tenant tend tenir tente
tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous
tout toute toutes treize trente tres trois troisième troisièmement
tes tien tienne tiennes tiens toi toi-meme toi-même ton touchant toujours tous
tout toute toutes treize trente tres trois troisième troisièmement très
tu
un une unes uns
va vais vas vers via vingt voici voilà vont vos
votre vous vous-mêmes vu vôtre vôtres
va vais vas vers via vingt voici voila voilà vont vos
votre votres vous vous-mêmes vu vôtre vôtres
y
""".split()
)

View File

@ -0,0 +1,18 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
class AncientGreekDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class AncientGreek(Language):
lang = "grc"
Defaults = AncientGreekDefaults
__all__ = ["AncientGreek"]

View File

@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.grc.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·",
"εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.",
"ὃ μὲν δὴ ἀπόστολος ἐς τὴν Μίλητον ἦν.",
"Θρασύβουλος δὲ σαφέως προπεπυσμένος πάντα λόγον καὶ εἰδὼς τὰ Ἀλυάττης μέλλοι ποιήσειν μηχανᾶται τοιάδε.",
"φιλόπαις δ' ἦν ἐκμανῶς καὶ Ἀλέξανδρος ὁ βασιλεύς.",
"Ἀντίγονος ὁ βασιλεὺς ἐπεκώμαζε τῷ Ζήνωνι",
"αὐτὰρ ὃ δεύτατος ἦλθεν ἄναξ ἀνδρῶν Ἀγαμέμνων ἕλκος ἔχων",
]

314
spacy/lang/grc/lex_attrs.py Normal file
View File

@ -0,0 +1,314 @@
from ...attrs import LIKE_NUM
_num_words = [
# CARDINALS
"εἷς",
"ἑνός",
"ἑνί",
"ἕνα",
"μία",
"μιᾶς",
"μιᾷ",
"μίαν",
"ἕν",
"δύο",
"δυοῖν",
"τρεῖς",
"τριῶν",
"τρισί",
"τρία",
"τέτταρες",
"τεττάρων",
"τέτταρσι",
"τέτταρα",
"τέτταρας",
"πέντε",
"ἕξ",
"ἑπτά",
"ὀκτώ",
"ἐννέα",
"δέκα",
"ἕνδεκα",
"δώδεκα",
"πεντεκαίδεκα",
"ἑκκαίδεκα",
"ἑπτακαίδεκα",
"ὀκτωκαίδεκα",
"ἐννεακαίδεκα",
"εἴκοσι",
"τριάκοντα",
"τετταράκοντα",
"πεντήκοντα",
"ἑξήκοντα",
"ἑβδομήκοντα",
"ὀγδοήκοντα",
"ἐνενήκοντα",
"ἑκατόν",
"διακόσιοι",
"διακοσίων",
"διακοσιᾶν",
"διακοσίους",
"διακοσίοις",
"διακόσια",
"διακόσιαι",
"διακοσίαις",
"διακοσίαισι",
"διηκόσιοι",
"διηκοσίων",
"διηκοσιέων",
"διακοσίας",
"διηκόσια",
"διηκόσιαι",
"διηκοσίας",
"τριακόσιοι",
"τριακοσίων",
"τριακοσιᾶν",
"τριακοσίους",
"τριακοσίοις",
"τριακόσια",
"τριακόσιαι",
"τριακοσίαις",
"τριακοσίαισι",
"τριακοσιέων",
"τριακοσίας",
"τριηκόσια",
"τριηκοσίας",
"τριηκόσιοι",
"τριηκοσίοισιν",
"τριηκοσίους",
"τριηκοσίων",
"τετρακόσιοι",
"τετρακοσίων",
"τετρακοσιᾶν",
"τετρακοσίους",
"τετρακοσίοις",
"τετρακόσια",
"τετρακόσιαι",
"τετρακοσίαις",
"τετρακοσίαισι",
"τετρακοσιέων",
"τετρακοσίας",
"πεντακόσιοι",
"πεντακοσίων",
"πεντακοσιᾶν",
"πεντακοσίους",
"πεντακοσίοις",
"πεντακόσια",
"πεντακόσιαι",
"πεντακοσίαις",
"πεντακοσίαισι",
"πεντακοσιέων",
"πεντακοσίας",
"ἑξακόσιοι",
"ἑξακοσίων",
"ἑξακοσιᾶν",
"ἑξακοσίους",
"ἑξακοσίοις",
"ἑξακόσια",
"ἑξακόσιαι",
"ἑξακοσίαις",
"ἑξακοσίαισι",
"ἑξακοσιέων",
"ἑξακοσίας",
"ἑπτακόσιοι",
"ἑπτακοσίων",
"ἑπτακοσιᾶν",
"ἑπτακοσίους",
"ἑπτακοσίοις",
"ἑπτακόσια",
"ἑπτακόσιαι",
"ἑπτακοσίαις",
"ἑπτακοσίαισι",
"ἑπτακοσιέων",
"ἑπτακοσίας",
"ὀκτακόσιοι",
"ὀκτακοσίων",
"ὀκτακοσιᾶν",
"ὀκτακοσίους",
"ὀκτακοσίοις",
"ὀκτακόσια",
"ὀκτακόσιαι",
"ὀκτακοσίαις",
"ὀκτακοσίαισι",
"ὀκτακοσιέων",
"ὀκτακοσίας",
"ἐνακόσιοι",
"ἐνακοσίων",
"ἐνακοσιᾶν",
"ἐνακοσίους",
"ἐνακοσίοις",
"ἐνακόσια",
"ἐνακόσιαι",
"ἐνακοσίαις",
"ἐνακοσίαισι",
"ἐνακοσιέων",
"ἐνακοσίας",
"χίλιοι",
"χιλίων",
"χιλιῶν",
"χιλίους",
"χιλίοις",
"χίλιαι",
"χιλίας",
"χιλίαις",
"χίλια",
"χίλι",
"δισχίλιοι",
"δισχιλίων",
"δισχιλιῶν",
"δισχιλίους",
"δισχιλίοις",
"δισχίλιαι",
"δισχιλίας",
"δισχιλίαις",
"δισχίλια",
"δισχίλι",
"τρισχίλιοι",
"τρισχιλίων",
"τρισχιλιῶν",
"τρισχιλίους",
"τρισχιλίοις",
"τρισχίλιαι",
"τρισχιλίας",
"τρισχιλίαις",
"τρισχίλια",
"τρισχίλι",
"μύριοι",
"μύριοί",
"μυρίων",
"μυρίοις",
"μυρίους",
"μύριαι",
"μυρίαις",
"μυρίας",
"μύρια",
"δισμύριοι",
"δισμύριοί",
"δισμυρίων",
"δισμυρίοις",
"δισμυρίους",
"δισμύριαι",
"δισμυρίαις",
"δισμυρίας",
"δισμύρια",
"δεκακισμύριοι",
"δεκακισμύριοί",
"δεκακισμυρίων",
"δεκακισμυρίοις",
"δεκακισμυρίους",
"δεκακισμύριαι",
"δεκακισμυρίαις",
"δεκακισμυρίας",
"δεκακισμύρια",
# ANCIENT GREEK NUMBERS (1-100)
"α",
"β",
"γ",
"δ",
"ε",
"ϛ",
"ζ",
"η",
"θ",
"ι",
"ια",
"ιβ",
"ιγ",
"ιδ",
"ιε",
"ιϛ",
"ιζ",
"ιη",
"ιθ",
"κ",
"κα",
"κβ",
"κγ",
"κδ",
"κε",
"κϛ",
"κζ",
"κη",
"κθ",
"λ",
"λα",
"λβ",
"λγ",
"λδ",
"λε",
"λϛ",
"λζ",
"λη",
"λθ",
"μ",
"μα",
"μβ",
"μγ",
"μδ",
"με",
"μϛ",
"μζ",
"μη",
"μθ",
"ν",
"να",
"νβ",
"νγ",
"νδ",
"νε",
"νϛ",
"νζ",
"νη",
"νθ",
"ξ",
"ξα",
"ξβ",
"ξγ",
"ξδ",
"ξε",
"ξϛ",
"ξζ",
"ξη",
"ξθ",
"ο",
"οα",
"οβ",
"ογ",
"οδ",
"οε",
"οϛ",
"οζ",
"οη",
"οθ",
"π",
"πα",
"πβ",
"πγ",
"πδ",
"πε",
"πϛ",
"πζ",
"πη",
"πθ",
"ϟ",
"ϟα",
"ϟβ",
"ϟγ",
"ϟδ",
"ϟε",
"ϟϛ",
"ϟζ",
"ϟη",
"ϟθ",
"ρ",
]
def like_num(text):
if text.lower() in _num_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -0,0 +1,61 @@
STOP_WORDS = set(
"""
αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ
αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς
αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν
γε γ' γέ γὰρ γάρ δαῖτα δαιτὸς δαιτὶ δαὶ δαιτί δαῖτ' δαΐδας δαΐδων δἰ διὰ διά δὲ δ' δέ δὴ δή εἰ εἴ κεἰ κεἴ αἴ αἲ εἲ αἰ
ἐστί ἐστιν ὢν ἦν ἐστὶν ὦσιν εἶναι ὄντι εἰσιν ἐστι ὄντα οὖσαν ἦσαν ἔστι ὄντας ἐστὲ εἰσὶ εἶ ὤν οὖσαι ἔσται ἐσμὲν ἐστ' ἐστίν ἔστ' ἔσει ἦμεν εἰμι εἰσὶν ἦσθ'
ἐστὶ οὖσ' ἔστιν εἰμὶ εἴμ' ἐσθ' ᾖς στί εἴην εἶναί οὖσα κἄστ' εἴη ἦσθα εἰμ' ἔστω ὄντ' ἔσθ' ἔμμεναι ἔω ἐὼν ἐσσι ἔσσεται ἐστὸν ἔσαν ἔστων ἐόντα ἦεν ἐοῦσαν ἔην
ἔσσομαι εἰσί ἐστόν ἔσκεν ἐόντ' ἐών ἔσσεσθ' εἰσ' ἐόντες ἐόντε ἐσσεῖται εἰμεν ἔασιν ἔσκε ἔμεναι ἔσεσθαι ἔῃ εἰμὲν εἰσι ἐόντας ἔστε εἰς ἦτε εἰμί ἔσσεαι ἔμμεν
ἐοῦσα ἔμεν ᾖσιν ἐστε ἐόντι εἶεν ἔσσονται ἔησθα ἔσεσθε ἐσσί ἐοῦσ' ἔασι ἔα ἦα ἐόν ἔσσεσθαι ἔσομαι ἔσκον εἴης ἔωσιν εἴησαν ἐὸν ἐουσέων ἔσσῃ ἐούσης ἔσονται
ἐούσας ἐόντων ἐόντος ἐσομένην ἔστωσαν ἔωσι ἔας ἐοῦσαι ἣν εἰσίν ἤστην ὄντες ὄντων οὔσας οὔσαις ὄντος οὖσι οὔσης ἔσῃ ὂν ἐσμεν ἐσμέν οὖσιν ἐσομένους ἐσσόμεσθα
ἒς ἐς ἔς ἐν κεἰς εἲς κἀν ἔν κατὰ κατ' καθ' κατά κάτα κὰπ κὰκ κὰδ κὰρ κάρ κὰγ κὰμ καὶ καί μετὰ μεθ' μετ' μέτα μετά μέθ' μέτ' μὲν μέν μὴ
μή μη οὐκ οὒ οὐ οὐχ οὐχὶ κοὐ κοὐχ οὔ κοὐκ οὐχί οὐκὶ οὐδὲν οὐδεὶς οὐδέν κοὐδεὶς κοὐδὲν οὐδένα οὐδενὸς οὐδέν' οὐδενός οὐδενὶ
οὐδεμία οὐδείς οὐδεμίαν οὐδὲ οὐδ' κοὐδ' οὐδέ οὔτε οὔθ' οὔτέ τε οὔτ' οὕτως οὕτω οὕτῶ χοὔτως οὖν ὦν ὧν τοῦτο τοῦθ' τοῦτον τούτῳ
τούτοις ταύτας αὕτη ταῦτα οὗτος ταύτης ταύτην τούτων ταῦτ' τοῦτ' τούτου αὗται τούτους τοῦτό ταῦτά τούτοισι χαὔτη ταῦθ' χοὖτοι
τούτοισιν οὗτός οὗτοι τούτω τουτέων τοῦτὸν οὗτοί τοῦτου οὗτοὶ ταύτῃσι ταύταις ταυτὶ παρὰ παρ' πάρα παρά πὰρ παραὶ πάρ' περὶ
πέρι περί πρὸς πρός ποτ' ποτὶ προτὶ προτί πότι
σὸς σήν σὴν σὸν σόν σὰ σῶν σοῖσιν σός σῆς σῷ σαῖς σῇ σοῖς σοῦ σ' σὰν σά σὴ σὰς
σᾷ σοὺς σούς σοῖσι σῇς σῇσι σή σῇσιν σοὶ σου ὑμεῖς σὲ σύ σοι ὑμᾶς ὑμῶν ὑμῖν σε
σέ σὺ σέθεν σοί ὑμὶν σφῷν ὑμίν τοι τοὶ σφὼ ὔμμ' σφῶϊ σεῖο τ' σφῶϊν ὔμμιν σέο σευ σεῦ
ὔμμι ὑμέων τύνη ὑμείων τοί ὔμμες σεο τέ τεοῖο ὑμέας σὺν ξὺν σύν
θ' τί τι τις τινες τινα τινος τινὸς τινὶ τινῶν τίς τίνες τινὰς τιν' τῳ του τίνα τοῦ τῷ τινί τινά τίνος τινι τινας τινὰ τινων
τίν' τευ τέο τινές τεο τινὲς τεῷ τέῳ τινός τεῳ τισὶ
τοιαῦτα τοιοῦτον τοιοῦθ' τοιοῦτος τοιαύτην τοιαῦτ' τοιούτου τοιαῦθ' τοιαύτῃ τοιούτοις τοιαῦται τοιαῦτά τοιαύτη τοιοῦτοι τοιούτων τοιούτοισι
τοιοῦτο τοιούτους τοιούτῳ τοιαύτης τοιαύταις τοιαύτας τοιοῦτός τίνι τοῖσι τίνων τέων τέοισί τὰ τῇ τώ τὼ
ἀλλὰ ἀλλ' ἀλλά ἀπ' ἀπὸ κἀπ' ἀφ' τἀπὸ κἀφ' ἄπο ἀπό τὠπὸ τἀπ' ἄλλων ἄλλῳ ἄλλη ἄλλης ἄλλους ἄλλοις ἄλλον ἄλλο ἄλλου τἄλλα ἄλλα
ἄλλᾳ ἄλλοισιν τἄλλ' ἄλλ' ἄλλος ἄλλοισι κἄλλ' ἄλλοι ἄλλῃσι ἄλλόν ἄλλην ἄλλά ἄλλαι ἄλλοισίν ὧλλοι ἄλλῃ ἄλλας ἀλλέων τἆλλα ἄλλως
ἀλλάων ἄλλαις τἆλλ'
ἂν ἄν κἂν τἂν ἃν κεν κ' κέν κέ κε χ' ἄρα τἄρα ἄρ' τἄρ' ἄρ ῥα ῥά τὰρ ἄρά ἂρ
ἡμᾶς με ἐγὼ ἐμὲ μοι κἀγὼ ἡμῶν ἡμεῖς ἐμοὶ ἔγωγ' ἁμοὶ ἡμῖν μ' ἔγωγέ ἐγώ ἐμοί ἐμοῦ κἀμοῦ ἔμ' κἀμὲ ἡμὶν μου ἐμέ ἔγωγε νῷν νὼ χἠμεῖς ἁμὲ κἀγώ κἀμοὶ χἠμᾶς
ἁγὼ ἡμίν κἄμ' ἔμοιγ' μοί τοὐμὲ ἄμμε ἐγὼν ἐμεῦ ἐμεῖο μευ ἔμοιγε ἄμμι μέ ἡμέας νῶϊ ἄμμιν ἧμιν ἐγών νῶΐ ἐμέθεν ἥμιν ἄμμες νῶι ἡμείων ἄμμ' ἡμέων ἐμέο
ἐκ ἔκ ἐξ κἀκ κ ἃκ κἀξ ἔξ εξ Ἐκ τἀμὰ ἐμοῖς τοὐμόν ἐμᾶς τοὐμὸν ἐμῶν ἐμὸς ἐμῆς ἐμῷ τὠμῷ ἐμὸν τἄμ' ἐμὴ ἐμὰς ἐμαῖς ἐμὴν ἐμόν ἐμὰ ἐμός ἐμοὺς ἐμῇ ἐμᾷ
οὑμὸς ἐμοῖν οὑμός κἀμὸν ἐμαὶ ἐμή ἐμάς ἐμοῖσι ἐμοῖσιν ἐμῇσιν ἐμῇσι ἐμῇς ἐμήν
ἔνι ἐνὶ εἰνὶ εἰν ἐμ ἐπὶ ἐπ' ἔπι ἐφ' κἀπὶ τἀπὶ ἐπί ἔφ' ἔπ' ἐὰν ἢν ἐάν ἤν ἄνπερ
αὑτοῖς αὑτὸν αὑτῷ ἑαυτοῦ αὑτόν αὑτῆς αὑτῶν αὑτοῦ αὑτὴν αὑτοῖν χαὐτοῦ αὑταῖς ἑωυτοῦ ἑωυτῇ ἑωυτὸν ἐωυτῷ ἑωυτῆς ἑωυτόν ἑωυτῷ
ἑωυτάς ἑωυτῶν ἑωυτοὺς ἑωυτοῖσι ἑαυτῇ ἑαυτούς αὑτοὺς ἑαυτῶν ἑαυτοὺς ἑαυτὸν ἑαυτῷ ἑαυτοῖς ἑαυτὴν ἑαυτῆς
ἔτι ἔτ' ἔθ' κἄτι ἠέ ἠὲ ἦε ἦέ τοὺς τὴν τὸ τῶν τὸν οἱ τοῖς ταῖς τῆς τὰς αἱ τό τὰν τᾶς τοῖσιν αἳ χὠ τήν τά τοῖν τάς
χοἰ χἠ τάν τᾶν οἳ οἵ τοῖο τόν τοῖιν τούς τάων ταὶ τῇς τῇσι τῇσιν αἵ τοῖό τοῖσίν ὅττί ταί Τὴν τῆ τῶ τάδε ὅδε τοῦδε τόδε τόνδ'
τάδ' τῆσδε τῷδε ὅδ' τῶνδ' τῇδ' τοῦδέ τῶνδε τόνδε τόδ' τοῦδ' τάσδε τήνδε τάσδ' τήνδ' ταῖσδέ τῇδε τῆσδ' τάνδ' τῷδ' τάνδε ἅδε τοῖσδ' ἥδ'
τᾷδέ τοῖσδε τούσδ' ἥδε τούσδε τώδ' ἅδ' οἵδ' τῶνδέ οἵδε τᾷδε τοῖσδεσσι τώδε τῇδέ τοῖσιδε αἵδε τοῦδὲ τῆδ' αἵδ' τοῖσδεσι ὃν ὃς οὗ ἅπερ
οὓς ἧς οἷς ἅσπερ χὦνπερ αἷς ὅς ἥπερ ἃς ὅσπερ ὅνπερ ὧνπερ ᾧπερ ὅν αἷν οἷσι ἇς ἅς οὕς ἥν οἷσιν ἕης ὅου ᾗς οἷσί οἷσίν τοῖσί ᾗσιν οἵπερ αἷσπερ
ὅστις ἥτις ὅτου ὅτοισι ἥντιν' ὅτῳ ὅντιν' ὅττι ἅσσά ὅτεῳ ὅτις ὅτιν' ὅτευ ἥντινα αἵτινές ὅντινα ἅσσα ᾧτινι οἵτινες ὅτι ἅτις ὅτ' ὑμὴ
ὑμήν ὑμὸν ὑπὲρ ὕπερ ὑπέρτερον ὑπεὶρ ὑπέρτατος ὑπὸ ὑπ' ὑφ' ὕπο ὑπαὶ ὑπό ὕπ' ὕφ'
ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ'
""".split()
)

View File

@ -0,0 +1,115 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {}
for token in ["᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ"]:
_exc[token] = [{ORTH: token, NORM: "από"}]
for token in ["᾽Αλλ'", "ἀλλ'", "ἀλλὰ"]:
_exc[token] = [{ORTH: token, NORM: "ἀλλά"}]
for token in ["παρ'", "Παρ'", "παρὰ", "παρ"]:
_exc[token] = [{ORTH: token, NORM: "παρά"}]
for token in ["καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ"]:
_exc[token] = [{ORTH: token, NORM: "κατά"}]
for token in ["Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'"]:
_exc[token] = [{ORTH: token, NORM: "επί"}]
for token in ["Δι'", "δι'", "διὰ"]:
_exc[token] = [{ORTH: token, NORM: "διά"}]
for token in ["Ὑπ'", "ὑπ'", "ὑφ'"]:
_exc[token] = [{ORTH: token, NORM: "ὑπό"}]
for token in ["Μετ'", "μετ'", "μεθ'", "μετὰ"]:
_exc[token] = [{ORTH: token, NORM: "μετά"}]
for token in ["Μ'", "μ'", "μέ", "μὲ"]:
_exc[token] = [{ORTH: token, NORM: "με"}]
for token in ["Σ'", "σ'", "σέ", "σὲ"]:
_exc[token] = [{ORTH: token, NORM: "σε"}]
for token in ["Τ'", "τ'", "τέ", "τὲ"]:
_exc[token] = [{ORTH: token, NORM: "τε"}]
for token in ["Δ'", "δ'", "δὲ"]:
_exc[token] = [{ORTH: token, NORM: "δέ"}]
_other_exc = {
"μὲν": [{ORTH: "μὲν", NORM: "μέν"}],
"μὴν": [{ORTH: "μὴν", NORM: "μήν"}],
"τὴν": [{ORTH: "τὴν", NORM: "τήν"}],
"τὸν": [{ORTH: "τὸν", NORM: "τόν"}],
"καὶ": [{ORTH: "καὶ", NORM: "καί"}],
"καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}],
"καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}],
"κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}],
"χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: ""}],
"χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}],
"χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}],
"κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}],
"κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
"κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}],
"ἁγώ": [{ORTH: "", NORM: ""}, {ORTH: "γώ", NORM: "ἐγώ"}],
"ἁγὼ": [{ORTH: "", NORM: ""}, {ORTH: "γὼ", NORM: "ἐγώ"}],
"ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}],
"ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}],
"κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}],
"κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}],
"κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}],
"κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}],
"χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}],
"χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}],
"χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}],
"χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}],
"οὑμός": [{ORTH: "οὑ", NORM: ""}, {ORTH: "μός", NORM: "ἐμός"}],
"οὑμὸς": [{ORTH: "οὑ", NORM: ""}, {ORTH: "μὸς", NORM: "ἐμός"}],
"οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμoί"}],
"οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμoί"}],
"σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}],
"σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}],
"σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}],
"μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
"μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
"τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}],
"οὑν": [{ORTH: "οὑ", NORM: ""}, {ORTH: "ν", NORM: "ἐν"}],
"ὦνερ": [{ORTH: "", NORM: ""}, {ORTH: "νερ", NORM: "ἄνερ"}],
"ὦνδρες": [{ORTH: "", NORM: ""}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
"προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}],
"προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}],
"ὥνεκα": [{ORTH: "", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
"θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}],
"ὥνεκα": [{ORTH: "", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
"τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}],
"θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}],
"ἅνθρωπος": [{ORTH: "", NORM: ""}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}],
"τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: "ἄλλα"}],
"τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}],
"ἁνήρ": [{ORTH: "", NORM: ""}, {ORTH: "νήρ", NORM: "ἀνήρ"}],
"ἁνὴρ": [{ORTH: "", NORM: ""}, {ORTH: "νὴρ", NORM: "ἀνήρ"}],
"ἅνδρες": [{ORTH: "", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
"ἁγαθαί": [{ORTH: "", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}],
"ἁγαθαὶ": [{ORTH: "", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}],
"ἁλήθεια": [{ORTH: "", NORM: ""}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}],
"τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}],
"τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}],
"τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}],
"τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}],
"αὑτός": [{ORTH: "αὑ", NORM: ""}, {ORTH: "τός", NORM: "αὐτός"}],
"αὑτὸς": [{ORTH: "αὑ", NORM: ""}, {ORTH: "τὸς", NORM: "αὐτός"}],
"ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}],
}
_exc.update(_other_exc)
_exc_data = {}
_exc.update(_exc_data)
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,7 +1,11 @@
from typing import Optional
from thinc.api import Model
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language
from .lemmatizer import ItalianLemmatizer
class ItalianDefaults(Language.Defaults):
@ -16,4 +20,16 @@ class Italian(Language):
Defaults = ItalianDefaults
@Italian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Italian"]

132
spacy/lang/it/lemmatizer.py Normal file
View File

@ -0,0 +1,132 @@
from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class ItalianLemmatizer(Lemmatizer):
"""This lemmatizer was adapted from the Polish one (version of April 2021).
It implements lookup lemmatization based on the morphological lexicon
morph-it (Baroni and Zanchetta). The table lemma_lookup with non-POS-aware
entries is used as a backup for words that aren't handled by morph-it."""
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "pos_lookup":
required = [
"lemma_lookup_num",
"lemma_lookup_det",
"lemma_lookup_adp",
"lemma_lookup_adj",
"lemma_lookup_noun",
"lemma_lookup_pron",
"lemma_lookup_verb",
"lemma_lookup_aux",
"lemma_lookup_adv",
"lemma_lookup_other",
"lemma_lookup",
]
return (required, [])
else:
return super().get_lookups_config(mode)
def pos_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text
univ_pos = token.pos_
morphology = token.morph.to_dict()
lookup_pos = univ_pos.lower()
if univ_pos == "PROPN":
lookup_pos = "noun"
elif univ_pos == "PART":
lookup_pos = "pron"
lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
if univ_pos == "NOUN":
return self.lemmatize_noun(string, morphology, lookup_table)
else:
if univ_pos != "PROPN":
string = string.lower()
if univ_pos == "DET":
return self.lemmatize_det(string, morphology, lookup_table)
elif univ_pos == "PRON":
return self.lemmatize_pron(string, morphology, lookup_table)
elif univ_pos == "ADP":
return self.lemmatize_adp(string, morphology, lookup_table)
elif univ_pos == "ADJ":
return self.lemmatize_adj(string, morphology, lookup_table)
else:
lemma = lookup_table.get(string, "")
if not lemma:
lookup_table = self.lookups.get_table("lemma_lookup_other")
lemma = lookup_table.get(string, "")
if not lemma:
lookup_table = self.lookups.get_table(
"lemma_lookup"
) # "legacy" lookup table
lemma = lookup_table.get(string, string.lower())
return [lemma]
def lemmatize_det(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
if string in [
"l'",
"lo",
"la",
"i",
"gli",
"le",
]:
return ["il"]
if string in ["un'", "un", "una"]:
return ["uno"]
return [lookup_table.get(string, string)]
def lemmatize_pron(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
if string in [
"l'",
"li",
"la",
"gli",
"le",
]:
return ["lo"]
if string in ["un'", "un", "una"]:
return ["uno"]
lemma = lookup_table.get(string, string)
if lemma == "alcun":
lemma = "alcuno"
elif lemma == "qualcun":
lemma = "qualcuno"
return [lemma]
def lemmatize_adp(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
if string == "d'":
return ["di"]
return [lookup_table.get(string, string)]
def lemmatize_adj(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
lemma = lookup_table.get(string, string)
if lemma == "alcun":
lemma = "alcuno"
elif lemma == "qualcun":
lemma = "qualcuno"
return [lemma]
def lemmatize_noun(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
# this method is case-sensitive, in order to work
# for incorrectly tagged proper names
if string != string.lower():
if string.lower() in lookup_table:
return [lookup_table[string.lower()]]
elif string in lookup_table:
return [lookup_table[string]]
return [string.lower()]
return [lookup_table.get(string, string)]
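The factory above makes "pos_lookup" the default lemmatizer mode for Italian, backed by the morph-it tables. A hedged setup sketch: it assumes a recent spacy-lookups-data is installed (that package supplies the lemma_lookup_* tables) and that POS tags are set, normally by a trained tagger, since the lookup is POS-aware:

import spacy

nlp = spacy.blank("it")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pos_lookup"})
nlp.initialize()  # loads the lookup tables (requires spacy-lookups-data)

doc = nlp.make_doc("le case")
for token, pos in zip(doc, ["DET", "NOUN"]):
    token.pos_ = pos  # stand-in for what a trained tagger would assign
lemmatizer(doc)
print([t.lemma_ for t in doc])  # something like ['il', 'casa']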

View File

@ -25,7 +25,7 @@ for orth in [
"artt.",
"att.",
"avv.",
"Avv."
"Avv.",
"by-pass",
"c.d.",
"c/c",

View File

@ -27,7 +27,7 @@ _infixes = (
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),

View File

@ -1,12 +1,14 @@
from typing import Optional
from thinc.api import Model
from .stop_words import STOP_WORDS
from .lemmatizer import DutchLemmatizer
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language
@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS

View File

@ -0,0 +1,72 @@
from typing import Union, Iterator
from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""
Detect base noun phrases from a dependency parse. Works on Doc and Span.
The definition is inspired by https://www.nltk.org/book/ch07.html
Consider: [Noun + determiner / adjective] and also [Pronoun]
"""
# fmt: off
# labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
# Check for dependencies: POS, DEP
if not doc.has_annotation("POS"):
raise ValueError(Errors.E1019)
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
# See UD tags: https://universaldependencies.org/u/dep/index.html
# amod = adjectival modifier
# nmod:poss = possessive nominal modifier
# nummod = numeric modifier
# det = determiner
# det:poss = possessive determiner
noun_deps = [
doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
]
# nsubj = nominal subject
# nsubj:pass = passive nominal subject
pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]
# Label NP for the Span to identify it as Noun-Phrase
span_label = doc.vocab.strings.add("NP")
# Only NOUNS and PRONOUNS matter
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
# For NOUNS
# Pick children from syntactic parse (only those with certain dependencies)
if word.pos == NOUN:
# Some debugging: it happens that VERBS are POS-tagged as NOUNS.
# We check whether the word has an "nsubj" child; if so, we skip it.
nsubjs = filter(
lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
)
next_word = next(nsubjs, None)
if next_word is not None:
# We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN
continue
children = filter(lambda x: x.dep in noun_deps, word.children)
children_i = [c.i for c in children] + [word.i]
start_span = min(children_i)
end_span = max(children_i) + 1
yield start_span, end_span, span_label
# PRONOUNS only if it is the subject of a verb
elif word.pos == PRON:
if word.dep in pronoun_deps:
start_span = word.i
end_span = word.i + 1
yield start_span, end_span, span_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
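The iterator yields NP spans for nouns together with their adjectival, possessive and determiner children, and for pronouns only when they act as (passive) nominal subjects. It needs POS tags and a dependency parse, so the sketch below assumes the trained nl_core_news_sm package is installed:

import spacy

nlp = spacy.load("nl_core_news_sm")  # provides the Dutch tagger and parser
doc = nlp("De snelle bruine vos springt over de luie hond.")
print([chunk.text for chunk in doc.noun_chunks])
# e.g. ['De snelle bruine vos', 'de luie hond']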

View File

@ -12,8 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'}
class RussianLemmatizer(Lemmatizer):
_morph = None
def __init__(
self,
vocab: Vocab,
@ -23,15 +21,16 @@ class RussianLemmatizer(Lemmatizer):
mode: str = "pymorphy2",
overwrite: bool = False,
) -> None:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian lemmatizer requires the pymorphy2 library: "
'try to fix it with "pip install pymorphy2"'
) from None
if RussianLemmatizer._morph is None:
RussianLemmatizer._morph = MorphAnalyzer()
if mode == "pymorphy2":
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library. Install it with: pip install pymorphy2"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:

View File

@ -35,8 +35,8 @@ URL_PATTERN = (
# host & domain names
# mods: match is case-sensitive, so include [A-Z]
r"(?:" # noqa: E131
r"(?:"
r"[A-Za-z0-9\u00a1-\uffff]"
r"(?:" # noqa: E131
r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131
r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
r")?"
r"[A-Za-z0-9\u00a1-\uffff]\."

View File

@ -7,8 +7,6 @@ from ...vocab import Vocab
class UkrainianLemmatizer(RussianLemmatizer):
_morph = None
def __init__(
self,
vocab: Vocab,
@ -18,14 +16,15 @@ class UkrainianLemmatizer(RussianLemmatizer):
mode: str = "pymorphy2",
overwrite: bool = False,
) -> None:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Ukrainian lemmatizer requires the pymorphy2 library and "
"dictionaries: try to fix it with "
'"pip install pymorphy2 pymorphy2-dicts-uk"'
) from None
if UkrainianLemmatizer._morph is None:
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
if mode == "pymorphy2":
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Ukrainian lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library and dictionaries. Install them with: "
"pip install pymorphy2 pymorphy2-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
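For both lemmatizers, moving the pymorphy2 import under the mode check keeps the dependency optional and turns the analyzer into a per-instance attribute, so alternative modes or subclasses no longer need pymorphy2 installed. Typical usage is unchanged; a hedged sketch (pymorphy2 must be installed for mode="pymorphy2", plus pymorphy2-dicts-uk for Ukrainian):

import spacy

nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})
nlp.initialize()
doc = nlp("мама мыла раму")
print([t.lemma_ for t in doc])  # e.g. ['мама', 'мыть', 'рама']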

View File

@ -1,4 +1,5 @@
from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
from typing import Iterator, Optional, Any, Dict, Callable, Iterable, TypeVar
from typing import Union, List, Pattern, overload
from typing import Tuple
from dataclasses import dataclass
import random
@ -13,6 +14,7 @@ import srsly
import multiprocessing as mp
from itertools import chain, cycle
from timeit import default_timer as timer
import traceback
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
@ -433,9 +435,9 @@ class Language:
default_config (Dict[str, Any]): Default configuration, describing the
default values of the factory arguments.
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
e.g. "token.ent_id". Used for pipeline analyis.
e.g. "token.ent_id". Used for pipeline analysis.
requires (Iterable[str]): Doc/Token attributes required by this component,
e.g. "token.ent_id". Used for pipeline analyis.
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
default_score_weights (Dict[str, float]): The scores to report during
@ -518,9 +520,9 @@ class Language:
name (str): The name of the component factory.
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
e.g. "token.ent_id". Used for pipeline analyis.
e.g. "token.ent_id". Used for pipeline analysis.
requires (Iterable[str]): Doc/Token attributes required by this component,
e.g. "token.ent_id". Used for pipeline analyis.
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
func (Optional[Callable]): Factory function if not used as a decorator.
@ -686,11 +688,13 @@ class Language:
if not isinstance(source, Language):
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
# Check vectors, with faster checks first
if self.vocab.vectors.shape != source.vocab.vectors.shape or \
self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
if (
self.vocab.vectors.shape != source.vocab.vectors.shape
or self.vocab.vectors.key2row != source.vocab.vectors.key2row
or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
):
warnings.warn(Warnings.W113.format(name=source_name))
if not source_name in source.component_names:
if source_name not in source.component_names:
raise KeyError(
Errors.E944.format(
name=source_name,
@ -868,14 +872,14 @@ class Language:
DOCS: https://spacy.io/api/language#replace_pipe
"""
if name not in self.pipe_names:
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if hasattr(factory_name, "__call__"):
err = Errors.E968.format(component=repr(factory_name), name=name)
raise ValueError(err)
# We need to delegate to Language.add_pipe here instead of just writing
# to Language.pipeline to make sure the configs are handled correctly
pipe_index = self.pipe_names.index(name)
pipe_index = self.component_names.index(name)
self.remove_pipe(name)
if not len(self._components) or pipe_index == len(self._components):
# we have no components to insert before/after, or we're replacing the last component
@ -931,6 +935,7 @@ class Language:
# because factory may be used for something else
self._pipe_meta.pop(name)
self._pipe_configs.pop(name)
self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
# Make sure name is removed from the [initialize] config
if name in self._config["initialize"]["components"]:
self._config["initialize"]["components"].pop(name)
@ -1427,7 +1432,22 @@ class Language:
except StopIteration:
pass
_AnyContext = TypeVar("_AnyContext")
@overload
def pipe(
self,
texts: Iterable[Tuple[str, _AnyContext]],
*,
as_tuples: bool = ...,
batch_size: Optional[int] = ...,
disable: Iterable[str] = ...,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
n_process: int = ...,
) -> Iterator[Tuple[Doc, _AnyContext]]:
...
def pipe( # noqa: F811
self,
texts: Iterable[str],
*,
@ -1436,7 +1456,7 @@ class Language:
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
):
) -> Iterator[Doc]:
"""Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[str]): A sequence of texts to process.
@ -1538,11 +1558,21 @@ class Language:
# Cycle channels not to break the order of docs.
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
byte_tuples = chain.from_iterable(
recv.recv() for recv in cycle(bytedocs_recv_ch)
)
try:
for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
yield doc
for i, (_, (byte_doc, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1
):
if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc)
yield doc
elif byte_error is not None:
error = srsly.msgpack_loads(byte_error)
self.default_error_handler(
None, None, None, ValueError(Errors.E871.format(error=error))
)
if i % batch_size == 0:
# tell `sender` that one batch was consumed.
sender.step()
@ -1667,6 +1697,8 @@ class Language:
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
source_nlp_vectors_hashes = {}
nlp.meta["_sourced_vectors_hashes"] = {}
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
@ -1691,17 +1723,33 @@ class Language:
else:
model = pipe_cfg["source"]
if model not in source_nlps:
# We only need the components here and we need to init
# model with the same vocab as the current nlp object
source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
# We only need the components here and we intentionally
# do not load the model with the same vocab because
# this would cause the vectors to be copied into the
# current nlp object (all the strings will be added in
# create_pipe_from_source)
source_nlps[model] = util.load_model(model)
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"])
source_nlps[model].replace_listeners(
name, source_name, pipe_cfg["replace_listeners"]
)
listeners_replaced = True
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W113\\]")
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes()
)
nlp.meta["_sourced_vectors_hashes"][
pipe_name
] = source_nlp_vectors_hashes[model]
# Delete from cache if listeners were replaced
if listeners_replaced:
del source_nlps[model]
@ -1719,12 +1767,16 @@ class Language:
for name, proc in nlp.pipeline:
# Remove listeners not in the pipeline
listener_names = getattr(proc, "listening_components", [])
unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
unused_listener_names = [
ll for ll in listener_names if ll not in nlp.pipe_names
]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
for listener in getattr(
proc, "listening_components", []
): # e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so component can be frozen without its performance
@ -1781,6 +1833,7 @@ class Language:
raise ValueError(err)
tok2vec = self.get_pipe(tok2vec_name)
tok2vec_cfg = self.get_pipe_config(tok2vec_name)
tok2vec_model = tok2vec.model
if (
not hasattr(tok2vec, "model")
or not hasattr(tok2vec, "listener_map")
@ -1789,6 +1842,7 @@ class Language:
):
raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
@ -1803,7 +1857,6 @@ class Language:
n_listeners=len(pipe_listeners),
)
raise ValueError(err)
pipe = self.get_pipe(pipe_name)
# Update the config accordingly by copying the tok2vec model to all
# sections defined in the listener paths
for listener_path in listeners:
@ -1815,10 +1868,19 @@ class Language:
name=pipe_name, tok2vec=tok2vec_name, path=listener_path
)
raise ValueError(err)
util.set_dot_to_object(pipe_cfg, listener_path, tok2vec_cfg["model"])
new_config = tok2vec_cfg["model"]
if "replace_listener_cfg" in tok2vec_model.attrs:
replace_func = tok2vec_model.attrs["replace_listener_cfg"]
new_config = replace_func(
tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
)
util.set_dot_to_object(pipe_cfg, listener_path, new_config)
# Go over the listener layers and replace them
for listener in pipe_listeners:
util.replace_model_node(pipe.model, listener, tok2vec.model.copy())
new_model = tok2vec_model.copy()
if "replace_listener" in tok2vec_model.attrs:
new_model = tok2vec_model.attrs["replace_listener"](new_model)
util.replace_model_node(pipe.model, listener, new_model)
tok2vec.remove_listener(listener, pipe_name)
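This is the machinery behind `Language.replace_listeners`. A sketch of the public call, assuming an installed `en_core_web_sm` pipeline whose `ner` component listens to the shared `tok2vec`:

import spacy

nlp = spacy.load("en_core_web_sm")
# Copy the shared tok2vec model into the ner component and rewire its listener,
# so "ner" no longer depends on the upstream "tok2vec" component (e.g. so the
# rest of the pipeline can be retrained while ner stays frozen).
nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])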
def to_disk(
@ -1850,7 +1912,11 @@ class Language:
util.to_disk(path, serializers, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
self,
path: Union[str, Path],
*,
exclude: Iterable[str] = SimpleFrozenList(),
overrides: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
@ -1879,7 +1945,7 @@ class Language:
deserializers = {}
if Path(path / "config.cfg").exists():
deserializers["config.cfg"] = lambda p: self.config.from_disk(
p, interpolate=False
p, interpolate=False, overrides=overrides
)
deserializers["meta.json"] = deserialize_meta
deserializers["vocab"] = deserialize_vocab
@ -2036,12 +2102,19 @@ def _apply_pipes(
"""
Underscore.load_state(underscore_state)
while True:
texts = receiver.get()
docs = (make_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
# Connection does not accept unpicklable objects, so send list.
sender.send([doc.to_bytes() for doc in docs])
try:
texts = receiver.get()
docs = (make_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
# Connection does not accept unpicklable objects, so send list.
byte_docs = [(doc.to_bytes(), None) for doc in docs]
padding = [(None, None)] * (len(texts) - len(byte_docs))
sender.send(byte_docs + padding)
except Exception:
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None)] * (len(texts) - 1)
sender.send(error_msg + padding)
class _Sender:

View File

@ -163,7 +163,7 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector)
property rank:
"""RETURNS (str): Sequential ID of the lexemes's lexical type, used
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
@ -205,7 +205,7 @@ cdef class Lexeme:
self.c.lower = x
property norm:
"""RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
@ -288,7 +288,7 @@ cdef class Lexeme:
self.c.lower = self.vocab.strings.add(x)
property norm_:
"""RETURNS (str): The lexemes's norm, i.e. a normalised form of the
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):

View File

@ -1,4 +1,4 @@
from typing import Dict, Any, List, Union, Optional
from typing import Any, List, Union, Optional
from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
@ -12,18 +12,16 @@ from .strings import get_string_id
UNSET = object()
def load_lookups(
lang: str, tables: List[str], strict: bool = True
) -> Optional[Dict[str, Any]]:
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty dict if there's no data or if the package
if available. Returns an empty `Lookups` container if there's no data or if the package
is not installed.
lang (str): The language code (corresponds to entry point exposed by
the spacy-lookups-data package).
tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
strict (bool): Whether to raise an error if a table doesn't exist.
RETURNS (Dict[str, Any]): The lookups, keyed by table name.
RETURNS (Lookups): The lookups container containing the loaded tables.
"""
# TODO: import spacy_lookups_data instead of going via entry points here?
lookups = Lookups()

View File

@ -290,7 +290,13 @@ cdef class Matcher:
if on_match is not None:
on_match(self, doc, i, final_matches)
if as_spans:
return [Span(doc, start, end, label=key) for key, start, end in final_matches]
spans = []
for key, start, end in final_matches:
if isinstance(doclike, Span):
start += doclike.start
end += doclike.start
spans.append(Span(doc, start, end, label=key))
return spans
elif with_alignments:
# convert alignments List[Dict[str, int]] --> List[int]
final_matches = []
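With this change, matching over a `Span` with `as_spans=True` returns spans whose offsets point back into the parent `Doc`. A small sketch:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("ACME", [[{"ORTH": "Acme"}]])
doc = nlp("I think Acme Corp is great.")
piece = doc[2:6]  # match over a Span, not the full Doc
for span in matcher(piece, as_spans=True):
    # start/end are offset back into the parent Doc, so span.text is correct
    # even though the match ran over a slice.
    print(span.start, span.end, span.text)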

View File

@ -1,6 +1,9 @@
from thinc.api import Model, normal_init
from ..util import registry
@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model(
"precomputable_affine",

View File

@ -1,8 +1,10 @@
from thinc.api import Model
from ..util import registry
from ..attrs import LOWER
@registry.layers("spacy.extract_ngrams.v1")
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
model = Model("extract_ngrams", forward)
model.attrs["ngram_size"] = ngram_size

60 spacy/ml/extract_spans.py Normal file
View File

@ -0,0 +1,60 @@
from typing import Tuple, Callable
from thinc.api import Model, to_numpy
from thinc.types import Ragged, Ints1d
from ..util import registry
@registry.layers("spacy.extract_spans.v1")
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
"""Extract spans from a sequence of source arrays, as specified by an array
of (start, end) indices. The output is a ragged array of the
extracted spans.
"""
return Model(
"extract_spans", forward, layers=[], refs={}, attrs={}, dims={}, init=init
)
def init(model, X=None, Y=None):
pass
def forward(
model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool
) -> Tuple[Ragged, Callable]:
"""Get subsequences from source vectors."""
ops = model.ops
X, spans = source_spans
assert spans.dataXd.ndim == 2
indices = _get_span_indices(ops, spans, X.lengths)
Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])
x_shape = X.dataXd.shape
x_lengths = X.lengths
def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]:
dX = Ragged(ops.alloc2f(*x_shape), x_lengths)
ops.scatter_add(dX.dataXd, indices, dY.dataXd)
return (dX, spans)
return Y, backprop_windows
def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
"""Construct a flat array that has the indices we want to extract from the
source data. For instance, if we want the spans (5, 9), (8, 10) the
indices will be [5, 6, 7, 8, 8, 9].
"""
spans, lengths = _ensure_cpu(spans, lengths)
indices = []
offset = 0
for i, length in enumerate(lengths):
spans_i = spans[i].dataXd + offset
for j in range(spans_i.shape[0]):
indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))
offset += length
return ops.flatten(indices)
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths))
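A toy run of the layer above, assuming the module is importable as `spacy.ml.extract_spans`: two "documents" of vectors with one span each.

import numpy
from thinc.types import Ragged
from spacy.ml.extract_spans import extract_spans

# Five rows of vectors split into two docs of lengths [3, 2], and one
# (start, end) span per doc, expressed relative to each doc.
X = Ragged(numpy.arange(10, dtype="f").reshape(5, 2), numpy.asarray([3, 2], dtype="i"))
spans = Ragged(numpy.asarray([[0, 2], [0, 1]], dtype="i"), numpy.asarray([1, 1], dtype="i"))
model = extract_spans()
model.initialize()
Y, backprop = model((X, spans), is_train=False)
print(Y.lengths)  # [2 1]: the first span covers 2 rows, the second 1 row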

View File

@ -1,6 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@ -6,12 +6,13 @@ from thinc.api import Model, Maxout, Linear
from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...vocab import Vocab
from ...tokens import Span
@registry.architectures("spacy.EntityLinker.v1")
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
with Model.define_operators({">>": chain, "**": clone}):
token_width = tok2vec.get_dim("nO")
token_width = tok2vec.maybe_get_dim("nO")
output_layer = Linear(nO=nO, nI=token_width)
model = (
tok2vec
@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates

View File

@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance
from ...util import registry
from ...util import registry, OOV_RANK
from ...errors import Errors
from ...attrs import ID
@ -13,7 +13,7 @@ from functools import partial
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from ...vocab import Vocab # noqa: F401
from ...tokens import Doc # noqa: F401
from ...tokens.doc import Doc # noqa: F401
@registry.architectures("spacy.PretrainVectors.v1")
@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
target[ids == OOV_RANK] = 0
d_target, loss = distance(prediction, target)
return loss, d_target
@ -205,7 +206,7 @@ def _apply_mask(
docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
) -> Tuple[numpy.ndarray, List["Doc"]]:
# This needs to be here to avoid circular imports
from ...tokens import Doc # noqa: F811
from ...tokens.doc import Doc # noqa: F811
N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))

View File

@ -10,48 +10,7 @@ from ..tb_framework import TransitionModel
from ...tokens import Doc
@registry.architectures("spacy.TransitionBasedParser.v1")
def transition_parser_v1(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
use_upper: bool = True,
nO: Optional[int] = None,
) -> Model:
return build_tb_parser_model(
tok2vec,
state_type,
extra_state_tokens,
hidden_width,
maxout_pieces,
use_upper,
nO,
)
@registry.architectures("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
use_upper: bool,
nO: Optional[int] = None,
) -> Model:
return build_tb_parser_model(
tok2vec,
state_type,
extra_state_tokens,
hidden_width,
maxout_pieces,
use_upper,
nO,
)
def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],

View File

@ -0,0 +1,54 @@
from typing import List, Tuple
from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
from thinc.types import Ragged, Floats2d
from ...util import registry
from ...tokens import Doc
from ..extract_spans import extract_spans
@registry.layers.register("spacy.LinearLogistic.v1")
def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
"""An output layer for multi-label classification. It uses a linear layer
followed by a logistic activation.
"""
return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
@registry.layers.register("spacy.mean_max_reducer.v1")
def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
"""Reduce sequences by concatenating their mean and max pooled vectors,
and then combine the concatenated vectors with a hidden layer.
"""
return chain(
concatenate(reduce_last(), reduce_first(), reduce_mean(), reduce_max()),
Maxout(nO=hidden_size, normalize=True, dropout=0.0),
)
@registry.architectures.register("spacy.SpanCategorizer.v1")
def build_spancat_model(
tok2vec: Model[List[Doc], List[Floats2d]],
reducer: Model[Ragged, Floats2d],
scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
"""Build a span categorizer model, given a token-to-vector model, a
reducer model to map the sequence of vectors for each span down to a single
vector, and a scorer model to map the vectors to probabilities.
tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
reducer (Model[Ragged, Floats2d]): The reducer model.
scorer (Model[Floats2d, Floats2d]): The scorer model.
"""
model = chain(
with_getitem(0, chain(tok2vec, list2ragged())),
extract_spans(),
reducer,
scorer,
)
model.set_ref("tok2vec", tok2vec)
model.set_ref("reducer", reducer)
model.set_ref("scorer", scorer)
return model

View File

@ -1,11 +1,13 @@
from functools import partial
from typing import Optional, List
from thinc.types import Floats2d
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import with_cpu, Relu, residual, LayerNorm
from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable
from thinc.layers.chain import init as init_chain
from thinc.layers.resizable import resize_model, resize_linear_weighted
from ...attrs import ORTH
from ...util import registry
@ -15,7 +17,10 @@ from ...tokens import Doc
from .tok2vec import get_tok2vec_width
@registry.architectures("spacy.TextCatCNN.v1")
NEG_VALUE = -5000
@registry.architectures("spacy.TextCatCNN.v2")
def build_simple_cnn_text_classifier(
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
@ -25,38 +30,75 @@ def build_simple_cnn_text_classifier(
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
fill_defaults = {"b": 0, "W": 0}
with Model.define_operators({">>": chain}):
cnn = tok2vec >> list2ragged() >> reduce_mean()
nI = tok2vec.maybe_get_dim("nO")
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = cnn >> output_layer
model.set_ref("output_layer", output_layer)
output_layer = Softmax(nO=nO, nI=nI)
fill_defaults["b"] = NEG_VALUE
resizable_layer = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
),
)
model = cnn >> resizable_layer
else:
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = cnn >> linear_layer >> Logistic()
model.set_ref("output_layer", linear_layer)
output_layer = Linear(nO=nO, nI=nI)
resizable_layer = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
),
)
model = cnn >> resizable_layer >> Logistic()
model.set_ref("output_layer", output_layer)
model.attrs["resize_output"] = partial(
resize_and_set_ref,
resizable_layer=resizable_layer,
)
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nO)
model.attrs["multi_label"] = not exclusive_classes
return model
@registry.architectures("spacy.TextCatBOW.v1")
def resize_and_set_ref(model, new_nO, resizable_layer):
resizable_layer = resize_model(resizable_layer, new_nO)
model.set_ref("output_layer", resizable_layer.layers[0])
model.set_dim("nO", new_nO, force=True)
return model
@registry.architectures("spacy.TextCatBOW.v2")
def build_bow_text_classifier(
exclusive_classes: bool,
ngram_size: int,
no_output_layer: bool,
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
fill_defaults = {"b": 0, "W": 0}
with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO)
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
model = with_cpu(model, model.ops)
sparse_linear = SparseLinear(nO=nO)
output_layer = None
if not no_output_layer:
fill_defaults["b"] = NEG_VALUE
output_layer = softmax_activation() if exclusive_classes else Logistic()
resizable_layer = resizable(
sparse_linear,
resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
)
model = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer
model = with_cpu(model, model.ops)
if output_layer:
model = model >> with_cpu(output_layer, output_layer.ops)
model.set_dim("nO", nO)
model.set_ref("output_layer", sparse_linear)
model.attrs["multi_label"] = not exclusive_classes
model.attrs["resize_output"] = partial(
resize_and_set_ref, resizable_layer=resizable_layer
)
return model
@ -69,9 +111,7 @@ def build_text_classifier_v2(
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.maybe_get_dim("nO")
attention_layer = ParametricAttention(
width
) # TODO: benchmark performance difference of this layer
attention_layer = ParametricAttention(width)
maxout_layer = Maxout(nO=width, nI=width)
norm_layer = LayerNorm(nI=width)
cnn_model = (

View File

@ -1,7 +1,9 @@
from thinc.api import Model, noop
from .parser_model import ParserStepModel
from ..util import registry
@registry.layers("spacy.TransitionModel.v1")
def TransitionModel(
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
@ -15,7 +17,7 @@ def TransitionModel(
return Model(
name="parser_model",
forward=forward,
dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None},
dims={"nI": tok2vec.maybe_get_dim("nI")},
layers=[tok2vec, lower, upper],
refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
init=init,

View File

@ -11,6 +11,7 @@ from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
from .tagger import Tagger
from .textcat import TextCategorizer
from .spancat import SpanCategorizer
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@ -27,6 +28,7 @@ __all__ = [
"Pipe",
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
"Tagger",
"TextCategorizer",
"Tok2Vec",

View File

@ -1,3 +1,5 @@
import os
import random
from libc.stdint cimport int32_t
from cymem.cymem cimport Pool
@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam
from ...tokens.doc cimport Doc
from ...tokens.span import Span
from ...tokens.span cimport Span
from ...typedefs cimport weight_t, attr_t
from ...lexeme cimport Lexeme
from ...attrs cimport IS_SPACE
from ...structs cimport TokenC
from ...structs cimport TokenC, SpanC
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC
@ -25,7 +28,6 @@ cdef enum:
LAST
UNIT
OUT
ISNT
N_MOVES
@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
MOVE_NAMES[ISNT] = 'x'
cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
cdef class BiluoGold:
cdef Pool mem
cdef GoldNERStateC c
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
self.mem = Pool()
self.c = create_gold_state(self.mem, moves, stcls.c, example)
self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)
def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)
cdef GoldNERStateC create_gold_state(
Pool mem,
BiluoPushDown moves,
const StateC* stcls,
Example example
Example example,
neg_key
) except *:
cdef GoldNERStateC gs
cdef Span neg
if neg_key is not None:
negs = example.get_aligned_spans_y2x(
example.y.spans.get(neg_key, []),
allow_overlap=True
)
else:
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
ner_tags = example.get_aligned_ner()
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)
# Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
for pos_span in ner_ents:
if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
return gs
@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem):
cdef attr_t label
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
# Deprecated, hacky way to denote 'not this entity'
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
raise ValueError(Errors.E869.format(label=name))
label = self.strings.add(label_str)
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
if move == ISNT:
return Transition(clas=0, move=ISNT, label=label, score=0)
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem):
label_id = label_name
if action == OUT and label_id != 0:
return None
if action == MISSING or action == ISNT:
if action == MISSING:
return None
# Check we're not creating a move we already have, so that this is
# idempotent
@ -247,7 +267,7 @@ cdef class BiluoPushDown(TransitionSystem):
for i in range(state.c._ents.size()):
ent = state.c._ents.at(i)
if ent.start != -1 and ent.end != -1:
ents.append(Span(doc, ent.start, ent.end, label=ent.label))
ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
doc.set_ents(ents, default="unmodified")
# Set non-blocked tokens to O
for i in range(doc.length):
@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem):
return parses
def init_gold(self, StateClass state, Example example):
return BiluoGold(self, state, example)
return BiluoGold(self, state, example, self.neg_key)
def has_gold(self, Example eg, start=0, end=None):
# We get x and y referring to X, we want to check relative to Y,
# the reference
y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
if not y_spans:
y_spans = [eg.y[:]]
y_span = y_spans[0]
start = y_span.start
end = y_span.end
neg_key = self.neg_key
if neg_key is not None:
# If we have any negative samples, count that as having annotation.
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True
@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem):
n_gold += costs[i] <= 0
else:
costs[i] = 9000
if n_gold < 1:
raise ValueError
cdef class Missing:
@ -373,23 +405,33 @@ cdef class Begin:
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
b0 = s.B(0)
cdef int cost = 0
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
if g_act == MISSING:
return 0
pass
elif g_act == BEGIN:
# B, Gold B --> Label match
return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
cost += label != g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
return 1
cost += 1
if s.buffer_length() < 3:
# Handle negatives. In general we can't really do much to block
# B, because we don't know whether the whole entity is going to
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
cost += 1
break
return cost
cdef class In:
@ -462,9 +504,6 @@ cdef class In:
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
@ -504,32 +543,41 @@ cdef class Last:
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
move = LAST
b0 = s.B(0)
ent_start = s.E(0)
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
cdef int cost = 0
if g_act == MISSING:
return 0
pass
elif g_act == BEGIN:
# L, Gold B --> True
return 0
pass
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
return not _entity_is_sunk(s, gold.ner)
cost += not _entity_is_sunk(s, gold.ner)
elif g_act == LAST:
# L, Gold L --> True
return 0
pass
elif g_act == OUT:
# L, Gold O --> True
return 0
pass
elif g_act == UNIT:
# L, Gold U --> True
return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
pass
else:
return 1
cost += 1
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cost += 1
break
return cost
cdef class Unit:
@ -573,21 +621,29 @@ cdef class Unit:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int cost = 0
if g_act == MISSING:
return 0
pass
elif g_act == UNIT:
# U, Gold U --> True iff tag match
return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
cost += label != g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
return 1
cost += 1
# If we have negative-example entities, integrate them into the objective.
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cost += 1
break
return cost
cdef class Out:
@ -613,25 +669,24 @@ cdef class Out:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0
cdef weight_t cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# O, Gold B --> False
return 1
cost += 1
elif g_act == IN:
# O, Gold I --> True
return 0
pass
elif g_act == LAST:
# O, Gold L --> True
return 0
pass
elif g_act == OUT:
# O, Gold O --> True
return 0
pass
elif g_act == UNIT:
# O, Gold U --> False
return 1
cost += 1
else:
return 1
cost += 1
return cost

View File

@ -41,6 +41,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label
cdef public freqs
cdef public object labels
cdef public object cfg
cdef init_state_t init_beam_state
cdef del_state_t del_beam_state

View File

@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
def __init__(
self,
StringStore string_table,
labels_by_action=None,
min_freq=None,
incorrect_spans_key=None
):
self.cfg = {"neg_key": incorrect_spans_key}
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@ -49,8 +56,13 @@ cdef class TransitionSystem:
self.del_beam_state = _del_state
def __reduce__(self):
# TODO: This loses the 'cfg'
return (self.__class__, (self.strings, self.labels), None, None)
@property
def neg_key(self):
return self.cfg.get("neg_key")
def init_batch(self, docs):
cdef StateClass state
states = []
@ -220,16 +232,21 @@ cdef class TransitionSystem:
transitions = []
serializers = {
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes()
'strings': lambda: self.strings.to_bytes(),
'cfg': lambda: self.cfg
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple()):
# We're adding a new field, 'cfg', here and we don't want to break
# previous models that don't have it.
msg = srsly.msgpack_loads(bytes_data)
labels = {}
deserializers = {
'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'moves' not in exclude:
labels.update(srsly.json_loads(msg['moves']))
if 'strings' not in exclude:
self.strings.from_bytes(msg['strings'])
if 'cfg' not in exclude and 'cfg' in msg:
self.cfg.update(msg['cfg'])
self.initialize_actions(labels)
return self

View File

@ -106,7 +106,7 @@ class AttributeRuler(Pipe):
def match(self, doc: Doc):
matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precendence
# Sort by the attribute ID, so that later rules have precedence
matches = [
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
]

View File

@ -3,6 +3,7 @@ from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.arc_eager cimport ArcEager
@ -59,7 +60,7 @@ def make_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int
@ -85,13 +86,13 @@ def make_parser(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
@ -112,6 +113,9 @@ def make_parser(
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
)
@Language.factory(
@ -140,7 +144,7 @@ def make_beam_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
@ -165,8 +169,13 @@ def make_beam_parser(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
@ -195,7 +204,10 @@ def make_beam_parser(
beam_update_prob=beam_update_prob,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
)
@ -206,6 +218,39 @@ cdef class DependencyParser(Parser):
"""
TransitionSystem = ArcEager
def __init__(
self,
vocab,
model,
name="parser",
moves=None,
*,
update_with_oracle_cut_size=100,
min_action_freq=30,
learn_tokens=False,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
):
"""Create a DependencyParser.
"""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=min_action_freq,
learn_tokens=learn_tokens,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
)
@property
def postprocesses(self):
output = [nonproj.deprojectivize]

View File

@ -9,7 +9,7 @@ import warnings
from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc
from ..tokens import Doc, Span
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
from ..language import Language
@ -67,7 +67,7 @@ def make_entity_linker(
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
):
"""Construct an EntityLinker component.
@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
) -> None:
"""Initialize an entity linker.
@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
DOCS: https://spacy.io/api/entitylinker#init
@ -142,7 +142,7 @@ class EntityLinker(TrainablePipe):
self.get_candidates = get_candidates
self.cfg = {}
self.distance = CosineDistance(normalize=False)
# how many neightbour sentences to take into account
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
@ -156,6 +156,8 @@ class EntityLinker(TrainablePipe):
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
@ -305,11 +307,9 @@ class EntityLinker(TrainablePipe):
sent = ent.sent
sent_index = sentences.index(sent)
assert sent_index >= 0
# get n_neightbour sentences, clipped to the length of the document
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(
len(sentences) - 1, sent_index + self.n_sents
)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
@ -335,22 +335,16 @@ class EntityLinker(TrainablePipe):
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray(
[c.prior_prob for c in candidates]
)
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray(
[0.0 for _ in candidates]
)
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(
entity_encodings, axis=1
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
@ -359,14 +353,12 @@ class EntityLinker(TrainablePipe):
)
)
# cosine similarity
sims = xp.dot(
entity_encodings, sentence_encoding_t
) / (sentence_norm * entity_norm)
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = (
prior_probs + sims - (prior_probs * sims)
)
scores = prior_probs + sims - (prior_probs * sims)
# TODO: thresholding
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
@ -408,6 +400,48 @@ class EntityLinker(TrainablePipe):
validate_examples(examples, "EntityLinker.score")
return Scorer.score_links(examples, negative_labels=[self.NIL])
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/entitylinker#to_bytes
"""
self._validate_serialization_attrs()
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
serialize["vocab"] = self.vocab.to_bytes
serialize["kb"] = self.kb.to_bytes
serialize["model"] = self.model.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the pipe from a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (TrainablePipe): The loaded object.
DOCS: https://spacy.io/api/entitylinker#from_bytes
"""
self._validate_serialization_attrs()
def load_model(b):
try:
self.model.from_bytes(b)
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:

View File

@ -141,7 +141,9 @@ class EntityRuler(Pipe):
def match(self, doc: Doc):
self._require_patterns()
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W036")
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end]
)
@ -275,9 +277,7 @@ class EntityRuler(Pipe):
if self == pipe:
current_index = i
break
subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
]
subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
except ValueError:
subsequent_pipes = []
with self.nlp.select_pipes(disable=subsequent_pipes):
@ -298,7 +298,7 @@ class EntityRuler(Pipe):
self.nlp.pipe(phrase_pattern_texts),
phrase_pattern_ids,
):
phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id}
phrase_pattern = {"label": label, "pattern": pattern}
if ent_id:
phrase_pattern["id"] = ent_id
phrase_patterns.append(phrase_pattern)

View File

@ -3,6 +3,7 @@ from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@ -48,8 +50,9 @@ def make_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
@ -67,13 +70,16 @@ def make_ner(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
"""
return EntityRecognizer(
nlp.vocab,
@ -81,9 +87,8 @@ def make_ner(
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
incorrect_spans_key=incorrect_spans_key,
multitasks=[],
min_action_freq=1,
learn_tokens=False,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
@ -98,7 +103,8 @@ def make_ner(
"model": DEFAULT_NER_MODEL,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32
"beam_width": 32,
"incorrect_spans_key": None
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
@ -106,11 +112,12 @@ def make_beam_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str]=None
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
@ -128,13 +135,13 @@ def make_beam_ner(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
@ -144,6 +151,8 @@ def make_beam_ner(
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities.
"""
return EntityRecognizer(
nlp.vocab,
@ -152,11 +161,10 @@ def make_beam_ner(
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
min_action_freq=1,
learn_tokens=False,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key
)
@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser):
"""
TransitionSystem = BiluoPushDown
def __init__(
self,
vocab,
model,
name="ner",
moves=None,
*,
update_with_oracle_cut_size=100,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
):
"""Create an EntityRecognizer.
"""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=1, # not relevant for NER
learn_tokens=False, # not relevant for NER
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
)
def add_multitask_objective(self, mt_component):
"""Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)
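A sketch of the new `incorrect_spans_key` option: spans stored on the reference doc under that key are treated as known-incorrect entities during training. The key name `non_entities` is illustrative.

import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "non_entities"})
ner.add_label("ORG")
doc = nlp.make_doc("I flew with Delta last week")
eg = Example.from_dict(doc, {"entities": []})
# Tell the transition system that "Delta" is known NOT to be an ORG entity:
eg.reference.spans["non_entities"] = [Span(eg.reference, 3, 4, label="ORG")]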

423 spacy/pipeline/spancat.py Normal file
View File

@ -0,0 +1,423 @@
import numpy
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d
from ..scorer import Scorer
from ..language import Language
from .trainable_pipe import TrainablePipe
from ..tokens import Doc, SpanGroup, Span
from ..vocab import Vocab
from ..training import Example, validate_examples
from ..errors import Errors
from ..util import registry
spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"
scorer = {"@layers": "spacy.LinearLogistic.v1"}
[model.reducer]
@layers = spacy.mean_max_reducer.v1
hidden_size = 128
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
"""
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
@registry.misc("spacy.ngram_suggester.v1")
def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
"""Suggest all spans of the given lengths. Spans are returned as a ragged
array of integers. The array has two columns, indicating the start and end
position."""
def ngram_suggester(docs: List[Doc], *, ops: Optional[Ops] = None) -> Ragged:
if ops is None:
ops = get_current_ops()
spans = []
lengths = []
for doc in docs:
starts = ops.xp.arange(len(doc), dtype="i")
starts = starts.reshape((-1, 1))
length = 0
for size in sizes:
if size <= len(doc):
starts_size = starts[: len(doc) - (size - 1)]
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
length += spans[-1].shape[0]
if spans:
assert spans[-1].ndim == 2, spans[-1].shape
lengths.append(length)
if len(spans) > 0:
output = Ragged(ops.xp.vstack(spans), ops.asarray(lengths, dtype="i"))
else:
output = Ragged(ops.xp.zeros((0, 0)), ops.asarray(lengths, dtype="i"))
assert output.dataXd.ndim == 2
return output
return ngram_suggester
@registry.misc("spacy.ngram_range_suggester.v1")
def build_ngram_range_suggester(
min_size: int, max_size: int
) -> Callable[[List[Doc]], Ragged]:
"""Suggest all spans of the given lengths between a given min and max value - both inclusive.
Spans are returned as a ragged array of integers. The array has two columns,
indicating the start and end position."""
sizes = range(min_size, max_size + 1)
return build_ngram_suggester(sizes)
@Language.factory(
"spancat",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"spans_key": "sc",
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
def make_spancat(
nlp: Language,
name: str,
suggester: Callable[[List[Doc]], Ragged],
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
threshold: float = 0.5,
max_positive: Optional[int] = None,
) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
model that predicts one or more labels for each span.
suggester (Callable[List[Doc], Ragged]): A function that suggests spans.
Spans are returned as a ragged array with two integer columns, for the
start and end positions.
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
is given a list of documents and (start, end) indices representing
candidate span offsets. The model predicts a probability for each category
for each span.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit.
"""
return SpanCategorizer(
nlp.vocab,
suggester=suggester,
model=model,
spans_key=spans_key,
threshold=threshold,
max_positive=max_positive,
name=name,
)
class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text.
DOCS: https://spacy.io/api/spancategorizer
"""
def __init__(
self,
vocab: Vocab,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
suggester: Callable[[List[Doc]], Ragged],
name: str = "spancat",
*,
spans_key: str = "spans",
threshold: float = 0.5,
max_positive: Optional[int] = None,
) -> None:
"""Initialize the span categorizer.
DOCS: https://spacy.io/api/spancategorizer#init
"""
self.cfg = {
"labels": [],
"spans_key": spans_key,
"threshold": threshold,
"max_positive": max_positive,
}
self.vocab = vocab
self.suggester = suggester
self.model = model
self.name = name
@property
def key(self) -> str:
"""Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
"""
return self.cfg["spans_key"]
def add_label(self, label: str) -> int:
"""Add a new label to the pipe.
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/spancategorizer#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
self._allow_extra_label()
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1
@property
def labels(self) -> Tuple[str]:
"""RETURNS (Tuple[str]): The labels currently added to the component.
DOCS: https://spacy.io/api/spancategorizer#labels
"""
return tuple(self.cfg["labels"])
@property
def label_data(self) -> List[str]:
"""RETURNS (List[str]): Information about the component's labels.
DOCS: https://spacy.io/api/spancategorizer#label_data
"""
return list(self.labels)
def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
RETURNS: The model's prediction for each document.
DOCS: https://spacy.io/api/spancategorizer#predict
"""
indices = self.suggester(docs, ops=self.model.ops)
scores = self.model.predict((docs, indices))
return (indices, scores)
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by SpanCategorizer.predict.
DOCS: https://spacy.io/api/spancategorizer#set_annotations
"""
labels = self.labels
indices, scores = indices_scores
offset = 0
for i, doc in enumerate(docs):
indices_i = indices[i].dataXd
doc.spans[self.key] = self._make_span_group(
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels
)
offset += indices.lengths[i]
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/spancategorizer#update
"""
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_examples(examples, "SpanCategorizer.update")
self._validate_categories(examples)
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return losses
docs = [eg.predicted for eg in examples]
spans = self.suggester(docs, ops=self.model.ops)
if spans.lengths.sum() == 0:
return losses
set_dropout_rate(self.model, drop)
scores, backprop_scores = self.model.begin_update((docs, spans))
loss, d_scores = self.get_loss(examples, (spans, scores))
backprop_scores(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_loss(
self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Ragged]
) -> Tuple[float, float]:
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
examples (Iterable[Example]): The batch of examples.
spans_scores: Scores representing the model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/spancategorizer#get_loss
"""
spans, scores = spans_scores
spans = Ragged(
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
)
label_map = {label: i for i, label in enumerate(self.labels)}
target = numpy.zeros(scores.shape, dtype=scores.dtype)
offset = 0
for i, eg in enumerate(examples):
# Map (start, end) offset of spans to the row in the d_scores array,
# so that we can adjust the gradient for predictions that were
# in the gold standard.
spans_index = {}
spans_i = spans[i].dataXd
for j in range(spans.lengths[i]):
start = int(spans_i[j, 0])
end = int(spans_i[j, 1])
spans_index[(start, end)] = offset + j
for gold_span in self._get_aligned_spans(eg):
key = (gold_span.start, gold_span.end)
if key in spans_index:
row = spans_index[key]
k = label_map[gold_span.label_]
target[row, k] = 1.0
# The target is a flat array for all docs. Track the position
# we're at within the flat array.
offset += spans.lengths[i]
target = self.model.ops.asarray(target, dtype="f")
# The target will have the values 0 (for untrue predictions) or 1
# (for true predictions).
# The scores should be in the range [0, 1].
# If the prediction is 0.9 and it's true, the gradient
# will be -0.1 (0.9 - 1.0).
# If the prediction is 0.9 and it's false, the gradient will be
# 0.9 (0.9 - 0.0)
d_scores = scores - target
loss = float((d_scores ** 2).sum())
return loss, d_scores
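
To make the gradient concrete, a tiny worked example in plain NumPy, mirroring the comments above rather than calling the component itself:

# Worked example of d_scores = scores - target for two candidate spans and two labels.
import numpy

scores = numpy.asarray([[0.9, 0.2],   # span 1: high score for label 0, low for label 1
                        [0.4, 0.7]])  # span 2
target = numpy.asarray([[1.0, 0.0],   # span 1 truly has label 0
                        [0.0, 0.0]])  # span 2 matches no gold span
d_scores = scores - target
loss = float((d_scores ** 2).sum())
# d_scores[0, 0] == -0.1 (confident and correct: small gradient)
# d_scores[1, 1] ==  0.7 (fairly confident but wrong: larger gradient)
# loss == 0.01 + 0.04 + 0.16 + 0.49 == 0.70
print(d_scores, loss)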

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ) -> None:
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels: The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

        DOCS: https://spacy.io/api/spancategorizer#initialize
        """
        subbatch = []
        if labels is not None:
            for label in labels:
                self.add_label(label)
        for eg in get_examples():
            if labels is None:
                for span in eg.reference.spans.get(self.key, []):
                    self.add_label(span.label_)
            if len(subbatch) < 10:
                subbatch.append(eg)
        self._require_labels()
        if subbatch:
            docs = [eg.x for eg in subbatch]
            spans = self.suggester(docs)
            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
            self.model.initialize(X=(docs, spans), Y=Y)
        else:
            self.model.initialize()
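
A hedged sketch of the two initialization paths: passing labels explicitly (as `python -m spacy init labels` would generate them) versus letting the component collect them from the gold spans. The data and label name are illustrative.

# Sketch only: initializing a fresh spancat component two different ways.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat")

predicted = nlp.make_doc("I live in Berlin.")
reference = nlp.make_doc("I live in Berlin.")
reference.spans["sc"] = [reference.char_span(10, 16, label="CITY")]  # illustrative gold span
examples = [Example(predicted, reference)]

# Path 1: labels provided up front.
spancat.initialize(lambda: examples, nlp=nlp, labels=["CITY"])
# Path 2 (alternative): omit `labels` and extract them from the examples:
#     spancat.initialize(lambda: examples, nlp=nlp)
print(spancat.labels)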

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.

        DOCS: https://spacy.io/api/spancategorizer#score
        """
        validate_examples(examples, "SpanCategorizer.score")
        self._validate_categories(examples)
        kwargs = dict(kwargs)
        attr_prefix = "spans_"
        kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
        kwargs.setdefault("labels", self.labels)
        kwargs.setdefault("multi_label", True)
        kwargs.setdefault("threshold", self.cfg["threshold"])
        kwargs.setdefault(
            "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
        )
        kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
        return Scorer.score_spans(examples, **kwargs)
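
A small sketch of what scoring looks like from the outside. Since `attr` defaults to "spans_" plus the component's key, a component keyed "sc" should report its metrics under names such as "spans_sc_p", "spans_sc_r" and "spans_sc_f"; those key names are inferred from the prefix logic above, so treat them as an assumption. The component here is untrained, so the numbers are trivial.

# Sketch: scoring a batch of examples against an (untrained) spancat component.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat")

predicted = nlp.make_doc("I live in Berlin.")
reference = nlp.make_doc("I live in Berlin.")
reference.spans["sc"] = [reference.char_span(10, 16, label="CITY")]
examples = [Example(predicted, reference)]
nlp.initialize(get_examples=lambda: examples)

scores = spancat.score(examples)
# Expected keys given the prefix logic above, e.g. "spans_sc_p", "spans_sc_r", "spans_sc_f".
print(sorted(scores))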

    def _validate_categories(self, examples):
        # TODO
        pass

    def _get_aligned_spans(self, eg: Example):
        return eg.get_aligned_spans_y2x(eg.reference.spans.get(self.key, []))

    def _make_span_group(
        self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
    ) -> SpanGroup:
        spans = SpanGroup(doc, name=self.key)
        max_positive = self.cfg["max_positive"]
        threshold = self.cfg["threshold"]
        for i in range(indices.shape[0]):
            start = int(indices[i, 0])
            end = int(indices[i, 1])
            positives = []
            for j, score in enumerate(scores[i]):
                if score >= threshold:
                    positives.append((score, start, end, labels[j]))
            positives.sort(reverse=True)
            if max_positive:
                positives = positives[:max_positive]
            for score, start, end, label in positives:
                spans.append(Span(doc, start, end, label=label))
        return spans
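
To illustrate the selection logic above (threshold first, then max_positive) without the rest of the pipeline, here is a small standalone re-enactment; the numbers and labels are made up.

# Standalone illustration of the threshold / max_positive filtering above.
threshold = 0.5
max_positive = 1
labels = ["CITY", "COUNTRY"]
scores_for_one_span = [0.8, 0.6]  # both labels clear the threshold

positives = [
    (score, labels[j]) for j, score in enumerate(scores_for_one_span) if score >= threshold
]
positives.sort(reverse=True)
if max_positive:
    positives = positives[:max_positive]
# Only the best-scoring label survives: [(0.8, 'CITY')]
print(positives)
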

View File

@ -222,7 +222,7 @@ class Tagger(TrainablePipe):
        DOCS: https://spacy.io/api/tagger#get_loss
        """
        validate_examples(examples, "Tagger.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix="!")
        # Convert empty tag "" to missing value None so that both misaligned
        # tokens and tokens with missing annotation have the default missing
        # value None.
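
The only change in this hunk is the neg_prefix="!" argument. As a hedged reading, based on thinc's SequenceCategoricalCrossentropy rather than on anything else shown in this diff, it lets a gold tag written as "!VERB" act as an explicit negative example ("this token is not VERB") instead of a regular label. A minimal sketch, with made-up label names and scores:

# Sketch: how the loss treats a "!"-prefixed tag as a negative constraint.
from thinc.api import SequenceCategoricalCrossentropy
import numpy

loss_func = SequenceCategoricalCrossentropy(
    names=["NOUN", "VERB"], normalize=False, neg_prefix="!"
)
guesses = [numpy.asarray([[0.3, 0.7]], dtype="f")]  # one token, leaning towards VERB
truths = [["!VERB"]]  # gold annotation: this token is not VERB; NOUN remains unknown
d_scores, loss = loss_func(guesses, truths)
print(d_scores, loss)
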

View File

@ -35,7 +35,7 @@ maxout_pieces = 3
depth = 2
[model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
@ -44,7 +44,7 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
single_label_bow_config = """
[model]
@architectures = "spacy.TextCatBOW.v1"
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
@ -52,7 +52,7 @@ no_output_layer = false
single_label_cnn_config = """
[model]
@architectures = "spacy.TextCatCNN.v1"
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = true
[model.tok2vec]
@ -298,6 +298,10 @@ class TextCategorizer(TrainablePipe):
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        if self.model and "resize_output" in self.model.attrs:
            self.model = self.model.attrs["resize_output"](
                self.model, len(self.cfg["labels"])
            )
        self.vocab.strings.add(label)
        return 1
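
Together with the TextCatBOW/TextCatCNN .v1 to .v2 bumps above, this hunk lets add_label grow an already-built model through its resize_output attribute. A hedged sketch of the user-visible effect, reusing the bow settings from the config shown earlier; the label names are illustrative.

# Sketch: adding a label after initialization resizes the output layer of a .v2 textcat model.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
config = {
    "model": {
        "@architectures": "spacy.TextCatBOW.v2",
        "exclusive_classes": True,
        "ngram_size": 1,
        "no_output_layer": False,
    }
}
textcat = nlp.add_pipe("textcat", config=config)

examples = [
    Example.from_dict(nlp.make_doc("so good"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("so bad"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
nlp.initialize(get_examples=lambda: examples)

# With resize_output available on the model, this succeeds instead of raising.
textcat.add_label("NEUTRAL")
print(textcat.labels)
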
@ -332,6 +336,8 @@ class TextCategorizer(TrainablePipe):
        else:
            for label in labels:
                self.add_label(label)
        if len(self.labels) < 2:
            raise ValueError(Errors.E867)
        if positive_label is not None:
            if positive_label not in self.labels:
                err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
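
The new E867 check makes the requirement explicit: a text categorizer needs at least two labels before training. A hedged sketch of what now fails fast; the error code comes from the hunk above, while the exact message text is not shown in this diff.

# Sketch: initializing textcat with only one label now raises E867 during initialize().
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
examples = [Example.from_dict(nlp.make_doc("so good"), {"cats": {"POSITIVE": 1.0}})]
try:
    nlp.initialize(get_examples=lambda: examples)  # only one label found -> ValueError
except ValueError as err:
    print(err)
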

Some files were not shown because too many files have changed in this diff.