mirror of https://github.com/explosion/spaCy.git (synced 2025-01-28 10:14:07 +03:00)

commit 065e94a6eb: Merge branch 'master' into spacy.io

Changed files:
.github/ISSUE_TEMPLATE.md (deleted, 18 lines)
@@ -1,18 +0,0 @@
<!--- Please provide a summary in the title and describe your issue here.
Is this a bug or feature request? If a bug, include all the steps that led to the issue.

If you're looking for help with your code, consider posting a question here:

- GitHub Discussions: https://github.com/explosion/spaCy/discussions
- Stack Overflow: http://stackoverflow.com/questions/tagged/spacy
-->

## Your Environment

<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type
`python -m spacy info --markdown` and copy-paste the result here.-->

- Operating System:
- Python Version Used:
- spaCy Version Used:
- Environment Information:
.github/ISSUE_TEMPLATE/01_bugs.md (4 changed lines)
@@ -1,6 +1,6 @@
 ---
-name: "\U0001F6A8 Bug Report"
-about: Did you come across a bug or unexpected behaviour differing from the docs?
+name: "\U0001F6A8 Submit a Bug Report"
+about: Use this template if you came across a bug or unexpected behaviour differing from the docs.

 ---
.github/ISSUE_TEMPLATE/02_docs.md (2 changed lines)
@@ -1,5 +1,5 @@
 ---
-name: "\U0001F4DA Documentation"
+name: "\U0001F4DA Submit a Documentation Report"
 about: Did you spot a mistake in the docs, is anything unclear or do you have a
   suggestion?
.github/ISSUE_TEMPLATE/03_other.md (deleted, 19 lines)
@@ -1,19 +0,0 @@
---
name: "\U0001F4AC Anything else?"
about: For feature and project ideas, general usage questions or help with your code, please post on the GitHub Discussions board instead.
---

<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and enhancements. If you're looking for help with your code, consider posting a question here:

- GitHub Discussions: https://github.com/explosion/spaCy/discussions
- Stack Overflow: http://stackoverflow.com/questions/tagged/spacy
-->

## Your Environment

<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->

- Operating System:
- Python Version Used:
- spaCy Version Used:
- Environment Information:
.github/ISSUE_TEMPLATE/config.yml (new file, 14 lines)
@@ -0,0 +1,14 @@
blank_issues_enabled: false
contact_links:
  - name: 🗯 Discussions Forum
    url: https://github.com/explosion/spaCy/discussions
    about: Usage questions, general discussion and anything else that isn't a bug report.
  - name: 📖 spaCy FAQ & Troubleshooting
    url: https://github.com/explosion/spaCy/discussions/8226
    about: Before you post, check out the FAQ for answers to common community questions!
  - name: 💫 spaCy Usage Guides & API reference
    url: https://spacy.io/usage
    about: Everything you need to know about spaCy and how to use it.
  - name: 🛠 Submit a Pull Request
    url: https://github.com/explosion/spaCy/pulls
    about: Did you spot a mistake and know how to fix it? Feel free to submit a PR straight away!
.github/azure-steps.yml (47 changed lines)
@@ -11,6 +11,10 @@ steps:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}

+  - bash: |
+      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
+    displayName: 'Set variables'
+
   - script: |
       ${{ parameters.prefix }} python -m pip install -U pip setuptools
       ${{ parameters.prefix }} python -m pip install -U -r requirements.txt

@@ -41,7 +45,7 @@ steps:
     displayName: "Install test requirements"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
       ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
     displayName: "Install GPU requirements"
     condition: eq(${{ parameters.gpu }}, true)

@@ -55,3 +59,44 @@ steps:
       ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l ca ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
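The two assemble steps above pack a useful pattern into a `python -c` one-liner: sourcing a trained component into a freshly generated config before calling `spacy assemble`. The following is a minimal local sketch of that same sequence, assuming `ner.cfg` was produced by `python -m spacy init config -p ner -l ca ner.cfg` and that `ca_core_news_sm` has been downloaded as in the earlier step:

import spacy

# Load the generated training config and swap the "ner" component for one
# sourced from an installed trained pipeline instead of training from scratch.
config = spacy.util.load_config("ner.cfg")
config["components"]["ner"] = {"source": "ca_core_news_sm"}
config.to_disk("ner_source_sm.cfg")

# `python -m spacy assemble ner_source_sm.cfg output_dir` then builds the
# pipeline from this config without a training run.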
.github/contributors/ZeeD.md (new file, 106 lines)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry          |
| ------------------------------ | -------------- |
| Name                           | Vito De Tullio |
| Company name (if applicable)   |                |
| Title or role (if applicable)  |                |
| Date                           | 2021-06-01     |
| GitHub username                | ZeeD           |
| Website (optional)             |                |
.github/contributors/gtoffoli.md (new file, 106 lines)
Same standard spaCy contributor agreement text as in ZeeD.md above, except that
the contracting entity is named as [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal);
signed as an individual, with these contributor details:

| Field                          | Entry            |
| ------------------------------ | ---------------- |
| Name                           | Giovanni Toffoli |
| Company name (if applicable)   |                  |
| Title or role (if applicable)  |                  |
| Date                           | 2021-05-12       |
| GitHub username                | gtoffoli         |
| Website (optional)             |                  |
.github/contributors/narayanacharya6.md (new file, 106 lines)
Same standard spaCy contributor agreement text as in ZeeD.md above (naming
ExplosionAI GmbH); signed as an individual, with these contributor details:

| Field                          | Entry              |
| ------------------------------ | ------------------ |
| Name                           | Narayan Acharya    |
| Company name (if applicable)   |                    |
| Title or role (if applicable)  |                    |
| Date                           | 29 APR 2021        |
| GitHub username                | narayanacharya6    |
| Website (optional)             | narayanacharya.com |
.github/contributors/sevdimali.md (new file, 106 lines)
Same standard spaCy contributor agreement text as in ZeeD.md above, except that
the contracting entity is named as [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal);
signed as an individual, with these contributor details:

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Sevdimali            |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 10/4/2021            |
| GitHub username                | sevdimali            |
| Website (optional)             | https://sevdimali.me |
.github/contributors/xadrianzetx.md (new file, 106 lines)
Same standard spaCy contributor agreement text as in ZeeD.md above (naming
ExplosionAI GmbH); signed as an individual, with these contributor details:

| Field                          | Entry        |
| ------------------------------ | ------------ |
| Name                           | Adrian Zuber |
| Company name (if applicable)   |              |
| Title or role (if applicable)  |              |
| Date                           | 20-06-2021   |
| GitHub username                | xadrianzetx  |
| Website (optional)             |              |
.github/contributors/yohasebe.md (new file, 106 lines)
Same standard spaCy contributor agreement text as in ZeeD.md above (naming
ExplosionAI GmbH); signed as an individual, with these contributor details:

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Yoichiro Hasebe      |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | July 4th, 2021       |
| GitHub username                | yohasebe             |
| Website (optional)             | https://yohasebe.com |
.github/workflows/autoblack.yml (new file, 43 lines)
@@ -0,0 +1,43 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack

name: autoblack
on:
  workflow_dispatch:  # allow manual trigger
  schedule:
    - cron: '0 8 * * 5'  # every Friday at 8am UTC

jobs:
  autoblack:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ github.head_ref }}
      - uses: actions/setup-python@v2
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
      # We can't run black --check here because that returns a non-zero exit
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
        uses: peter-evans/create-pull-request@v3
        with:
          title: Auto-format code with black
          labels: meta
          commit-message: Auto-format code with black
          committer: GitHub <noreply@github.com>
          author: explosion-bot <explosion-bot@users.noreply.github.com>
          body: _This PR is auto-generated._
          branch: autoblack
          delete-branch: true
          draft: false
      - name: Check outputs
        if: steps.git-check.outputs.modified == 'true'
        run: |
          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
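The comment in the middle of this workflow ("We can't run black --check here...") reflects how black signals its result: `--check` exits non-zero whenever any file would be reformatted, and a non-zero exit status is exactly what marks an Actions step as failed. A small Python sketch of the alternative, interpreting the exit code directly instead of relying on `git diff-index` as the workflow does (assuming black is installed and this is run from the repository root):

import subprocess

# Run black in check-only mode: it exits 0 when nothing would change and
# non-zero (normally 1) when files would be reformatted, writing its report
# to stderr. Inspecting the return code ourselves keeps the step "green".
result = subprocess.run(["black", "--check", "spacy"], capture_output=True, text=True)
if result.returncode == 0:
    print("spacy/ is already formatted")
else:
    print("black would reformat some files:")
    print(result.stderr)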
@@ -2,11 +2,7 @@
 # Contribute to spaCy

-Thanks for your interest in contributing to spaCy 🎉 The project is maintained
-by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**,
-and we'll do our best to help you get started. This page will give you a quick
+Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
 overview of how things are organized and most importantly, how to get involved.

 ## Table of contents
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp
Makefile (2 changed lines)
@@ -1,7 +1,7 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.0 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
 endif

 ifndef PYVER
README.md (10 changed lines)
@@ -61,11 +61,11 @@ open-source software, released under the MIT license.
 ## 💬 Where to ask questions

 The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
-be able to provide individual support via email. We also believe that help is
-much more valuable if it's shared publicly, so that more people can benefit from
-it.
+**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
+**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+Please understand that we won't be able to provide individual support via email.
+We also believe that help is much more valuable if it's shared publicly, so that
+more people can benefit from it.

 | Type | Platforms |
 | ------------------------------- | --------------------------------------- |
@@ -22,13 +22,13 @@ jobs:
   # defined in .flake8 and overwrites the selected codes.
   - job: "Validate"
     pool:
-      vmImage: "ubuntu-16.04"
+      vmImage: "ubuntu-18.04"
     steps:
       - task: UsePythonVersion@0
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.5.0
+          pip install flake8==3.9.2
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
         displayName: "flake8"

@@ -38,7 +38,7 @@ jobs:
     matrix:
       # We're only running one platform per Python version to speed up builds
       Python36Linux:
-        imageName: "ubuntu-16.04"
+        imageName: "ubuntu-18.04"
         python.version: "3.6"
       # Python36Windows:
       #   imageName: "vs2017-win2016"

@@ -47,7 +47,7 @@ jobs:
       #   imageName: "macos-10.14"
       #   python.version: "3.6"
       # Python37Linux:
-      #   imageName: "ubuntu-16.04"
+      #   imageName: "ubuntu-18.04"
      #   python.version: "3.7"
       Python37Windows:
         imageName: "vs2017-win2016"

@@ -56,7 +56,7 @@ jobs:
      #   imageName: "macos-10.14"
      #   python.version: "3.7"
      # Python38Linux:
-     #   imageName: "ubuntu-16.04"
+     #   imageName: "ubuntu-18.04"
      #   python.version: "3.8"
      # Python38Windows:
      #   imageName: "vs2017-win2016"

@@ -65,7 +65,7 @@ jobs:
         imageName: "macos-10.14"
         python.version: "3.8"
       Python39Linux:
-        imageName: "ubuntu-16.04"
+        imageName: "ubuntu-18.04"
         python.version: "3.9"
       Python39Windows:
         imageName: "vs2017-win2016"

@@ -82,18 +82,18 @@ jobs:
       python_version: '$(python.version)'
       architecture: 'x64'

-  - job: "TestGPU"
-    dependsOn: "Validate"
-    strategy:
-      matrix:
-        Python38LinuxX64_GPU:
-          python.version: '3.8'
-    pool:
-      name: "LinuxX64_GPU"
-    steps:
-      - template: .github/azure-steps.yml
-        parameters:
-          python_version: '$(python.version)'
-          architecture: 'x64'
-          gpu: true
-          num_build_jobs: 24
+  # - job: "TestGPU"
+  #   dependsOn: "Validate"
+  #   strategy:
+  #     matrix:
+  #       Python38LinuxX64_GPU:
+  #         python.version: '3.8'
+  #   pool:
+  #     name: "LinuxX64_GPU"
+  #   steps:
+  #     - template: .github/azure-steps.yml
+  #       parameters:
+  #         python_version: '$(python.version)'
+  #         architecture: 'x64'
+  #         gpu: true
+  #         num_build_jobs: 24
@@ -43,8 +43,8 @@ scikit-learn

 * Files: scorer.py

-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:

 New BSD License

@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
+
+
+pyvi
+----
+
+* Files: lang/vi/__init__.py
+
+The MIT License (MIT)
+Copyright (c) 2016 Viet-Trung Tran
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@ -1,11 +1,11 @@
|
||||||
 [build-system]
 requires = [
     "setuptools",
-    "cython>=0.25",
+    "cython>=0.25,<3.0",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.3,<8.1.0",
+    "thinc>=8.0.7,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",

@@ -1,30 +1,30 @@
 # Our libraries
-spacy-legacy>=3.0.4,<3.1.0
+spacy-legacy>=3.0.7,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.3,<8.1.0
+thinc>=8.0.7,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.1,<1.1.0
 srsly>=2.4.1,<3.0.0
-catalogue>=2.0.3,<2.1.0
+catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.1,<1.8.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
 jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0
 typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
 # Development dependencies
-cython>=0.25
+cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.5.0,<3.6.0
+flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0

43  setup.cfg

@@ -22,37 +22,40 @@ classifiers =
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Topic :: Scientific/Engineering
+project_urls =
+    Release notes = https://github.com/explosion/spaCy/releases
+    Source = https://github.com/explosion/spaCy

 [options]
 zip_safe = false
 include_package_data = true
 python_requires = >=3.6
 setup_requires =
-    cython>=0.25
+    cython>=0.25,<3.0
     numpy>=1.15.0
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.3,<8.1.0
+    thinc>=8.0.7,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.4,<3.1.0
+    spacy-legacy>=3.0.7,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.3,<8.1.0
+    thinc>=8.0.7,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
-    catalogue>=2.0.3,<2.1.0
+    catalogue>=2.0.4,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.1,<1.8.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
     jinja2
     # Official Python utilities
     setuptools
@@ -61,37 +64,37 @@ install_requires =

 [options.entry_points]
 console_scripts =
-    spacy = spacy.cli:app
+    spacy = spacy.cli:setup_cli

 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0,<1.1.0
+    spacy_lookups_data>=1.0.2,<1.1.0
 transformers =
     spacy_transformers>=1.0.1,<1.1.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<9.0.0
+    cupy>=5.0.0b4,<10.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<9.0.0
+    cupy-cuda80>=5.0.0b4,<10.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<9.0.0
+    cupy-cuda90>=5.0.0b4,<10.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<9.0.0
+    cupy-cuda91>=5.0.0b4,<10.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<9.0.0
+    cupy-cuda92>=5.0.0b4,<10.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<9.0.0
+    cupy-cuda100>=5.0.0b4,<10.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<9.0.0
+    cupy-cuda101>=5.0.0b4,<10.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<9.0.0
+    cupy-cuda102>=5.0.0b4,<10.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<9.0.0
+    cupy-cuda110>=5.0.0b4,<10.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<9.0.0
+    cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<9.0.0
+    cupy-cuda112>=5.0.0b4,<10.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
@@ -108,7 +111,7 @@ universal = false
 formats = gztar

 [flake8]
-ignore = E203, E266, E501, E731, W503, E741
+ignore = E203, E266, E501, E731, W503, E741, F541
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =

@@ -1,10 +1,11 @@
 from typing import Union, Iterable, Dict, Any
 from pathlib import Path
-import warnings
 import sys

-warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
-warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa
+# set library-specific custom warning handling before doing anything else
+from .errors import setup_default_warnings
+
+setup_default_warnings()

 # These are imported as part of the API
 from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.1.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

@@ -74,7 +74,6 @@ IDS = {
     "SUFFIX": SUFFIX,

     "LENGTH": LENGTH,
-    "CLUSTER": CLUSTER,
     "LEMMA": LEMMA,
     "POS": POS,
     "TAG": TAG,
@@ -85,9 +84,7 @@ IDS = {
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,
-    "SENT_END": SENT_END,
     "SPACY": SPACY,
-    "PROB": PROB,
     "LANG": LANG,
     "MORPH": MORPH,
     "IDX": IDX

@@ -2,7 +2,7 @@ from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECK
 import sys
 import shutil
 from pathlib import Path
-from wasabi import msg
+from wasabi import msg, Printer
 import srsly
 import hashlib
 import typer
@@ -504,12 +504,16 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
     return result


-def setup_gpu(use_gpu: int) -> None:
+def setup_gpu(use_gpu: int, silent=None) -> None:
     """Configure the GPU and log info."""
+    if silent is None:
+        local_msg = Printer()
+    else:
+        local_msg = Printer(no_print=silent, pretty=not silent)
     if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
+        local_msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
     else:
-        msg.info("Using CPU")
+        local_msg.info("Using CPU")
         if has_cupy and gpu_is_available():
-            msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+            local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
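A minimal usage sketch for the `setup_gpu(use_gpu, silent=None)` helper reworked in the hunk above. It is illustrative only and not part of the commit; the import path and the `-1` CPU fallback follow the code shown in the diff.

```python
# Illustrative sketch only: exercising the new silent flag on setup_gpu().
from spacy.cli._util import setup_gpu

setup_gpu(-1)               # prints "Using CPU" through a default Printer
setup_gpu(-1, silent=True)  # same logic, but the Printer output is suppressed
```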
@@ -6,7 +6,6 @@ import logging

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
-from ..training.initialize import init_nlp
 from .. import util
 from ..util import get_sourced_components, load_model_from_config


@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
@@ -173,8 +173,9 @@ def debug_data(
         )
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
-            "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
+            "{} words in training data without vectors ({:.0f}%)".format(
+                n_missing_vectors,
+                100 * (n_missing_vectors / gold_train_data["n_words"]),
             ),
         )
         msg.text(
@@ -282,42 +283,7 @@ def debug_data(
         labels = _get_labels_from_model(nlp, "textcat")
         msg.info(f"Text Classification: {len(labels)} label(s)")
         msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
-        labels_with_counts = _format_labels(
-            gold_train_data["cats"].most_common(), counts=True
-        )
-        msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
-        missing_labels = labels - set(gold_train_data["cats"].keys())
-        if missing_labels:
-            msg.warn(
-                "Some model labels are not present in the train data. The "
-                "model performance may be degraded for these labels after "
-                f"training: {_format_labels(missing_labels)}."
-            )
-        if gold_train_data["n_cats_multilabel"] > 0:
-            # Note: you should never get here because you run into E895 on
-            # initialization first.
-            msg.warn(
-                "The train data contains instances without "
-                "mutually-exclusive classes. Use the component "
-                "'textcat_multilabel' instead of 'textcat'."
-            )
-        if gold_dev_data["n_cats_multilabel"] > 0:
-            msg.fail(
-                "Train/dev mismatch: the dev data contains instances "
-                "without mutually-exclusive classes while the train data "
-                "contains only instances with mutually-exclusive classes."
-            )
-
-    if "textcat_multilabel" in factory_names:
-        msg.divider("Text Classification (Multilabel)")
-        labels = _get_labels_from_model(nlp, "textcat_multilabel")
-        msg.info(f"Text Classification: {len(labels)} label(s)")
-        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
-        labels_with_counts = _format_labels(
-            gold_train_data["cats"].most_common(), counts=True
-        )
-        msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
-        missing_labels = labels - set(gold_train_data["cats"].keys())
+        missing_labels = labels - set(gold_train_data["cats"])
         if missing_labels:
             msg.warn(
                 "Some model labels are not present in the train data. The "
@@ -325,17 +291,76 @@ def debug_data(
                 f"training: {_format_labels(missing_labels)}."
             )
         if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
-            msg.fail(
-                f"The train and dev labels are not the same. "
+            msg.warn(
+                "Potential train/dev mismatch: the train and dev labels are "
+                "not the same. "
                 f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                 f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
             )
+        if len(labels) < 2:
+            msg.fail(
+                "The model does not have enough labels. 'textcat' requires at "
+                "least two labels due to mutually-exclusive classes, e.g. "
+                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
+                "classification task."
+            )
+        if (
+            gold_train_data["n_cats_bad_values"] > 0
+            or gold_dev_data["n_cats_bad_values"] > 0
+        ):
+            msg.fail(
+                "Unsupported values for cats: the supported values are "
+                "1.0/True and 0.0/False."
+            )
+        if gold_train_data["n_cats_multilabel"] > 0:
+            # Note: you should never get here because you run into E895 on
+            # initialization first.
+            msg.fail(
+                "The train data contains instances without mutually-exclusive "
+                "classes. Use the component 'textcat_multilabel' instead of "
+                "'textcat'."
+            )
+        if gold_dev_data["n_cats_multilabel"] > 0:
+            msg.fail(
+                "The dev data contains instances without mutually-exclusive "
+                "classes. Use the component 'textcat_multilabel' instead of "
+                "'textcat'."
+            )
+
+    if "textcat_multilabel" in factory_names:
+        msg.divider("Text Classification (Multilabel)")
+        labels = _get_labels_from_model(nlp, "textcat_multilabel")
+        msg.info(f"Text Classification: {len(labels)} label(s)")
+        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
+        missing_labels = labels - set(gold_train_data["cats"])
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
+        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
+            msg.warn(
+                "Potential train/dev mismatch: the train and dev labels are "
+                "not the same. "
+                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
+                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
+            )
+        if (
+            gold_train_data["n_cats_bad_values"] > 0
+            or gold_dev_data["n_cats_bad_values"] > 0
+        ):
+            msg.fail(
+                "Unsupported values for cats: the supported values are "
+                "1.0/True and 0.0/False."
+            )
         if gold_train_data["n_cats_multilabel"] > 0:
             if gold_dev_data["n_cats_multilabel"] == 0:
                 msg.warn(
                     "Potential train/dev mismatch: the train data contains "
                     "instances without mutually-exclusive classes while the "
-                    "dev data does not."
+                    "dev data contains only instances with mutually-exclusive "
+                    "classes."
                 )
             else:
                 msg.warn(
@@ -556,6 +581,7 @@ def _compile_gold(
         "n_nonproj": 0,
         "n_cycles": 0,
         "n_cats_multilabel": 0,
+        "n_cats_bad_values": 0,
        "texts": set(),
     }
     for eg in examples:
@@ -599,7 +625,9 @@ def _compile_gold(
                 data["ner"]["-"] += 1
         if "textcat" in factory_names or "textcat_multilabel" in factory_names:
             data["cats"].update(gold.cats)
-            if list(gold.cats.values()).count(1.0) != 1:
+            if any(val not in (0, 1) for val in gold.cats.values()):
+                data["n_cats_bad_values"] += 1
+            if list(gold.cats.values()).count(1) != 1:
                 data["n_cats_multilabel"] += 1
         if "tagger" in factory_names:
             tags = eg.get_aligned("TAG", as_string=True)
@@ -1,10 +1,11 @@
-from typing import Dict, Any, Optional, Iterable
+from typing import Dict, Any, Optional
 from pathlib import Path
+import itertools

 from spacy.training import Example
 from spacy.util import resolve_dot_names
 from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

@@ -73,23 +74,24 @@ def debug_model_cli(
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
     pipe = nlp.get_pipe(component)
-    if not hasattr(pipe, "model"):
-        msg.fail(
-            f"The component '{component}' does not specify an object that holds a Model.",
-            exits=1,
-        )
-    model = pipe.model
-    debug_model(config, T, nlp, model, print_settings=print_settings)
+    debug_model(config, T, nlp, pipe, print_settings=print_settings)


 def debug_model(
     config,
     resolved_train_config,
     nlp,
-    model: Model,
+    pipe,
     *,
     print_settings: Optional[Dict[str, Any]] = None,
 ):
+    if not hasattr(pipe, "model"):
+        msg.fail(
+            f"The component '{pipe}' does not specify an object that holds a Model.",
+            exits=1,
+        )
+    model = pipe.model
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -105,8 +107,6 @@ def debug_model(
         _print_model(model, print_settings)

     # STEP 1: Initializing the model and printing again
-    X = _get_docs()
-    # The output vector might differ from the official type of the output layer
     with data_validation(False):
         try:
             dot_names = [resolved_train_config["train_corpus"]]
@@ -114,15 +114,17 @@
             (train_corpus,) = resolve_dot_names(config, dot_names)
             nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
+            examples = list(itertools.islice(train_corpus(nlp), 5))
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
                 with show_validation_error():
-                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
+                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
+                    nlp.initialize(lambda: examples)
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
-                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                     exits=1,
                 )

@@ -131,28 +133,26 @@
         _print_model(model, print_settings)

     # STEP 2: Updating the model and printing again
-    optimizer = Adam(0.001)
     set_dropout_rate(model, 0.2)
-    # ugly hack to deal with Tok2Vec listeners
-    tok2vec = None
-    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
-        tok2vec = nlp.get_pipe("tok2vec")
-    goldY = None
+    # ugly hack to deal with Tok2Vec/Transformer listeners
+    upstream_component = None
+    if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
+        upstream_component = nlp.get_pipe("tok2vec")
+    if (
+        model.has_ref("tok2vec")
+        and "transformer-listener" in model.get_ref("tok2vec").name
+    ):
+        upstream_component = nlp.get_pipe("transformer")
     for e in range(3):
-        if tok2vec:
-            tok2vec.update([Example.from_dict(x, {}) for x in X])
-        Y, get_dX = model.begin_update(X)
-        if goldY is None:
-            goldY = _simulate_gold(Y)
-        dY = get_gradient(goldY, Y, model.ops)
-        get_dX(dY)
-        model.finish_update(optimizer)
+        if upstream_component:
+            upstream_component.update(examples)
+        pipe.update(examples)
         if print_settings.get("print_after_training"):
             msg.divider(f"STEP 2 - after training")
             _print_model(model, print_settings)

     # STEP 3: the final prediction
-    prediction = model.predict(X)
+    prediction = model.predict([ex.predicted for ex in examples])
     if print_settings.get("print_prediction"):
         msg.divider(f"STEP 3 - prediction")
         msg.info(str(prediction))
@@ -160,19 +160,6 @@
         msg.good(f"Succesfully ended analysis - model looks good.")


-def get_gradient(goldY, Y, ops):
-    return ops.asarray(Y) - ops.asarray(goldY)
-
-
-def _simulate_gold(element, counter=1):
-    if isinstance(element, Iterable):
-        for i in range(len(element)):
-            element[i] = _simulate_gold(element[i], counter + i)
-        return element
-    else:
-        return 1 / counter
-
-
 def _sentences():
     return [
         "Apple is looking at buying U.K. startup for $1 billion",
@@ -209,11 +196,7 @@ def _print_model(model, print_settings):

         if dimensions:
             for name in node.dim_names:
-                if node.has_dim(name):
-                    msg.info(f" - dim {name}: {node.get_dim(name)}")
-                else:
-                    msg.info(f" - dim {name}: {node.has_dim(name)}")
+                msg.info(f" - dim {name}: {node.maybe_get_dim(name)}")

         if parameters:
             for name in node.param_names:
                 if node.has_param(name):
@@ -6,7 +6,7 @@ import typer

 from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_base_version, run_command
+from ..util import is_package, get_minor_version, run_command
 from ..errors import OLD_MODEL_SHORTCUTS


@@ -74,7 +74,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -


 def get_compatibility() -> dict:
-    version = get_base_version(about.__version__)
+    version = get_minor_version(about.__version__)
     r = requests.get(about.__compatibility__)
     if r.status_code != 200:
         msg.fail(
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Any, Union
 from wasabi import Printer
 from pathlib import Path
 import re
@@ -60,10 +60,11 @@ def evaluate(
     displacy_path: Optional[Path] = None,
     displacy_limit: int = 25,
     silent: bool = True,
-) -> Scorer:
+    spans_key: str = "sc",
+) -> Dict[str, Any]:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    setup_gpu(use_gpu)
+    setup_gpu(use_gpu, silent=silent)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
@@ -90,6 +91,9 @@ def evaluate(
         "SENT P": "sents_p",
         "SENT R": "sents_r",
         "SENT F": "sents_f",
+        "SPAN P": f"spans_{spans_key}_p",
+        "SPAN R": f"spans_{spans_key}_r",
+        "SPAN F": f"spans_{spans_key}_f",
         "SPEED": "speed",
     }
     results = {}
@@ -108,27 +112,7 @@ def evaluate(
             data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]

     msg.table(results, title="Results")
-
-    if "morph_per_feat" in scores:
-        if scores["morph_per_feat"]:
-            print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
-            data["morph_per_feat"] = scores["morph_per_feat"]
-    if "dep_las_per_type" in scores:
-        if scores["dep_las_per_type"]:
-            print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
-            data["dep_las_per_type"] = scores["dep_las_per_type"]
-    if "ents_per_type" in scores:
-        if scores["ents_per_type"]:
-            print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
-            data["ents_per_type"] = scores["ents_per_type"]
-    if "cats_f_per_type" in scores:
-        if scores["cats_f_per_type"]:
-            print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
-            data["cats_f_per_type"] = scores["cats_f_per_type"]
-    if "cats_auc_per_type" in scores:
-        if scores["cats_auc_per_type"]:
-            print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
-            data["cats_auc_per_type"] = scores["cats_auc_per_type"]
+    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)

     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@@ -151,6 +135,43 @@ def evaluate(
     return data


+def handle_scores_per_type(
+    scores: Union[Scorer, Dict[str, Any]],
+    data: Dict[str, Any] = {},
+    *,
+    spans_key: str = "sc",
+    silent: bool = False,
+) -> Dict[str, Any]:
+    msg = Printer(no_print=silent, pretty=not silent)
+    if "morph_per_feat" in scores:
+        if scores["morph_per_feat"]:
+            print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
+            data["morph_per_feat"] = scores["morph_per_feat"]
+    if "dep_las_per_type" in scores:
+        if scores["dep_las_per_type"]:
+            print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
+            data["dep_las_per_type"] = scores["dep_las_per_type"]
+    if "ents_per_type" in scores:
+        if scores["ents_per_type"]:
+            print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
+            data["ents_per_type"] = scores["ents_per_type"]
+    if f"spans_{spans_key}_per_type" in scores:
+        if scores[f"spans_{spans_key}_per_type"]:
+            print_prf_per_type(
+                msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type"
+            )
+            data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"]
+    if "cats_f_per_type" in scores:
+        if scores["cats_f_per_type"]:
+            print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
+            data["cats_f_per_type"] = scores["cats_f_per_type"]
+    if "cats_auc_per_type" in scores:
+        if scores["cats_auc_per_type"]:
+            print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
+            data["cats_auc_per_type"] = scores["cats_auc_per_type"]
+    return scores
+
+
 def render_parses(
     docs: List[Doc],
     output_path: Path,
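A minimal sketch of how the reworked `evaluate()` with the new `spans_key` argument might be called from Python, following the signature shown in the hunks above. It is illustrative only: "my_model" and "dev.spacy" are placeholder names, and the positional model/data arguments are assumed to keep their existing order.

```python
# Illustrative sketch only; not part of this commit.
# evaluate() now returns a plain dict of scores and accepts spans_key.
from spacy.cli.evaluate import evaluate

scores = evaluate("my_model", "dev.spacy", spans_key="sc", silent=True)
print(scores.get("spans_sc_f"))  # span F-score lives under the spans_{key}_f key
```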
@@ -108,6 +108,10 @@ def init_labels_cli(
     config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
         nlp = init_nlp(config, use_gpu=use_gpu)
+    _init_labels(nlp, output_path)
+
+
+def _init_labels(nlp, output_path):
     for name, component in nlp.pipeline:
         if getattr(component, "label_data", None) is not None:
             output_file = output_path / f"{name}.json"
@@ -1,7 +1,7 @@
 from typing import Optional, Union, Any, Dict, List, Tuple
 import shutil
 from pathlib import Path
-from wasabi import Printer, get_raw_input
+from wasabi import Printer, MarkdownRenderer, get_raw_input
 import srsly
 import sys

@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
@@ -112,7 +112,9 @@ def package(
         msg.fail("Invalid pipeline meta.json")
         print("\n".join(errors))
         sys.exit(1)
-    model_name = meta["lang"] + "_" + meta["name"]
+    model_name = meta["name"]
+    if not model_name.startswith(meta["lang"] + "_"):
+        model_name = f"{meta['lang']}_{model_name}"
     model_name_v = model_name + "-" + meta["version"]
     main_path = output_dir / model_name_v
     package_path = main_path / model_name
@@ -128,9 +130,15 @@ def package(
         )
     Path.mkdir(package_path, parents=True)
     shutil.copytree(str(input_dir), str(package_path / model_name_v))
-    license_path = package_path / model_name_v / "LICENSE"
-    if license_path.exists():
-        shutil.move(str(license_path), str(main_path))
+    for file_name in FILENAMES_DOCS:
+        file_path = package_path / model_name_v / file_name
+        if file_path.exists():
+            shutil.copy(str(file_path), str(main_path))
+    readme_path = main_path / "README.md"
+    if not readme_path.exists():
+        readme = generate_readme(meta)
+        create_file(readme_path, readme)
+        create_file(package_path / model_name_v / "README.md", readme)
     imports = []
     for code_path in code_paths:
         imports.append(code_path.stem)
@@ -231,6 +239,113 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
     return meta


+def generate_readme(meta: Dict[str, Any]) -> str:
+    """
+    Generate a Markdown-formatted README text from a model meta.json. Used
+    within the GitHub release notes and as content for README.md file added
+    to model packages.
+    """
+    md = MarkdownRenderer()
+    lang = meta["lang"]
+    name = f"{lang}_{meta['name']}"
+    version = meta["version"]
+    pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])])
+    components = ", ".join([md.code(p) for p in meta.get("components", [])])
+    vecs = meta.get("vectors", {})
+    vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)"
+    author = meta.get("author") or "n/a"
+    notes = meta.get("notes", "")
+    license_name = meta.get("license")
+    sources = _format_sources(meta.get("sources"))
+    description = meta.get("description")
+    label_scheme = _format_label_scheme(meta.get("labels"))
+    accuracy = _format_accuracy(meta.get("performance"))
+    table_data = [
+        (md.bold("Name"), md.code(name)),
+        (md.bold("Version"), md.code(version)),
+        (md.bold("spaCy"), md.code(meta["spacy_version"])),
+        (md.bold("Default Pipeline"), pipeline),
+        (md.bold("Components"), components),
+        (md.bold("Vectors"), vectors),
+        (md.bold("Sources"), sources or "n/a"),
+        (md.bold("License"), md.code(license_name) if license_name else "n/a"),
+        (md.bold("Author"), md.link(author, meta["url"]) if "url" in meta else author),
+    ]
+    # Put together Markdown body
+    if description:
+        md.add(description)
+    md.add(md.table(table_data, ["Feature", "Description"]))
+    if label_scheme:
+        md.add(md.title(3, "Label Scheme"))
+        md.add(label_scheme)
+    if accuracy:
+        md.add(md.title(3, "Accuracy"))
+        md.add(accuracy)
+    if notes:
+        md.add(notes)
+    return md.text
+
+
+def _format_sources(data: Any) -> str:
+    if not data or not isinstance(data, list):
+        return "n/a"
+    sources = []
+    for source in data:
+        if not isinstance(source, dict):
+            source = {"name": source}
+        name = source.get("name")
+        if not name:
+            continue
+        url = source.get("url")
+        author = source.get("author")
+        result = name if not url else "[{}]({})".format(name, url)
+        if author:
+            result += " ({})".format(author)
+        sources.append(result)
+    return "<br />".join(sources)
+
+
+def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str:
+    if not data:
+        return ""
+    md = MarkdownRenderer()
+    scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))]
+    scores = [
+        (md.code(acc.upper()), f"{score*100:.2f}")
+        for acc, score in scalars
+        if acc not in exclude
+    ]
+    md.add(md.table(scores, ["Type", "Score"]))
+    return md.text
+
+
+def _format_label_scheme(data: Dict[str, Any]) -> str:
+    if not data:
+        return ""
+    md = MarkdownRenderer()
+    n_labels = 0
+    n_pipes = 0
+    label_data = []
+    for pipe, labels in data.items():
+        if not labels:
+            continue
+        col1 = md.bold(md.code(pipe))
+        col2 = ", ".join(
+            [md.code(label.replace("|", "\\|")) for label in labels]
+        )  # noqa: W605
+        label_data.append((col1, col2))
+        n_labels += len(labels)
+        n_pipes += 1
+    if not label_data:
+        return ""
+    label_info = f"View label scheme ({n_labels} labels for {n_pipes} components)"
+    md.add("<details>")
+    md.add(f"<summary>{label_info}</summary>")
+    md.add(md.table(label_data, ["Component", "Labels"]))
+    md.add("</details>")
+    return md.text
+
+
 TEMPLATE_SETUP = """
 #!/usr/bin/env python
 import io
@@ -245,6 +360,13 @@ def load_meta(fp):
         return json.load(f)


+def load_readme(fp):
+    if path.exists(fp):
+        with io.open(fp, encoding='utf8') as f:
+            return f.read()
+    return ""
+
+
 def list_files(data_dir):
     output = []
     for root, _, filenames in walk(data_dir):
@@ -270,6 +392,8 @@ def setup_package():
     root = path.abspath(path.dirname(__file__))
     meta_path = path.join(root, 'meta.json')
     meta = load_meta(meta_path)
+    readme_path = path.join(root, 'README.md')
+    readme = load_readme(readme_path)
     model_name = str(meta['lang'] + '_' + meta['name'])
     model_dir = path.join(model_name, model_name + '-' + meta['version'])

@@ -279,6 +403,7 @@ def setup_package():
     setup(
         name=model_name,
         description=meta.get('description'),
+        long_description=readme,
         author=meta.get('author'),
         author_email=meta.get('email'),
         url=meta.get('url'),
@@ -294,12 +419,14 @@ def setup_package():

 if __name__ == '__main__':
     setup_package()
-""".strip()
+""".lstrip()


 TEMPLATE_MANIFEST = """
 include meta.json
 include LICENSE
+include LICENSES_SOURCES
+include README.md
 """.strip()


@@ -314,4 +441,7 @@ __version__ = get_model_meta(Path(__file__).parent)['version']

 def load(**overrides):
     return load_model_from_init_py(__file__, **overrides)
-""".strip()
+""".lstrip()
+
+
+FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"]
@@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
             "then the new directory will be created for you.",
         )
     if resume_path is not None:
+        if resume_path.is_dir():
+            # This is necessary because Windows gives a Permission Denied when we
+            # try to open the directory later, which is confusing. See #7878
+            msg.fail(
+                "--resume-path should be a weights file, but {resume_path} is a directory.",
+                exits=True,
+            )
         model_name = re.search(r"model\d+\.bin", str(resume_path))
         if not model_name and not epoch_resume:
             msg.fail(
@@ -151,14 +151,14 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -182,14 +182,14 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"

 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false
@@ -316,14 +316,14 @@ nO = null
 width = ${components.tok2vec.model.encode.width}

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -344,14 +344,14 @@ nO = null
 width = ${components.tok2vec.model.encode.width}

 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false
@@ -372,7 +372,7 @@ factory = "{{ pipe }}"
 [corpora.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
-max_length = {{ 500 if hardware == "gpu" else 2000 }}
+max_length = 0

 [corpora.dev]
 @readers = "spacy.Corpus.v1"
@@ -418,7 +418,7 @@ compound = 1.001

 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
+vectors = ${paths.vectors}
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}
@@ -28,7 +28,7 @@ def train_cli(
     """
     Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
     convert data from other formats, use the `spacy convert` command. The
-    config file includes all settings and hyperparameters used during traing.
+    config file includes all settings and hyperparameters used during training.
     To override settings in the config, e.g. settings that point to local
     paths or that you want to experiment with, you can override them as
     command line options. For instance, --training.batch_size 128 overrides
@@ -3,10 +3,11 @@ from pathlib import Path
 import sys
 import requests
 from wasabi import msg, Printer
+import warnings

 from ._util import app
 from .. import about
-from ..util import get_package_version, get_installed_models, get_base_version
+from ..util import get_package_version, get_installed_models, get_minor_version
 from ..util import get_package_path, get_model_meta, is_compatible_version


@@ -24,7 +25,7 @@ def validate_cli():

 def validate() -> None:
     model_pkgs, compat = get_model_pkgs()
-    spacy_version = get_base_version(about.__version__)
+    spacy_version = get_minor_version(about.__version__)
     current_compat = compat.get(spacy_version, {})
     if not current_compat:
         msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
@@ -44,8 +45,8 @@ def validate() -> None:
                 comp = msg.text("", color="green", icon="good", no_print=True)
                 version = msg.text(data["version"], color="green", no_print=True)
             else:
-                version = msg.text(data["version"], color="red", no_print=True)
-                comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
+                version = msg.text(data["version"], color="yellow", no_print=True)
+                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
             rows.append((data["name"], data["spacy"], version, comp))
         msg.table(rows, header=header)
     else:
@@ -78,7 +79,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
         msg.good("Loaded compatibility table")
     compat = r.json()["spacy"]
     all_models = set()
-    installed_models = get_installed_models()
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="\\[W09[45]")
+        installed_models = get_installed_models()
     for spacy_v, models in dict(compat).items():
         all_models.update(models.keys())
         for model, model_vs in models.items():
@@ -92,7 +95,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
             spacy_version = about.__version__
         else:
             model_path = get_package_path(package)
-            model_meta = get_model_meta(model_path)
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="\\[W09[45]")
+                model_meta = get_model_meta(model_path)
             spacy_version = model_meta.get("spacy_version", "n/a")
         is_compat = is_compatible_version(about.__version__, spacy_version)
         pkgs[pkg_name] = {
@@ -80,6 +80,8 @@ eval_frequency = 200
 score_weights = {}
 # Names of pipeline components that shouldn't be updated during training
 frozen_components = []
+# Names of pipeline components that should set annotations during training
+annotating_components = []
 # Location in the config where the dev corpus is defined
 dev_corpus = "corpora.dev"
 # Location in the config where the train corpus is defined
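A small sketch of how the new `annotating_components` setting added to the default config above could be enabled programmatically. It is illustrative only: "config.cfg" is a placeholder path and "sentencizer" is just an example of a component that sets annotations during training.

```python
# Illustrative sketch only; not part of this commit.
from spacy import util

config = util.load_config("config.cfg")
# let an earlier component set annotations that later components see in training
config["training"]["annotating_components"] = ["sentencizer"]
config.to_disk("config.cfg")
```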
@@ -120,7 +120,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"]))
+    doc = Doc(orig_doc.vocab).from_bytes(
+        orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
+    )
     if not doc.has_annotation("DEP"):
         warnings.warn(Warnings.W005)
     if options.get("collapse_phrases", False):
@@ -1,3 +1,6 @@
+import warnings
+
+
 def add_codes(err_cls):
     """Add error codes to string messages via class attribute names."""

@@ -12,6 +15,33 @@ def add_codes(err_cls):
     return ErrorsWithCodes()


+def setup_default_warnings():
+    # ignore certain numpy warnings
+    filter_warning("ignore", error_msg="numpy.dtype size changed")  # noqa
+    filter_warning("ignore", error_msg="numpy.ufunc size changed")  # noqa
+
+    # warn about entity_ruler & matcher having no patterns only once
+    for pipe in ["matcher", "entity_ruler"]:
+        filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
+
+    # warn once about lemmatizer without required POS
+    filter_warning("once", error_msg="[W108]")
+
+
+def filter_warning(action: str, error_msg: str):
+    """Customize how spaCy should handle a certain warning.
+
+    error_msg (str): e.g. "W006", or a full error message
+    action (str): "default", "error", "ignore", "always", "module" or "once"
+    """
+    warnings.filterwarnings(action, message=_escape_warning_msg(error_msg))
+
+
+def _escape_warning_msg(msg):
+    """To filter with warnings.filterwarnings, the [] brackets need to be escaped"""
+    return msg.replace("[", "\\[").replace("]", "\\]")
+
+
 # fmt: off

 @add_codes

@@ -80,8 +110,9 @@ class Warnings:
             "@misc = \"spacy.LookupsDataLoader.v1\"\n"
             "lang = ${{nlp.lang}}\n"
             "tables = [\"lexeme_norm\"]\n")
-    W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
+    W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
            "attribute or operator.")
+    W036 = ("The component '{name}' does not have any patterns defined.")

     # New warnings added in v3.x
     W086 = ("Component '{listener}' will be (re)trained, but it needs the component "

@@ -119,12 +150,12 @@ class Warnings:
            "released, because the model may say it's compatible when it's "
            'not. Consider changing the "spacy_version" in your meta.json to a '
            "version range, with a lower and upper pin. For example: {example}")
-    W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
-            "incompatible with the current version ({current}). This may lead "
-            "to unexpected results or runtime errors. To resolve this, "
-            "download a newer compatible model or retrain your custom model "
-            "with the current spaCy version. For more details and available "
-            "updates, run: python -m spacy validate")
+    W095 = ("Model '{model}' ({model_version}) was trained with spaCy "
+            "{version} and may not be 100% compatible with the current version "
+            "({current}). If you see errors or degraded performance, download "
+            "a newer compatible model or retrain your custom model with the "
+            "current spaCy version. For more details and available updates, "
+            "run: python -m spacy validate")
     W096 = ("The method `nlp.disable_pipes` is now deprecated - use "
            "`nlp.select_pipes` instead.")
     W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "

@@ -375,21 +406,10 @@ class Errors:
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
            "This is likely a bug in spaCy, so feel free to open an issue.")
-    E129 = ("Cannot write the label of an existing Span object because a Span "
-            "is a read-only view of the underlying Token objects stored in the "
-            "Doc. Instead, create a new Span object and specify the `label` "
-            "keyword argument, for example:\nfrom spacy.tokens import Span\n"
-            "span = Span(doc, start={start}, end={end}, label='{label}')")
     E130 = ("You are running a narrow unicode build, which is incompatible "
            "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
            "unicode build instead. You can also rebuild Python and set the "
            "`--enable-unicode=ucs4 flag`.")
-    E131 = ("Cannot write the kb_id of an existing Span object because a Span "
-            "is a read-only view of the underlying Token objects stored in "
-            "the Doc. Instead, create a new Span object and specify the "
-            "`kb_id` keyword argument, for example:\nfrom spacy.tokens "
-            "import Span\nspan = Span(doc, start={start}, end={end}, "
-            "label='{label}', kb_id='{kb_id}')")
     E132 = ("The vectors for entities and probabilities for alias '{alias}' "
            "should have equal length, but found {entities_length} and "
            "{probabilities_length} respectively.")

@@ -501,6 +521,24 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
+    E868 = ("Found a conflicting gold annotation in a reference document, "
+            "with the following char-based span occurring both in the gold ents "
+            "as well as in the negative spans: {span}.")
+    E869 = ("The notation '{label}' is not supported anymore. To annotate "
+            "negative NER samples, use `doc.spans[key]` instead, and "
+            "specify the key as 'incorrect_spans_key' when constructing "
+            "the NER component.")
+    E870 = ("Could not serialize the DocBin because it is too large. Consider "
+            "splitting up your documents into several doc bins and serializing "
+            "each separately. spacy.Corpus.v1 will search recursively for all "
+            "*.spacy files if you provide a directory instead of a filename as "
+            "the 'path'.")
+    E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
     E872 = ("Unable to copy tokenizer from base model due to different "
            'tokenizer settings: current tokenizer config "{curr_config}" '
            'vs. base model "{base_config}"')

@@ -820,6 +858,12 @@ class Errors:
            "DependencyMatcher token patterns. The token pattern in "
            "RIGHT_ATTR should return matches that are each exactly one token "
            "long. Invalid pattern:\n{node}")
+    E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
+             "parses. If no dependency labels are available, provide "
+             "placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
+    E1018 = ("Knowledge base for component '{name}' is not set. "
+             "Make sure either `nel.initialize` or `nel.set_kb` "
+             "is called with a `kb_loader` function.")


 # Deprecated model shortcuts, only used in errors and warnings
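Note: a minimal usage sketch of the new warning-filter helpers, assuming `filter_warning` is importable from `spacy.errors` as added above; it is illustrative only and not part of the diff. It escalates the W095 model-compatibility warning to an error (useful in CI) and silences the W036 empty-pattern warning.

    import warnings  # noqa: F401 – spaCy's helper wraps warnings.filterwarnings
    from spacy.errors import filter_warning

    # Match on the "[Wxxx]" code prefix; the brackets are escaped internally.
    filter_warning("error", error_msg="[W095]")
    filter_warning("ignore", error_msg="[W036]")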
@@ -58,7 +58,7 @@ GLOSSARY = {
     "FW": "foreign word",
     "HYPH": "punctuation mark, hyphen",
     "IN": "conjunction, subordinating or preposition",
-    "JJ": "adjective",
+    "JJ": "adjective (English), other noun-modifier (Chinese)",
     "JJR": "adjective, comparative",
     "JJS": "adjective, superlative",
     "LS": "list item marker",

@@ -88,7 +88,7 @@ GLOSSARY = {
     "WP": "wh-pronoun, personal",
     "WP$": "wh-pronoun, possessive",
     "WRB": "wh-adverb",
-    "SP": "space",
+    "SP": "space (English), sentence-final particle (Chinese)",
     "ADD": "email",
     "NFP": "superfluous punctuation",
     "GW": "additional word in multi-word expression",

@@ -152,6 +152,40 @@ GLOSSARY = {
     "VVIZU": 'infinitive with "zu", full',
     "VVPP": "perfect participle, full",
     "XY": "non-word containing non-letter",
+    # POS Tags (Chinese)
+    # OntoNotes / Chinese Penn Treebank
+    # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
+    "AD": "adverb",
+    "AS": "aspect marker",
+    "BA": "把 in ba-construction",
+    # "CD": "cardinal number",
+    "CS": "subordinating conjunction",
+    "DEC": "的 in a relative clause",
+    "DEG": "associative 的",
+    "DER": "得 in V-de const. and V-de-R",
+    "DEV": "地 before VP",
+    "ETC": "for words 等, 等等",
+    # "FW": "foreign words"
+    "IJ": "interjection",
+    # "JJ": "other noun-modifier",
+    "LB": "被 in long bei-const",
+    "LC": "localizer",
+    "M": "measure word",
+    "MSP": "other particle",
+    # "NN": "common noun",
+    "NR": "proper noun",
+    "NT": "temporal noun",
+    "OD": "ordinal number",
+    "ON": "onomatopoeia",
+    "P": "preposition excluding 把 and 被",
+    "PN": "pronoun",
+    "PU": "punctuation",
+    "SB": "被 in short bei-const",
+    # "SP": "sentence-final particle",
+    "VA": "predicative adjective",
+    "VC": "是 (copula)",
+    "VE": "有 as the main verb",
+    "VV": "other verb",
     # Noun chunks
     "NP": "noun phrase",
     "PP": "prepositional phrase",
@@ -28,7 +28,7 @@ cdef class Candidate:

 cdef class KnowledgeBase:
     cdef Pool mem
-    cpdef readonly Vocab vocab
+    cdef readonly Vocab vocab
     cdef int64_t entity_vector_length

     # This maps 64bit keys (hash of unique entity string)

spacy/kb.pyx (115 lines)

@@ -93,6 +93,15 @@ cdef class KnowledgeBase:
         self.vocab = vocab
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])

+    def initialize_entities(self, int64_t nr_entities):
+        self._entry_index = PreshMap(nr_entities + 1)
+        self._entries = entry_vec(nr_entities + 1)
+        self._vectors_table = float_matrix(nr_entities + 1)
+
+    def initialize_aliases(self, int64_t nr_aliases):
+        self._alias_index = PreshMap(nr_aliases + 1)
+        self._aliases_table = alias_vec(nr_aliases + 1)
+
     @property
     def entity_vector_length(self):
         """RETURNS (uint64): length of the entity vectors"""

@@ -144,8 +153,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E140)

         nr_entities = len(set(entity_list))
-        self._entry_index = PreshMap(nr_entities+1)
-        self._entries = entry_vec(nr_entities+1)
+        self.initialize_entities(nr_entities)

         i = 0
         cdef KBEntryC entry

@@ -325,6 +333,102 @@ cdef class KnowledgeBase:

         return 0.0

+    def to_bytes(self, **kwargs):
+        """Serialize the current state to a binary string.
+        """
+        def serialize_header():
+            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+            return srsly.json_dumps(header)
+
+        def serialize_entries():
+            i = 1
+            tuples = []
+            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+                entry = self._entries[entry_index]
+                assert entry.entity_hash == entry_hash
+                assert entry_index == i
+                tuples.append((entry.entity_hash, entry.freq, entry.vector_index))
+                i = i + 1
+            return srsly.json_dumps(tuples)
+
+        def serialize_aliases():
+            i = 1
+            headers = []
+            indices_lists = []
+            probs_lists = []
+            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+                alias = self._aliases_table[alias_index]
+                assert alias_index == i
+                candidate_length = len(alias.entry_indices)
+                headers.append((alias_hash, candidate_length))
+                indices_lists.append(alias.entry_indices)
+                probs_lists.append(alias.probs)
+                i = i + 1
+            headers_dump = srsly.json_dumps(headers)
+            indices_dump = srsly.json_dumps(indices_lists)
+            probs_dump = srsly.json_dumps(probs_lists)
+            return srsly.json_dumps((headers_dump, indices_dump, probs_dump))
+
+        serializers = {
+            "header": serialize_header,
+            "entity_vectors": lambda: srsly.json_dumps(self._vectors_table),
+            "entries": serialize_entries,
+            "aliases": serialize_aliases,
+        }
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, bytes_data, *, exclude=tuple()):
+        """Load state from a binary string.
+        """
+        def deserialize_header(b):
+            header = srsly.json_loads(b)
+            nr_entities = header[0]
+            nr_aliases = header[1]
+            entity_vector_length = header[2]
+            self.initialize_entities(nr_entities)
+            self.initialize_aliases(nr_aliases)
+            self.entity_vector_length = entity_vector_length
+
+        def deserialize_vectors(b):
+            self._vectors_table = srsly.json_loads(b)
+
+        def deserialize_entries(b):
+            cdef KBEntryC entry
+            tuples = srsly.json_loads(b)
+            i = 1
+            for (entity_hash, freq, vector_index) in tuples:
+                entry.entity_hash = entity_hash
+                entry.freq = freq
+                entry.vector_index = vector_index
+                entry.feats_row = -1  # Features table currently not implemented
+                self._entries[i] = entry
+                self._entry_index[entity_hash] = i
+                i += 1
+
+        def deserialize_aliases(b):
+            cdef AliasC alias
+            i = 1
+            all_data = srsly.json_loads(b)
+            headers = srsly.json_loads(all_data[0])
+            indices = srsly.json_loads(all_data[1])
+            probs = srsly.json_loads(all_data[2])
+            for header, indices, probs in zip(headers, indices, probs):
+                alias_hash, candidate_length = header
+                alias.entry_indices = indices
+                alias.probs = probs
+                self._aliases_table[i] = alias
+                self._alias_index[alias_hash] = i
+                i += 1
+
+        setters = {
+            "header": deserialize_header,
+            "entity_vectors": deserialize_vectors,
+            "entries": deserialize_entries,
+            "aliases": deserialize_aliases,
+        }
+        util.from_bytes(bytes_data, setters, exclude)
+        return self
+
     def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
         path = ensure_path(path)
         if not path.exists():

@@ -404,10 +508,8 @@ cdef class KnowledgeBase:
         cdef int64_t entity_vector_length
         reader.read_header(&nr_entities, &entity_vector_length)

+        self.initialize_entities(nr_entities)
         self.entity_vector_length = entity_vector_length
-        self._entry_index = PreshMap(nr_entities+1)
-        self._entries = entry_vec(nr_entities+1)
-        self._vectors_table = float_matrix(nr_entities+1)

         # STEP 1: load entity vectors
         cdef int i = 0

@@ -445,8 +547,7 @@ cdef class KnowledgeBase:
         # STEP 3: load aliases
         cdef int64_t nr_aliases
         reader.read_alias_length(&nr_aliases)
-        self._alias_index = PreshMap(nr_aliases+1)
-        self._aliases_table = alias_vec(nr_aliases+1)
+        self.initialize_aliases(nr_aliases)

         cdef int64_t nr_candidates
         cdef vector[int64_t] entry_indices
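Note: a small sketch of how the new `KnowledgeBase.to_bytes` / `from_bytes` round trip can be used (entity IDs, vectors and aliases here are toy values, not part of the diff):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])

    data = kb.to_bytes()  # new in this change
    kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    kb2.from_bytes(data)  # restores entries, entity vectors and aliases
    assert kb2.get_size_entities() == 1
    assert kb2.get_size_aliases() == 1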
spacy/lang/az/__init__.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
+from ...language import Language
+
+
+class AzerbaijaniDefaults(Language.Defaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    token_match = TOKEN_MATCH
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class Azerbaijani(Language):
+    lang = "az"
+    Defaults = AzerbaijaniDefaults
+
+
+__all__ = ["Azerbaijani"]
spacy/lang/az/examples.py (new file, 18 lines)

@@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.az.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Bu bir cümlədir.",
+    "Necəsən?",
+    "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.",
+    "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.",
+    "Atılan növbəti mərmilər lap yaxınlıqda partladı.",
+    "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.",
+    "Marsda ilk sınaq uçuşu həyata keçirilib.",
+    "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.",
+    "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.",
+]
spacy/lang/az/lex_attrs.py (new file, 89 lines; the two word lists are one item per line in the file and are flowed here for brevity)

@@ -0,0 +1,89 @@
+from ...attrs import LIKE_NUM
+
+
+# Eleven, twelve etc. are written separate: on bir, on iki
+
+_num_words = [
+    "bir", "iki", "üç", "dörd", "beş", "altı", "yeddi", "səkkiz", "doqquz", "on",
+    "iyirmi", "otuz", "qırx", "əlli", "altmış", "yetmiş", "səksən", "doxsan",
+    "yüz", "min", "milyon", "milyard", "trilyon", "kvadrilyon", "kentilyon",
+]
+
+_ordinal_words = [
+    "birinci", "ikinci", "üçüncü", "dördüncü", "beşinci", "altıncı", "yedinci",
+    "səkkizinci", "doqquzuncu", "onuncu", "iyirminci", "otuzuncu", "qırxıncı",
+    "əllinci", "altmışıncı", "yetmişinci", "səksəninci", "doxsanıncı", "yüzüncü",
+    "mininci", "milyonuncu", "milyardıncı", "trilyonuncu", "kvadrilyonuncu",
+    "kentilyonuncu",
+]
+
+_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    text_lower = text.lower()
+    # Check cardinal number
+    if text_lower in _num_words:
+        return True
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith(_ordinal_endings):
+        if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
+            return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
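Note: a quick illustration of what the new Azerbaijani `like_num` accepts (words and comments below are illustrative, not part of the diff):

    from spacy.lang.az.lex_attrs import like_num

    print(like_num("altı"))        # True  (cardinal word)
    print(like_num("doqquzuncu"))  # True  (ordinal word)
    print(like_num("12/5"))        # True  (simple fraction)
    print(like_num("kitab"))       # False (ordinary word)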
spacy/lang/az/stop_words.py (new file, 145 lines; one word per line in the file, flowed here for brevity)

@@ -0,0 +1,145 @@
+# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py
+STOP_WORDS = set(
+    """
+amma arasında artıq ay az bax belə beş bilər bir biraz biri birşey biz bizim bizlər
+bu buna bundan bunların bunu bunun buradan bütün bəli bəlkə bəy bəzi bəzən
+daha dedi deyil dir düz də dək dən dəqiqə
+edir edən elə et etdi etmə etmək faiz gilə görə
+ha haqqında harada heç hə həm həmin həmişə hər
+idi il ildə ilk ilə in indi istifadə isə
+ki kim kimi kimə lakin lap mirşey məhz mən mənə
+niyə nə nəhayət o obirisi of olan olar olaraq oldu olduğu olmadı olmaz olmuşdur olsun olur
+on ona ondan onlar onlardan onların onsuzda onu onun oradan
+qarşı qədər saat sadəcə saniyə siz sizin sizlər sonra səhv sən sənin sənə
+təəssüf var və xan xanım xeyr ya yalnız yaxşı yeddi yenə yox yoxdur yoxsa yəni zaman
+çox çünki öz özü üçün əgər əlbəttə ən əslində
+""".split()
+)
@@ -22,13 +22,13 @@ _num_words = [
     "тринадесет",
     "тринайсет",
     "четиринадесет",
-    "четиринайсет"
+    "четиринайсет",
     "петнадесет",
-    "петнайсет"
+    "петнайсет",
     "шестнадесет",
     "шестнайсет",
     "седемнадесет",
-    "седемнайсет"
+    "седемнайсет",
     "осемнадесет",
     "осемнайсет",
     "деветнадесет",

@@ -36,7 +36,7 @@ _num_words = [
     "двадесет",
     "двайсет",
     "тридесет",
-    "трийсет"
+    "трийсет",
     "четиридесет",
     "четиресет",
     "петдесет",

@@ -58,7 +58,6 @@ _abbr_dot_exc = [
     {ORTH: "стр.", NORM: "страница"},
     {ORTH: "ул.", NORM: "улица"},
     {ORTH: "чл.", NORM: "член"},
-
 ]

 for abbr in _abbr_dot_exc:
@@ -1,15 +1,23 @@
+from typing import Optional
+
+from thinc.api import Model
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from .lemmatizer import CatalanLemmatizer


 class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS


 class Catalan(Language):

@@ -17,4 +25,16 @@ class Catalan(Language):
     Defaults = CatalanDefaults


+@Catalan.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Catalan"]
spacy/lang/ca/lemmatizer.py (new file, 81 lines; the POS tuple is one item per line in the file and is flowed here for brevity)

@@ -0,0 +1,81 @@
+from typing import List, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class CatalanLemmatizer(Lemmatizer):
+    """
+    Copied from French Lemmatizer
+    Catalan language lemmatizer applies the default rule based lemmatization
+    procedure with some modifications for better Catalan language support.
+
+    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
+    the rule-based lemmatization. As a last resort, the lemmatizer checks in
+    the lookup table.
+    """
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "rule":
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun", "verb", "adj", "adp", "adv", "aux", "cconj", "det", "pron", "punct", "sconj",
+        ):
+            return self.lookup_lemmatize(token)
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            self.cache[cache_key] = forms
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(self.lookup_lemmatize(token)[0])
+        if not forms:
+            forms.append(string)
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms
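Note: a rough sketch of wiring up the new Catalan rule lemmatizer in a blank pipeline; it assumes the required lookup tables (lemma_rules, lemma_exc, lemma_index, lemma_lookup) are available, e.g. from the spacy-lookups-data package, and that coarse POS tags come from a tagger/morphologizer earlier in the pipeline (without them the rule mode falls back to lowercasing). Illustrative only, not part of the diff.

    import spacy

    nlp = spacy.blank("ca")
    nlp.add_pipe("lemmatizer", config={"mode": "rule"})
    nlp.initialize()  # loads the lookup tables if spacy-lookups-data is installed
    doc = nlp("Les cases")
    print([token.lemma_ for token in doc])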
@@ -1,12 +1,46 @@
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import CURRENCY
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import merge_chars, _units


 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+_units = _units.replace("% ", "")
+UNITS = merge_chars(_units)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [r"-", "—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)

 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
spacy/lang/ca/syntax_iterators.py (new file, 46 lines)

@@ -0,0 +1,46 @@
+from ...symbols import NOUN, PROPN
+from ...errors import Errors
+
+
+def noun_chunks(doclike):
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    np_label = doc.vocab.strings.add("NP")
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN):
+            continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
+        if word.dep in np_deps:
+            left = word.left_edge.i
+            right = word.right_edge.i + 1
+            # leave prepositions and punctuation out of the left side of the chunk
+            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
+                left = word.left_edge.i + 1
+            prev_end = word.right_edge.i
+            # leave subordinated clauses and appositions out of the chunk
+            a = word.i + 1
+            while a < word.right_edge.i:
+                paraula = doc[a]
+                if paraula.pos_ == "VERB":
+                    right = paraula.left_edge.i
+                    prev_end = paraula.left_edge.i - 1
+                elif paraula.dep_ == "appos":
+                    right = paraula.left_edge.i + 1
+                    prev_end = paraula.left_edge.i - 1
+                a += 1
+            # leave punctuation out of the right side of the chunk
+            if word.right_edge.pos_ == "PUNCT":
+                right = right - 1
+            yield left, right, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
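Note: the new iterator is exposed through `doc.noun_chunks` once a parser has assigned dependency labels; a sketch, assuming a trained Catalan pipeline (the package name "ca_core_news_sm" is assumed here, not part of the diff):

    import spacy

    nlp = spacy.load("ca_core_news_sm")  # must include a dependency parser
    doc = nlp("El gat negre dorm al sofà del menjador.")
    for chunk in doc.noun_chunks:
        print(chunk.text)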
@@ -24,6 +24,13 @@ for exc_data in [
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "'l"},
+    {ORTH: "'ls"},
+    {ORTH: "'m"},
+    {ORTH: "'n"},
+    {ORTH: "'ns"},
+    {ORTH: "'s"},
+    {ORTH: "'t"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
@@ -260,7 +260,10 @@ _units = (
     "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
     "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
 )
-_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
+_currency = (
+    r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ "
+    r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿"
+)

 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
@@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
         forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
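Note: the change above swaps `set()` for `dict.fromkeys()` so duplicate lemma candidates are removed while the original order is preserved; a quick plain-Python illustration (example values only):

    forms = ["γράφω", "γράφω", "γράφει"]
    print(list(set(forms)))            # order depends on hashing
    print(list(dict.fromkeys(forms)))  # always ['γράφω', 'γράφει'] – first occurrence wins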
@@ -35,7 +35,7 @@ def like_num(text: str) -> bool:
     # Check ordinal number
     if text_lower in _ordinal_words:
         return True
-    if text_lower.endswith("th"):
+    if text_lower.endswith(("st", "nd", "rd", "th")):
         if text_lower[:-2].isdigit():
             return True
     return False
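Note: the widened suffix check now also catches "1st", "2nd" and "3rd", not just "-th" ordinals; a small illustration of the new condition (standalone, not part of the diff):

    suffixes = ("st", "nd", "rd", "th")

    for text in ["1st", "2nd", "3rd", "4th", "11th"]:
        text_lower = text.lower()
        ok = text_lower.endswith(suffixes) and text_lower[:-2].isdigit()
        print(text, ok)  # all True now; previously only the "-th" forms matched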
@@ -1,5 +1,5 @@
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc


@@ -79,5 +79,34 @@ for exc_data in [
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

+# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
+conj_contraction_bases = [
+    ("ett", "että"),
+    ("jott", "jotta"),
+    ("kosk", "koska"),
+    ("mutt", "mutta"),
+    ("vaikk", "vaikka"),
+    ("ehk", "ehkä"),
+    ("miks", "miksi"),
+    ("siks", "siksi"),
+    ("joll", "jos"),
+    ("ell", "jos"),
+]
+conj_contraction_negations = [
+    ("en", "en"),
+    ("et", "et"),
+    ("ei", "ei"),
+    ("emme", "emme"),
+    ("ette", "ette"),
+    ("eivat", "eivät"),
+    ("eivät", "eivät"),
+]
+for (base_lower, base_norm) in conj_contraction_bases:
+    for base in [base_lower, base_lower.title()]:
+        for (suffix, suffix_norm) in conj_contraction_negations:
+            _exc[base + suffix] = [
+                {ORTH: base, NORM: base_norm},
+                {ORTH: suffix, NORM: suffix_norm},
+            ]
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
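Note: the nested loops above generate exceptions for Finnish conjunction + negation contractions such as "ettei" and "Miksei"; a sketch of the resulting tokenization, assuming a blank Finnish pipeline (example sentence only):

    import spacy

    nlp = spacy.blank("fi")
    doc = nlp("Sanoin, ettei se haittaa.")
    print([t.text for t in doc])   # "ettei" is split into "ett" + "ei"
    print([t.norm_ for t in doc])  # with norms "että" and "ei"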
@@ -1,30 +1,31 @@
 STOP_WORDS = set(
 """
 a à â abord afin ah ai aie ainsi ait allaient allons
-alors anterieur anterieure anterieures apres après as assez attendu au
-aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
+alors anterieur anterieure anterieures antérieur antérieure antérieures
+apres après as assez attendu au
+aupres auquel aura auraient aurait auront
 aussi autre autrement autres autrui aux auxquelles auxquels avaient
 avais avait avant avec avoir avons ayant

 bas basee bat

-c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui
-celui-ci celui-là cent cependant certain certaine certaines certains certes ces
+c' c’ ça car ce ceci cela celle celle-ci celle-la celle-là celles celles-ci celles-la celles-là
+celui celui-ci celui-la celui-là cent cependant certain certaine certaines certains certes ces
 cet cette ceux ceux-ci ceux-là chacun chacune chaque chez ci cinq cinquantaine cinquante
 cinquantième cinquième combien comme comment compris concernant

-d' d’ da dans de debout dedans dehors deja delà depuis derriere
+d' d’ da dans de debout dedans dehors deja dejà delà depuis derriere
 derrière des desormais desquelles desquels dessous dessus deux deuxième
-deuxièmement devant devers devra different differentes differents différent
+deuxièmement devant devers devra different differente differentes differents différent
 différente différentes différents dire directe directement dit dite dits divers
 diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
-douze douzième du duquel durant dès désormais
+douze douzième du duquel durant dès déja déjà désormais

-effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
+effet egalement eh elle elle-meme elle-même elles elles-memes elles-mêmes en encore
 enfin entre envers environ es ès est et etaient étaient etais étais etait était
-etant étant etc été etre être eu eux eux-mêmes exactement excepté
+etant étant etc etre être eu eux eux-mêmes exactement excepté également

-fais faisaient faisant fait façon feront font
+fais faisaient faisant fait facon façon feront font

 gens

@@ -36,45 +37,48 @@ j' j’ je jusqu jusque juste
 l' l’ la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps
 lors lorsque lui lui-meme lui-même là lès

-m' m’ ma maint maintenant mais malgre me meme memes merci mes mien
+m' m’ ma maint maintenant mais malgre malgré me meme memes merci mes mien
 mienne miennes miens mille moi moi-meme moi-même moindres moins
 mon même mêmes

 n' n’ na ne neanmoins neuvième ni nombreuses nombreux nos notamment
-notre nous nous-mêmes nouvea nul néanmoins nôtre nôtres
+notre nous nous-mêmes nouveau nul néanmoins nôtre nôtres

-o ô on ont onze onzième ore ou ouias oust outre
+o ô on ont onze onzième or ou ouias ouste outre
 ouvert ouverte ouverts où

-par parce parfois parle parlent parler parmi parseme partant
+par parce parfois parle parlent parler parmi partant
 pas pendant pense permet personne peu peut peuvent peux plus
-plusieurs plutôt possible possibles pour pourquoi
-pourrais pourrait pouvait prealable precisement premier première premièrement
-pres procedant proche près pu puis puisque
+plusieurs plutot plutôt possible possibles pour pourquoi
+pourrais pourrait pouvait prealable precisement
+premier première premièrement
+pres procedant proche près préalable précisement pu puis puisque

-qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
+qu' qu’ quand quant quant-à-soi quarante quatorze quatre quatre-vingt
 quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
 quelques quels qui quiconque quinze quoi quoique

 relative relativement rend rendre restant reste
-restent retour revoici revoilà
+restent retour revoici revoila revoilà

 s' s’ sa sait sans sauf se seize selon semblable semblaient
 semble semblent sent sept septième sera seraient serait seront ses seul seule
-seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
-soixante son sont sous souvent specifique specifiques stop
+seulement seuls seules si sien sienne siennes siens sinon six sixième soi soi-meme soi-même soit
+soixante son sont sous souvent specifique specifiques spécifique spécifiques stop
 suffisant suffisante suffit suis suit suivant suivante
 suivantes suivants suivre sur surtout

 t' t’ ta tant te tel telle tellement telles tels tenant tend tenir tente
-tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous
-tout toute toutes treize trente tres trois troisième troisièmement
+tes tien tienne tiennes tiens toi toi-meme toi-même ton touchant toujours tous
+tout toute toutes treize trente tres trois troisième troisièmement très
 tu té

 un une unes uns

-va vais vas vers via vingt voici voilà vont vos
-votre vous vous-mêmes vu vé vôtre vôtres
+va vais vas vers via vingt voici voila voilà vont vos
+votre votres vous vous-mêmes vu vé vôtre vôtres
+
+y

 """.split()
 )
@@ -1,7 +1,11 @@
+from typing import Optional
+from thinc.api import Model
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ...language import Language
+from .lemmatizer import ItalianLemmatizer


 class ItalianDefaults(Language.Defaults):

@@ -16,4 +20,16 @@ class Italian(Language):
     Defaults = ItalianDefaults


+@Italian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Italian"]
spacy/lang/it/lemmatizer.py (new file, 132 lines; short list literals are one item per line in the file and are flowed here for brevity)

@@ -0,0 +1,132 @@
+from typing import List, Dict, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class ItalianLemmatizer(Lemmatizer):
+    """This lemmatizer was adapted from the Polish one (version of April 2021).
+    It implements lookup lemmatization based on the morphological lexicon
+    morph-it (Baroni and Zanchetta). The table lemma_lookup with non-POS-aware
+    entries is used as a backup for words that aren't handled by morph-it."""
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "pos_lookup":
+            required = [
+                "lemma_lookup_num", "lemma_lookup_det", "lemma_lookup_adp",
+                "lemma_lookup_adj", "lemma_lookup_noun", "lemma_lookup_pron",
+                "lemma_lookup_verb", "lemma_lookup_aux", "lemma_lookup_adv",
+                "lemma_lookup_other", "lemma_lookup",
+            ]
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
+        lookup_pos = univ_pos.lower()
+        if univ_pos == "PROPN":
+            lookup_pos = "noun"
+        elif univ_pos == "PART":
+            lookup_pos = "pron"
+        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
+        if univ_pos == "NOUN":
+            return self.lemmatize_noun(string, morphology, lookup_table)
+        else:
+            if univ_pos != "PROPN":
+                string = string.lower()
+            if univ_pos == "DET":
+                return self.lemmatize_det(string, morphology, lookup_table)
+            elif univ_pos == "PRON":
+                return self.lemmatize_pron(string, morphology, lookup_table)
+            elif univ_pos == "ADP":
+                return self.lemmatize_adp(string, morphology, lookup_table)
+            elif univ_pos == "ADJ":
+                return self.lemmatize_adj(string, morphology, lookup_table)
+            else:
+                lemma = lookup_table.get(string, "")
+                if not lemma:
+                    lookup_table = self.lookups.get_table("lemma_lookup_other")
+                    lemma = lookup_table.get(string, "")
+                if not lemma:
+                    lookup_table = self.lookups.get_table(
+                        "lemma_lookup"
+                    )  # "legacy" lookup table
+                    lemma = lookup_table.get(string, string.lower())
+                return [lemma]
+
+    def lemmatize_det(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
+        if string in ["l'", "lo", "la", "i", "gli", "le"]:
+            return ["il"]
+        if string in ["un'", "un", "una"]:
+            return ["uno"]
+        return [lookup_table.get(string, string)]
+
+    def lemmatize_pron(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
+        if string in ["l'", "li", "la", "gli", "le"]:
+            return ["lo"]
+        if string in ["un'", "un", "una"]:
+            return ["uno"]
+        lemma = lookup_table.get(string, string)
+        if lemma == "alcun":
+            lemma = "alcuno"
+        elif lemma == "qualcun":
+            lemma = "qualcuno"
+        return [lemma]
+
+    def lemmatize_adp(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
+        if string == "d'":
+            return ["di"]
+        return [lookup_table.get(string, string)]
+
+    def lemmatize_adj(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
+        lemma = lookup_table.get(string, string)
+        if lemma == "alcun":
+            lemma = "alcuno"
+        elif lemma == "qualcun":
+            lemma = "qualcuno"
+        return [lemma]
+
+    def lemmatize_noun(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
+        # this method is case-sensitive, in order to work
+        # for incorrectly tagged proper names
+        if string != string.lower():
+            if string.lower() in lookup_table:
+                return [lookup_table[string.lower()]]
+            elif string in lookup_table:
+                return [lookup_table[string]]
+            return [string.lower()]
+        return [lookup_table.get(string, string)]
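Note: the new "pos_lookup" mode selects a per-POS lookup table, so it needs coarse POS tags and the morph-it derived tables from spacy-lookups-data; a sketch using a trained Italian pipeline (the package name "it_core_news_sm" is assumed here, not part of the diff):

    import spacy

    nlp = spacy.load("it_core_news_sm")  # tagger supplies token.pos_ for the per-POS tables
    doc = nlp("Le case sono belle.")
    print([(t.text, t.pos_, t.lemma_) for t in doc])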
@@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
 subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
 sullo suo suoi

-tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
+tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
 troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto

 uguali ulteriore ultimo un una uno uomo
@@ -25,7 +25,7 @@ for orth in [
     "artt.",
     "att.",
     "avv.",
-    "Avv."
+    "Avv.",
     "by-pass",
     "c.d.",
     "c/c",
@@ -27,7 +27,7 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
@@ -23,15 +23,16 @@ class RussianLemmatizer(Lemmatizer):
         mode: str = "pymorphy2",
         overwrite: bool = False,
     ) -> None:
-        try:
-            from pymorphy2 import MorphAnalyzer
-        except ImportError:
-            raise ImportError(
-                "The Russian lemmatizer requires the pymorphy2 library: "
-                'try to fix it with "pip install pymorphy2"'
-            ) from None
-        if RussianLemmatizer._morph is None:
-            RussianLemmatizer._morph = MorphAnalyzer()
+        if mode == "pymorphy2":
+            try:
+                from pymorphy2 import MorphAnalyzer
+            except ImportError:
+                raise ImportError(
+                    "The Russian lemmatizer mode 'pymorphy2' requires the "
+                    "pymorphy2 library. Install it with: pip install pymorphy2"
+                ) from None
+            if RussianLemmatizer._morph is None:
+                RussianLemmatizer._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
@@ -35,8 +35,8 @@ URL_PATTERN = (
     # host & domain names
     # mods: match is case-sensitive, so include [A-Z]
     r"(?:"  # noqa: E131
-    r"(?:"
-    r"[A-Za-z0-9\u00a1-\uffff]"
+    r"(?:"  # noqa: E131
+    r"[A-Za-z0-9\u00a1-\uffff]"  # noqa: E131
     r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
     r")?"
     r"[A-Za-z0-9\u00a1-\uffff]\."
@@ -18,14 +18,15 @@ class UkrainianLemmatizer(RussianLemmatizer):
         mode: str = "pymorphy2",
         overwrite: bool = False,
     ) -> None:
-        try:
-            from pymorphy2 import MorphAnalyzer
-        except ImportError:
-            raise ImportError(
-                "The Ukrainian lemmatizer requires the pymorphy2 library and "
-                "dictionaries: try to fix it with "
-                '"pip install pymorphy2 pymorphy2-dicts-uk"'
-            ) from None
-        if UkrainianLemmatizer._morph is None:
-            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
+        if mode == "pymorphy2":
+            try:
+                from pymorphy2 import MorphAnalyzer
+            except ImportError:
+                raise ImportError(
+                    "The Ukrainian lemmatizer mode 'pymorphy2' requires the "
+                    "pymorphy2 library and dictionaries. Install them with: "
+                    "pip install pymorphy2 pymorphy2-dicts-uk"
+                ) from None
+            if UkrainianLemmatizer._morph is None:
+                UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
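A minimal usage sketch for the lemmatizer mode shown above, assuming pymorphy2 (and pymorphy2-dicts-uk for Ukrainian) is installed; the "lemmatizer" component name and the "mode" setting follow the usual spaCy factory conventions and are not spelled out in this diff:

import spacy

# assumption: pip install pymorphy2 pymorphy2-dicts-uk
nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})
nlp.initialize()
doc = nlp("мама мыла раму")
print([token.lemma_ for token in doc])  # pymorphy2-based lemmas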
@@ -1,8 +1,15 @@
+from typing import Any, Dict, Union
+from pathlib import Path
+import re
+import srsly
+import string
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ... import util


 DEFAULT_CONFIG = """
@@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):

     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
             return Doc(self.vocab, words=words, spaces=spaces)

+    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
+    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
+    # See licenses/3rd_party_licenses.txt
+    def pyvi_sylabelize_with_ws(self, text):
+        """Modified from pyvi to preserve whitespace and skip unicode
+        normalization."""
+        specials = [r"==>", r"->", r"\.\.\.", r">>"]
+        digit = r"\d+([\.,_]\d+)+"
+        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
+        web = r"\w+://[^\s]+"
+        word = r"\w+"
+        non_word = r"[^\w\s]"
+        abbreviations = [
+            r"[A-ZĐ]+\.",
+            r"Tp\.",
+            r"Mr\.",
+            r"Mrs\.",
+            r"Ms\.",
+            r"Dr\.",
+            r"ThS\.",
+        ]
+
+        patterns = []
+        patterns.extend(abbreviations)
+        patterns.extend(specials)
+        patterns.extend([web, email])
+        patterns.extend([digit, non_word, word])
+
+        patterns = r"(\s+|" + "|".join(patterns) + ")"
+        tokens = re.findall(patterns, text, re.UNICODE)
+
+        return [token[0] for token in tokens]
+
+    def pyvi_tokenize(self, text):
+        """Modified from pyvi to preserve text and whitespace."""
+        if len(text) == 0:
+            return []
+        elif text.isspace():
+            return [text]
+        segs = self.pyvi_sylabelize_with_ws(text)
+        words = []
+        preceding_ws = []
+        for i, token in enumerate(segs):
+            if not token.isspace():
+                words.append(token)
+                preceding_ws.append(
+                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
+                )
+        labels = self.ViTokenizer.ViTokenizer.model.predict(
+            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
+        )
+        token = words[0]
+        tokens = []
+        for i in range(1, len(labels[0])):
+            if (
+                labels[0][i] == "I_W"
+                and words[i] not in string.punctuation
+                and words[i - 1] not in string.punctuation
+                and not words[i][0].isdigit()
+                and not words[i - 1][0].isdigit()
+                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
+            ):
+                token = token + preceding_ws[i] + words[i]
+            else:
+                tokens.append(token)
+                token = words[i]
+        tokens.append(token)
+        return tokens
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"use_pyvi": self.use_pyvi}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.use_pyvi = config.get("use_pyvi", False)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        return self
+

 class VietnameseDefaults(Language.Defaults):
     config = load_config_from_str(DEFAULT_CONFIG)
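A short sketch of how the two tokenizer modes above behave; the config path and the registered tokenizer name below are assumptions based on spaCy's usual naming, not something this diff spells out:

import spacy

# Default: pyvi-based word segmentation (requires the pyvi package).
nlp = spacy.blank("vi")
doc = nlp("Tôi là sinh viên.")
print([t.text for t in doc])

# Fallback: plain whitespace splitting via util.get_words_and_spaces.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.vi.VietnameseTokenizer", "use_pyvi": False}}}
nlp_ws = spacy.blank("vi", config=config)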
@@ -1,4 +1,5 @@
-from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable, TypeVar
+from typing import Union, List, Pattern, overload
 from typing import Tuple
 from dataclasses import dataclass
 import random
@@ -13,6 +14,7 @@ import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
 from timeit import default_timer as timer
+import traceback

 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
@@ -433,9 +435,9 @@ class Language:
         default_config (Dict[str, Any]): Default configuration, describing the
             default values of the factory arguments.
         assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         requires (Iterable[str]): Doc/Token attributes required by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
         default_score_weights (Dict[str, float]): The scores to report during
@@ -518,9 +520,9 @@ class Language:

         name (str): The name of the component factory.
         assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         requires (Iterable[str]): Doc/Token attributes required by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
         func (Optional[Callable]): Factory function if not used as a decorator.
@@ -686,11 +688,13 @@ class Language:
         if not isinstance(source, Language):
             raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
         # Check vectors, with faster checks first
-        if self.vocab.vectors.shape != source.vocab.vectors.shape or \
-                self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
-                self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
-            util.logger.warning(Warnings.W113.format(name=source_name))
-        if not source_name in source.component_names:
+        if (
+            self.vocab.vectors.shape != source.vocab.vectors.shape
+            or self.vocab.vectors.key2row != source.vocab.vectors.key2row
+            or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
+        ):
+            warnings.warn(Warnings.W113.format(name=source_name))
+        if source_name not in source.component_names:
             raise KeyError(
                 Errors.E944.format(
                     name=source_name,
@@ -931,6 +935,7 @@ class Language:
         # because factory may be used for something else
         self._pipe_meta.pop(name)
         self._pipe_configs.pop(name)
+        self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
         # Make sure name is removed from the [initialize] config
         if name in self._config["initialize"]["components"]:
             self._config["initialize"]["components"].pop(name)
@@ -1074,6 +1079,7 @@ class Language:
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
     ):
         """Update the models in the pipeline.

@@ -1081,10 +1087,13 @@ class Language:
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
         sgd (Optimizer): An optimizer.
-        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
+            component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
             components, keyed by component name.
         exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
         RETURNS (Dict[str, float]): The updated losses dictionary

         DOCS: https://spacy.io/api/language#update
@@ -1103,15 +1112,16 @@ class Language:
             sgd = self._optimizer
         if component_cfg is None:
             component_cfg = {}
+        pipe_kwargs = {}
         for i, (name, proc) in enumerate(self.pipeline):
             component_cfg.setdefault(name, {})
+            pipe_kwargs[name] = deepcopy(component_cfg[name])
             component_cfg[name].setdefault("drop", drop)
+            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            if name in exclude or not hasattr(proc, "update"):
-                continue
-            proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
-        if sgd not in (None, False):
-            for name, proc in self.pipeline:
+            if name not in exclude and hasattr(proc, "update"):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
+            if sgd not in (None, False):
                 if (
                     name not in exclude
                     and hasattr(proc, "is_trainable")
@@ -1119,6 +1129,18 @@ class Language:
                     and proc.model not in (True, False, None)
                 ):
                     proc.finish_update(sgd)
+            if name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=proc,
+                        name=name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
         return losses

     def rehearse(
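A hedged sketch of the new annotates argument in a training loop; nlp, optimizer and train_examples are assumed to already exist, and "tagger" stands in for any updatable component whose predictions later components should see:

import spacy

losses = {}
for batch in spacy.util.minibatch(train_examples, size=8):
    nlp.update(
        batch,
        sgd=optimizer,
        losses=losses,
        # components listed here re-run prediction after their update and
        # write the result back to eg.predicted for downstream components
        annotates=["tagger"],
    )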
@@ -1410,6 +1432,21 @@ class Language:
         except StopIteration:
             pass

+    _AnyContext = TypeVar("_AnyContext")
+
+    @overload
+    def pipe(
+        self,
+        texts: Iterable[Tuple[str, _AnyContext]],
+        *,
+        as_tuples: bool = ...,
+        batch_size: Optional[int] = ...,
+        disable: Iterable[str] = ...,
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
+        n_process: int = ...,
+    ) -> Iterator[Tuple[Doc, _AnyContext]]:
+        ...
+
     def pipe(
         self,
         texts: Iterable[str],
@@ -1419,7 +1456,7 @@ class Language:
         disable: Iterable[str] = SimpleFrozenList(),
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
-    ):
+    ) -> Iterator[Doc]:
         """Process texts as a stream, and yield `Doc` objects in order.

         texts (Iterable[str]): A sequence of texts to process.
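With the @overload above, nlp.pipe is typed as Iterator[Doc] for plain texts and as Iterator[Tuple[Doc, context]] when as_tuples=True; a small sketch, assuming an existing nlp object:

texts = ["A sentence.", "Another one."]
for doc in nlp.pipe(texts):  # Iterator[Doc]
    print(doc[0].text)

data = [("A sentence.", {"id": 1}), ("Another one.", {"id": 2})]
for doc, ctx in nlp.pipe(data, as_tuples=True):  # Iterator[Tuple[Doc, _AnyContext]]
    print(ctx["id"], len(doc))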
@@ -1521,11 +1558,21 @@ class Language:

         # Cycle channels not to break the order of docs.
         # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
-        byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
-        docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
+        byte_tuples = chain.from_iterable(
+            recv.recv() for recv in cycle(bytedocs_recv_ch)
+        )
         try:
-            for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
-                yield doc
+            for i, (_, (byte_doc, byte_error)) in enumerate(
+                zip(raw_texts, byte_tuples), 1
+            ):
+                if byte_doc is not None:
+                    doc = Doc(self.vocab).from_bytes(byte_doc)
+                    yield doc
+                elif byte_error is not None:
+                    error = srsly.msgpack_loads(byte_error)
+                    self.default_error_handler(
+                        None, None, None, ValueError(Errors.E871.format(error=error))
+                    )
                 if i % batch_size == 0:
                     # tell `sender` that one batch was consumed.
                     sender.step()
@@ -1650,6 +1697,8 @@ class Language:
         # If components are loaded from a source (existing models), we cache
         # them here so they're only loaded once
         source_nlps = {}
+        source_nlp_vectors_hashes = {}
+        nlp.meta["_sourced_vectors_hashes"] = {}
         for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
@@ -1674,17 +1723,27 @@ class Language:
                 else:
                     model = pipe_cfg["source"]
                     if model not in source_nlps:
-                        # We only need the components here and we need to init
-                        # model with the same vocab as the current nlp object
-                        source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
+                        # We only need the components here and we intentionally
+                        # do not load the model with the same vocab because
+                        # this would cause the vectors to be copied into the
+                        # current nlp object (all the strings will be added in
+                        # create_pipe_from_source)
+                        source_nlps[model] = util.load_model(model)
                     source_name = pipe_cfg.get("component", pipe_name)
                     listeners_replaced = False
                     if "replace_listeners" in pipe_cfg:
                         for name, proc in source_nlps[model].pipeline:
                             if source_name in getattr(proc, "listening_components", []):
-                                source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"])
+                                source_nlps[model].replace_listeners(
+                                    name, source_name, pipe_cfg["replace_listeners"]
+                                )
                                 listeners_replaced = True
-                    nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings("ignore", message="\\[W113\\]")
+                        nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
+                    if model not in source_nlp_vectors_hashes:
+                        source_nlp_vectors_hashes[model] = hash(source_nlps[model].vocab.vectors.to_bytes())
+                    nlp.meta["_sourced_vectors_hashes"][pipe_name] = source_nlp_vectors_hashes[model]
                     # Delete from cache if listeners were replaced
                     if listeners_replaced:
                         del source_nlps[model]
@@ -1702,12 +1761,16 @@ class Language:
         for name, proc in nlp.pipeline:
             # Remove listeners not in the pipeline
             listener_names = getattr(proc, "listening_components", [])
-            unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
+            unused_listener_names = [
+                ll for ll in listener_names if ll not in nlp.pipe_names
+            ]
             for listener_name in unused_listener_names:
                 for listener in proc.listener_map.get(listener_name, []):
                     proc.remove_listener(listener, listener_name)

-            for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+            for listener in getattr(
+                proc, "listening_components", []
+            ):  # e.g. tok2vec/transformer
                 # If it's a component sourced from another pipeline, we check if
                 # the tok2vec listeners should be replaced with standalone tok2vec
                 # models (e.g. so component can be frozen without its performance
@@ -1764,6 +1827,7 @@ class Language:
             raise ValueError(err)
         tok2vec = self.get_pipe(tok2vec_name)
         tok2vec_cfg = self.get_pipe_config(tok2vec_name)
+        tok2vec_model = tok2vec.model
         if (
             not hasattr(tok2vec, "model")
             or not hasattr(tok2vec, "listener_map")
@@ -1772,6 +1836,7 @@ class Language:
         ):
             raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
         pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
+        pipe = self.get_pipe(pipe_name)
         pipe_cfg = self._pipe_configs[pipe_name]
         if listeners:
             util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
@@ -1786,7 +1851,6 @@ class Language:
                     n_listeners=len(pipe_listeners),
                 )
                 raise ValueError(err)
-        pipe = self.get_pipe(pipe_name)
         # Update the config accordingly by copying the tok2vec model to all
         # sections defined in the listener paths
         for listener_path in listeners:
@@ -1798,10 +1862,19 @@ class Language:
                     name=pipe_name, tok2vec=tok2vec_name, path=listener_path
                 )
                 raise ValueError(err)
-            util.set_dot_to_object(pipe_cfg, listener_path, tok2vec_cfg["model"])
+            new_config = tok2vec_cfg["model"]
+            if "replace_listener_cfg" in tok2vec_model.attrs:
+                replace_func = tok2vec_model.attrs["replace_listener_cfg"]
+                new_config = replace_func(
+                    tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
+                )
+            util.set_dot_to_object(pipe_cfg, listener_path, new_config)
         # Go over the listener layers and replace them
         for listener in pipe_listeners:
-            util.replace_model_node(pipe.model, listener, tok2vec.model.copy())
+            new_model = tok2vec_model.copy()
+            if "replace_listener" in tok2vec_model.attrs:
+                new_model = tok2vec_model.attrs["replace_listener"](new_model)
+            util.replace_model_node(pipe.model, listener, new_model)
             tok2vec.remove_listener(listener, pipe_name)

     def to_disk(
@@ -1833,7 +1906,11 @@ class Language:
         util.to_disk(path, serializers, exclude)

     def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+        self,
+        path: Union[str, Path],
+        *,
+        exclude: Iterable[str] = SimpleFrozenList(),
+        overrides: Dict[str, Any] = SimpleFrozenDict(),
     ) -> "Language":
         """Loads state from a directory. Modifies the object in place and
         returns it. If the saved `Language` object contains a model, the
@@ -1862,7 +1939,7 @@ class Language:
         deserializers = {}
         if Path(path / "config.cfg").exists():
             deserializers["config.cfg"] = lambda p: self.config.from_disk(
-                p, interpolate=False
+                p, interpolate=False, overrides=overrides
             )
         deserializers["meta.json"] = deserialize_meta
         deserializers["vocab"] = deserialize_vocab
@@ -2019,12 +2096,19 @@ def _apply_pipes(
     """
     Underscore.load_state(underscore_state)
    while True:
-        texts = receiver.get()
-        docs = (make_doc(text) for text in texts)
-        for pipe in pipes:
-            docs = pipe(docs)
-        # Connection does not accept unpickable objects, so send list.
-        sender.send([doc.to_bytes() for doc in docs])
+        try:
+            texts = receiver.get()
+            docs = (make_doc(text) for text in texts)
+            for pipe in pipes:
+                docs = pipe(docs)
+            # Connection does not accept unpickable objects, so send list.
+            byte_docs = [(doc.to_bytes(), None) for doc in docs]
+            padding = [(None, None)] * (len(texts) - len(byte_docs))
+            sender.send(byte_docs + padding)
+        except Exception:
+            error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
+            padding = [(None, None)] * (len(texts) - 1)
+            sender.send(error_msg + padding)


 class _Sender:
@@ -163,7 +163,7 @@ cdef class Lexeme:
             self.vocab.set_vector(self.c.orth, vector)

     property rank:
-        """RETURNS (str): Sequential ID of the lexemes's lexical type, used
+        """RETURNS (str): Sequential ID of the lexeme's lexical type, used
         to index into tables, e.g. for word vectors."""
         def __get__(self):
             return self.c.id
@@ -205,7 +205,7 @@ cdef class Lexeme:
             self.c.lower = x

     property norm:
-        """RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the
+        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
         lexeme text.
         """
         def __get__(self):
@@ -288,7 +288,7 @@ cdef class Lexeme:
             self.c.lower = self.vocab.strings.add(x)

     property norm_:
-        """RETURNS (str): The lexemes's norm, i.e. a normalised form of the
+        """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
         lexeme text.
         """
         def __get__(self):
@@ -12,9 +12,7 @@ from .strings import get_string_id
 UNSET = object()


-def load_lookups(
-    lang: str, tables: List[str], strict: bool = True
-) -> 'Lookups':
+def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
     is not installed.
@@ -4,6 +4,7 @@ from collections import defaultdict
 from itertools import product

 import numpy
+import warnings

 from .matcher cimport Matcher
 from ..vocab cimport Vocab
@@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc

 from ..errors import Errors, Warnings
 from ..tokens import Span
-from ..util import logger


 DELIMITER = "||"
@@ -282,7 +282,7 @@ cdef class DependencyMatcher:
         keys_to_position_maps = defaultdict(lambda: defaultdict(list))
         for match_id, start, end in self._matcher(doc):
             if start + 1 != end:
-                logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
+                warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
             token = doc[start]
             root = ([token] + list(token.ancestors))[-1]
             keys_to_position_maps[root.i][match_id].append(start)
@@ -138,6 +138,11 @@ cdef class Matcher:
             self._filter[key] = greedy
         self._patterns[key].extend(patterns)

+    def _require_patterns(self) -> None:
+        """Raise a warning if this component has no patterns defined."""
+        if len(self) == 0:
+            warnings.warn(Warnings.W036.format(name="matcher"))
+
     def remove(self, key):
         """Remove a rule from the matcher. A KeyError is raised if the key does
         not exist.
@@ -215,6 +220,7 @@ cdef class Matcher:
         If with_alignments is set to True and as_spans is set to False,
         A list of `(match_id, start, end, alignments)` tuples is returned.
         """
+        self._require_patterns()
         if isinstance(doclike, Doc):
             doc = doclike
             length = len(doc)
@@ -284,7 +290,13 @@ cdef class Matcher:
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
         if as_spans:
-            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+            spans = []
+            for key, start, end in final_matches:
+                if isinstance(doclike, Span):
+                    start += doclike.start
+                    end += doclike.start
+                spans.append(Span(doc, start, end, label=key))
+            return spans
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]
             final_matches = []
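The as_spans change above makes matches on a Span come back aligned to the parent Doc; a small self-contained sketch:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("QUICK", [[{"LOWER": "quick"}]])

doc = nlp("The quick brown fox jumps over the lazy dog")
piece = doc[1:5]  # a Span, not a Doc
# start/end are shifted by piece.start, so span.text is the matched token
for span in matcher(piece, as_spans=True):
    print(span.start, span.end, span.text)  # 1 2 quick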
|
@ -50,6 +50,8 @@ cdef class PhraseMatcher:
|
||||||
if isinstance(attr, (int, long)):
|
if isinstance(attr, (int, long)):
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
else:
|
else:
|
||||||
|
if attr is None:
|
||||||
|
attr = "ORTH"
|
||||||
attr = attr.upper()
|
attr = attr.upper()
|
||||||
if attr == "TEXT":
|
if attr == "TEXT":
|
||||||
attr = "ORTH"
|
attr = "ORTH"
|
||||||
|
|
|
@@ -1,6 +1,9 @@
 from thinc.api import Model, normal_init

+from ..util import registry
+

+@registry.layers("spacy.PrecomputableAffine.v1")
 def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
     model = Model(
         "precomputable_affine",
@@ -1,8 +1,10 @@
 from thinc.api import Model

+from ..util import registry
 from ..attrs import LOWER


+@registry.layers("spacy.extract_ngrams.v1")
 def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
     model = Model("extract_ngrams", forward)
     model.attrs["ngram_size"] = ngram_size
spacy/ml/extract_spans.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+from typing import Tuple, Callable
+from thinc.api import Model, to_numpy
+from thinc.types import Ragged, Ints1d
+
+from ..util import registry
+
+
+@registry.layers("spacy.extract_spans.v1")
+def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
+    """Extract spans from a sequence of source arrays, as specified by an array
+    of (start, end) indices. The output is a ragged array of the
+    extracted spans.
+    """
+    return Model(
+        "extract_spans", forward, layers=[], refs={}, attrs={}, dims={}, init=init
+    )
+
+
+def init(model, X=None, Y=None):
+    pass
+
+
+def forward(
+    model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool
+) -> Tuple[Ragged, Callable]:
+    """Get subsequences from source vectors."""
+    ops = model.ops
+    X, spans = source_spans
+    assert spans.dataXd.ndim == 2
+    indices = _get_span_indices(ops, spans, X.lengths)
+    Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])
+    x_shape = X.dataXd.shape
+    x_lengths = X.lengths
+
+    def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]:
+        dX = Ragged(ops.alloc2f(*x_shape), x_lengths)
+        ops.scatter_add(dX.dataXd, indices, dY.dataXd)
+        return (dX, spans)
+
+    return Y, backprop_windows
+
+
+def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
+    """Construct a flat array that has the indices we want to extract from the
+    source data. For instance, if we want the spans (5, 9), (8, 10) the
+    indices will be [5, 6, 7, 8, 8, 9].
+    """
+    spans, lengths = _ensure_cpu(spans, lengths)
+    indices = []
+    offset = 0
+    for i, length in enumerate(lengths):
+        spans_i = spans[i].dataXd + offset
+        for j in range(spans_i.shape[0]):
+            indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))
+        offset += length
+    return ops.flatten(indices)
+
+
+def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
+    return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths))
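A standalone sketch of what the new layer computes, using small NumPy inputs; the shapes and values below are made up for illustration:

import numpy
from thinc.types import Ragged
from spacy.ml.extract_spans import extract_spans

model = extract_spans()
# one "document" of 5 token vectors of width 2
X = Ragged(numpy.arange(10, dtype="f").reshape(5, 2), numpy.asarray([5], dtype="i"))
# two (start, end) spans over that document: (0, 3) and (2, 5)
spans = Ragged(numpy.asarray([[0, 3], [2, 5]], dtype="i"), numpy.asarray([2], dtype="i"))
Y, backprop = model((X, spans), is_train=False)
print(Y.lengths)       # [3 3]
print(Y.dataXd.shape)  # (6, 2): rows 0-2 and 2-4 of X, stacked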
@@ -1,6 +1,7 @@
 from .entity_linker import *  # noqa
 from .multi_task import *  # noqa
 from .parser import *  # noqa
+from .spancat import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
 from .tok2vec import *  # noqa
@@ -6,12 +6,13 @@ from thinc.api import Model, Maxout, Linear
 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
+from ...tokens import Span


 @registry.architectures("spacy.EntityLinker.v1")
 def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     with Model.define_operators({">>": chain, "**": clone}):
-        token_width = tok2vec.get_dim("nO")
+        token_width = tok2vec.maybe_get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)
         model = (
             tok2vec
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:


 @registry.misc("spacy.CandidateGenerator.v1")
-def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates
@@ -13,7 +13,7 @@ from functools import partial
 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
     from ...vocab import Vocab  # noqa: F401
-    from ...tokens import Doc  # noqa: F401
+    from ...tokens.doc import Doc  # noqa: F401


 @registry.architectures("spacy.PretrainVectors.v1")
@@ -205,7 +205,7 @@ def _apply_mask(
     docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
 ) -> Tuple[numpy.ndarray, List["Doc"]]:
     # This needs to be here to avoid circular imports
-    from ...tokens import Doc  # noqa: F811
+    from ...tokens.doc import Doc  # noqa: F811

     N = sum(len(doc) for doc in docs)
     mask = numpy.random.uniform(0.0, 1.0, (N,))
@@ -10,48 +10,7 @@ from ..tb_framework import TransitionModel
 from ...tokens import Doc


-@registry.architectures("spacy.TransitionBasedParser.v1")
-def transition_parser_v1(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    use_upper: bool = True,
-    nO: Optional[int] = None,
-) -> Model:
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        use_upper,
-        nO,
-    )
-
-
 @registry.architectures("spacy.TransitionBasedParser.v2")
-def transition_parser_v2(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    use_upper: bool,
-    nO: Optional[int] = None,
-) -> Model:
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        use_upper,
-        nO,
-    )
-
-
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
spacy/ml/models/spancat.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+from typing import List, Tuple
+from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
+from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
+from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
+from thinc.types import Ragged, Floats2d
+
+from ...util import registry
+from ...tokens import Doc
+from ..extract_spans import extract_spans
+
+
+@registry.layers.register("spacy.LinearLogistic.v1")
+def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
+    """An output layer for multi-label classification. It uses a linear layer
+    followed by a logistic activation.
+    """
+    return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
+
+
+@registry.layers.register("spacy.mean_max_reducer.v1")
+def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
+    """Reduce sequences by concatenating their mean and max pooled vectors,
+    and then combine the concatenated vectors with a hidden layer.
+    """
+    return chain(
+        concatenate(reduce_last(), reduce_first(), reduce_mean(), reduce_max()),
+        Maxout(nO=hidden_size, normalize=True, dropout=0.0),
+    )
+
+
+@registry.architectures.register("spacy.SpanCategorizer.v1")
+def build_spancat_model(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    reducer: Model[Ragged, Floats2d],
+    scorer: Model[Floats2d, Floats2d],
+) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
+    """Build a span categorizer model, given a token-to-vector model, a
+    reducer model to map the sequence of vectors for each span down to a single
+    vector, and a scorer model to map the vectors to probabilities.
+
+    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
+    reducer (Model[Ragged, Floats2d]): The reducer model.
+    scorer (Model[Floats2d, Floats2d]): The scorer model.
+    """
+    model = chain(
+        with_getitem(0, chain(tok2vec, list2ragged())),
+        extract_spans(),
+        reducer,
+        scorer,
+    )
+    model.set_ref("tok2vec", tok2vec)
+    model.set_ref("reducer", reducer)
+    model.set_ref("scorer", scorer)
+    return model
@@ -1,11 +1,13 @@
+from functools import partial
 from typing import Optional, List

 from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import with_cpu, Relu, residual, LayerNorm
+from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable
 from thinc.layers.chain import init as init_chain
+from thinc.layers.resizable import resize_model, resize_linear_weighted

 from ...attrs import ORTH
 from ...util import registry
@@ -15,7 +17,10 @@ from ...tokens import Doc
 from .tok2vec import get_tok2vec_width


-@registry.architectures("spacy.TextCatCNN.v1")
+NEG_VALUE = -5000
+
+
+@registry.architectures("spacy.TextCatCNN.v2")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
@@ -25,38 +30,75 @@ def build_simple_cnn_text_classifier(
     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
     is applied instead, so that outputs are in the range [0, 1].
     """
+    fill_defaults = {"b": 0, "W": 0}
     with Model.define_operators({">>": chain}):
         cnn = tok2vec >> list2ragged() >> reduce_mean()
+        nI = tok2vec.maybe_get_dim("nO")
         if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = cnn >> output_layer
-            model.set_ref("output_layer", output_layer)
+            output_layer = Softmax(nO=nO, nI=nI)
+            fill_defaults["b"] = NEG_VALUE
+            resizable_layer = resizable(
+                output_layer,
+                resize_layer=partial(
+                    resize_linear_weighted, fill_defaults=fill_defaults
+                ),
+            )
+            model = cnn >> resizable_layer
         else:
-            linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = cnn >> linear_layer >> Logistic()
-            model.set_ref("output_layer", linear_layer)
+            output_layer = Linear(nO=nO, nI=nI)
+            resizable_layer = resizable(
+                output_layer,
+                resize_layer=partial(
+                    resize_linear_weighted, fill_defaults=fill_defaults
+                ),
+            )
+            model = cnn >> resizable_layer >> Logistic()
+        model.set_ref("output_layer", output_layer)
+        model.attrs["resize_output"] = partial(
+            resize_and_set_ref,
+            resizable_layer=resizable_layer,
+        )
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
     model.attrs["multi_label"] = not exclusive_classes
     return model


-@registry.architectures("spacy.TextCatBOW.v1")
+def resize_and_set_ref(model, new_nO, resizable_layer):
+    resizable_layer = resize_model(resizable_layer, new_nO)
+    model.set_ref("output_layer", resizable_layer.layers[0])
+    model.set_dim("nO", new_nO, force=True)
+    return model
+
+
+@registry.architectures("spacy.TextCatBOW.v2")
 def build_bow_text_classifier(
     exclusive_classes: bool,
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
+    fill_defaults = {"b": 0, "W": 0}
     with Model.define_operators({">>": chain}):
-        sparse_linear = SparseLinear(nO)
-        model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
-        model = with_cpu(model, model.ops)
+        sparse_linear = SparseLinear(nO=nO)
+        output_layer = None
         if not no_output_layer:
+            fill_defaults["b"] = NEG_VALUE
             output_layer = softmax_activation() if exclusive_classes else Logistic()
+        resizable_layer = resizable(
+            sparse_linear,
+            resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
+        )
+        model = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer
+        model = with_cpu(model, model.ops)
+        if output_layer:
             model = model >> with_cpu(output_layer, output_layer.ops)
+    model.set_dim("nO", nO)
     model.set_ref("output_layer", sparse_linear)
     model.attrs["multi_label"] = not exclusive_classes
+    model.attrs["resize_output"] = partial(
+        resize_and_set_ref, resizable_layer=resizable_layer
+    )
     return model
@@ -69,9 +111,7 @@ def build_text_classifier_v2(
     exclusive_classes = not linear_model.attrs["multi_label"]
     with Model.define_operators({">>": chain, "|": concatenate}):
         width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention(
-            width
-        )  # TODO: benchmark performance difference of this layer
+        attention_layer = ParametricAttention(width)
         maxout_layer = Maxout(nO=width, nI=width)
         norm_layer = LayerNorm(nI=width)
         cnn_model = (
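The point of the resizable(...) wrapper and the "resize_output" attr above is that labels can be added after the model has been built; a hedged sketch of the component-level effect (standard TextCategorizer API, not code from this diff):

import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")  # single-label text classifier
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()
# with a resizable output layer, a label added later can trigger
# model.attrs["resize_output"], filling the new rows from fill_defaults
textcat.add_label("NEUTRAL")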
@@ -1,7 +1,9 @@
 from thinc.api import Model, noop
 from .parser_model import ParserStepModel
+from ..util import registry


+@registry.layers("spacy.TransitionModel.v1")
 def TransitionModel(
     tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
 ):
@@ -15,7 +17,7 @@ def TransitionModel(
     return Model(
         name="parser_model",
         forward=forward,
-        dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None},
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
         layers=[tok2vec, lower, upper],
         refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
         init=init,
@@ -1,14 +1,11 @@
 from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap, PreshMapArray
-from libc.stdint cimport uint64_t
-from murmurhash cimport mrmr
+from preshed.maps cimport PreshMap
 cimport numpy as np
+from libc.stdint cimport uint64_t

-from .structs cimport TokenC, MorphAnalysisC
+from .structs cimport MorphAnalysisC
 from .strings cimport StringStore
-from .typedefs cimport hash_t, attr_t, flags_t
-from .parts_of_speech cimport univ_pos_t
-from . cimport symbols
+from .typedefs cimport attr_t, hash_t


 cdef class Morphology:
@@ -16,14 +13,6 @@ cdef class Morphology:
     cdef readonly StringStore strings
     cdef PreshMap tags  # Keyed by hash, value is pointer to tag

-    cdef public object lemmatizer
-    cdef readonly object tag_map
-    cdef readonly object tag_names
-    cdef readonly object reverse_index
-    cdef readonly object _exc
-    cdef readonly PreshMapArray _cache
-    cdef readonly int n_tags
-
     cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
     cdef int insert(self, MorphAnalysisC tag) except -1

@@ -1,20 +1,11 @@
 # cython: infer_types
-from libc.string cimport memset

-import srsly
-from collections import Counter
 import numpy
 import warnings

-from .attrs cimport POS, IS_SPACE
-from .parts_of_speech cimport SPACE
-from .lexeme cimport Lexeme
+from .attrs cimport POS

-from .strings import get_string_id
-from .attrs import LEMMA, intify_attrs
 from .parts_of_speech import IDS as POS_IDS
-from .errors import Errors, Warnings
+from .errors import Warnings
-from .util import ensure_path
 from . import symbols

@@ -11,6 +11,7 @@ from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
 from .tagger import Tagger
 from .textcat import TextCategorizer
+from .spancat import SpanCategorizer
 from .textcat_multilabel import MultiLabel_TextCategorizer
 from .tok2vec import Tok2Vec
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@@ -27,6 +28,7 @@ __all__ = [
     "Pipe",
     "SentenceRecognizer",
     "Sentencizer",
+    "SpanCategorizer",
     "Tagger",
     "TextCategorizer",
     "Tok2Vec",

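With SpanCategorizer exported from spacy.pipeline, it can be added like any other built-in component. A hedged usage sketch, assuming the component is registered under the factory name "spancat" with a default config:

    import spacy

    nlp = spacy.blank("en")
    spancat = nlp.add_pipe("spancat")  # assumed factory name for SpanCategorizer
    print(nlp.pipe_names)              # ['spancat']
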
@@ -1,3 +1,5 @@
+import os
+import random
 from libc.stdint cimport int32_t
 from cymem.cymem cimport Pool

@@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam

 from ...tokens.doc cimport Doc
 from ...tokens.span import Span
+from ...tokens.span cimport Span
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
-from ...structs cimport TokenC
+from ...structs cimport TokenC, SpanC
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC
@@ -25,7 +28,6 @@ cdef enum:
     LAST
     UNIT
     OUT
-    ISNT
     N_MOVES

@@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I'
 MOVE_NAMES[LAST] = 'L'
 MOVE_NAMES[UNIT] = 'U'
 MOVE_NAMES[OUT] = 'O'
-MOVE_NAMES[ISNT] = 'x'


 cdef struct GoldNERStateC:
     Transition* ner
+    SpanC* negs
     int32_t length
+    int32_t nr_neg


 cdef class BiluoGold:
     cdef Pool mem
     cdef GoldNERStateC c

-    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
+    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
         self.mem = Pool()
-        self.c = create_gold_state(self.mem, moves, stcls.c, example)
+        self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)

     def update(self, StateClass stcls):
         update_gold_state(&self.c, stcls.c)


 cdef GoldNERStateC create_gold_state(
     Pool mem,
     BiluoPushDown moves,
     const StateC* stcls,
-    Example example
+    Example example,
+    neg_key
 ) except *:
     cdef GoldNERStateC gs
+    cdef Span neg
+    if neg_key is not None:
+        negs = example.get_aligned_spans_y2x(
+            example.y.spans.get(neg_key, []),
+            allow_overlap=True
+        )
+    else:
+        negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    ner_tags = example.get_aligned_ner()
+    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
+    gs.nr_neg = len(negs)
+    ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
+
+    # Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
+    neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
+    for pos_span in ner_ents:
+        if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
+            raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
+
+    # In order to handle negative samples, we need to maintain the full
+    # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
+    # thing, we'll get blocked if there's an incorrect prefix.
+    for i, neg in enumerate(negs):
+        gs.negs[i] = neg.c
     return gs

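The gold state above reads known-incorrect ("negative") spans from the reference example's span groups under `neg_key`. A hedged sketch of how such negative annotations might be recorded on a training doc; the key name "incorrect_spans" is illustrative and has to match the component's `incorrect_spans_key` setting:

    import spacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp.make_doc("Dr. Smith flew to Berlin")
    # Record that tokens 0-2 ("Dr. Smith") are known NOT to be a PERSON entity.
    doc.spans["incorrect_spans"] = [Span(doc, 0, 2, label="PERSON")]
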
@@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem):
         cdef attr_t label
         if name == '-' or name == '' or name is None:
             return Transition(clas=0, move=MISSING, label=0, score=0)
-        elif name == '!O':
-            return Transition(clas=0, move=ISNT, label=0, score=0)
         elif '-' in name:
             move_str, label_str = name.split('-', 1)
-            # Hacky way to denote 'not this entity'
+            # Deprecated, hacky way to denote 'not this entity'
             if label_str.startswith('!'):
-                label_str = label_str[1:]
-                move_str = 'x'
+                raise ValueError(Errors.E869.format(label=name))
             label = self.strings.add(label_str)
         else:
             move_str = name
             label = 0
         move = MOVE_NAMES.index(move_str)
-        if move == ISNT:
-            return Transition(clas=0, move=ISNT, label=label, score=0)
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
@@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem):
         label_id = label_name
         if action == OUT and label_id != 0:
             return None
-        if action == MISSING or action == ISNT:
+        if action == MISSING:
             return None
         # Check we're not creating a move we already have, so that this is
         # idempotent
@@ -247,7 +267,7 @@ cdef class BiluoPushDown(TransitionSystem):
         for i in range(state.c._ents.size()):
             ent = state.c._ents.at(i)
             if ent.start != -1 and ent.end != -1:
-                ents.append(Span(doc, ent.start, ent.end, label=ent.label))
+                ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
         doc.set_ents(ents, default="unmodified")
         # Set non-blocked tokens to O
         for i in range(doc.length):
@@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem):
         return parses

     def init_gold(self, StateClass state, Example example):
-        return BiluoGold(self, state, example)
+        return BiluoGold(self, state, example, self.neg_key)

     def has_gold(self, Example eg, start=0, end=None):
+        # We get x and y referring to X, we want to check relative to Y,
+        # the reference
+        y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
+        if not y_spans:
+            y_spans = [eg.y[:]]
+        y_span = y_spans[0]
+        start = y_span.start
+        end = y_span.end
+        neg_key = self.neg_key
+        if neg_key is not None:
+            # If we have any negative samples, count that as having annotation.
+            for span in eg.y.spans.get(neg_key, []):
+                if span.start >= start and span.end <= end:
+                    return True
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem):
                 n_gold += costs[i] <= 0
             else:
                 costs[i] = 9000
-        if n_gold < 1:
-            raise ValueError


 cdef class Missing:

@@ -373,23 +405,33 @@ cdef class Begin:
     @staticmethod
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
-        cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
+        b0 = s.B(0)
+        cdef int cost = 0
+        cdef int g_act = gold.ner[b0].move
+        cdef attr_t g_tag = gold.ner[b0].label

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == BEGIN:
             # B, Gold B --> Label match
-            return label != g_tag
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return label == g_tag
+            cost += label != g_tag
         else:
             # B, Gold I --> False (P)
             # B, Gold L --> False (P)
             # B, Gold O --> False (P)
             # B, Gold U --> False (P)
-            return 1
+            cost += 1
+        if s.buffer_length() < 3:
+            # Handle negatives. In general we can't really do much to block
+            # B, because we don't know whether the whole entity is going to
+            # be correct or not. However, we can at least tell whether we're
+            # going to be opening an entity where there's only one possible
+            # L.
+            for span in gold.negs[:gold.nr_neg]:
+                if span.label == label and span.start == b0:
+                    cost += 1
+                    break
+        return cost


 cdef class In:
@@ -462,9 +504,6 @@ cdef class In:
         elif g_act == UNIT:
             # I, Gold U --> True iff next tag == O
             return next_act != OUT
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return 0
         else:
             return 1

@@ -504,32 +543,41 @@ cdef class Last:
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
         move = LAST
+        b0 = s.B(0)
+        ent_start = s.E(0)

-        cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef int g_act = gold.ner[b0].move
+        cdef attr_t g_tag = gold.ner[b0].label

+        cdef int cost = 0
+
         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == BEGIN:
             # L, Gold B --> True
-            return 0
+            pass
         elif g_act == IN:
             # L, Gold I --> True iff this entity sunk
-            return not _entity_is_sunk(s, gold.ner)
+            cost += not _entity_is_sunk(s, gold.ner)
         elif g_act == LAST:
             # L, Gold L --> True
-            return 0
+            pass
         elif g_act == OUT:
             # L, Gold O --> True
-            return 0
+            pass
         elif g_act == UNIT:
             # L, Gold U --> True
-            return 0
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return 0
+            pass
         else:
-            return 1
+            cost += 1
+        # If we have negative-example entities, integrate them into the objective,
+        # by marking actions that close an entity that we know is incorrect
+        # as costly.
+        for span in gold.negs[:gold.nr_neg]:
+            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+                cost += 1
+                break
+        return cost


 cdef class Unit:
@@ -573,21 +621,29 @@ cdef class Unit:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
         cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef int cost = 0

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == UNIT:
             # U, Gold U --> True iff tag match
-            return label != g_tag
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return label == g_tag
+            cost += label != g_tag
         else:
             # U, Gold B --> False
             # U, Gold I --> False
             # U, Gold L --> False
             # U, Gold O --> False
-            return 1
+            cost += 1
+        # If we have negative-example entities, integrate them into the objective.
+        # This is fairly straight-forward for U- entities, as we have a single
+        # action
+        cdef int b0 = s.B(0)
+        for span in gold.negs[:gold.nr_neg]:
+            if span.label == label and span.start == b0 and span.end == (b0+1):
+                cost += 1
+                break
+        return cost


 cdef class Out:
@@ -613,25 +669,24 @@ cdef class Out:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
         cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef weight_t cost = 0
-        if g_act == ISNT and g_tag == 0:
-            return 1
-        elif g_act == MISSING or g_act == ISNT:
-            return 0
+        if g_act == MISSING:
+            pass
         elif g_act == BEGIN:
             # O, Gold B --> False
-            return 1
+            cost += 1
         elif g_act == IN:
             # O, Gold I --> True
-            return 0
+            pass
         elif g_act == LAST:
             # O, Gold L --> True
-            return 0
+            pass
         elif g_act == OUT:
             # O, Gold O --> True
-            return 0
+            pass
         elif g_act == UNIT:
             # O, Gold U --> False
-            return 1
+            cost += 1
         else:
-            return 1
+            cost += 1
+        return cost

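The cost methods above now accumulate a cost instead of returning early, so the ordinary oracle penalty and the negative-span penalty can combine. A hedged, pure-Python restatement of the Unit case, for illustration only; string move names stand in for the C enum values:

    def unit_cost(g_act, g_tag, label, b0, negs):
        # negs is a list of (start, end, label) triples for known-incorrect spans.
        cost = 0
        if g_act == "MISSING":
            pass
        elif g_act == "UNIT":
            cost += label != g_tag      # U against Gold U: penalise a label mismatch
        else:
            cost += 1                   # U against Gold B/I/L/O is always wrong
        for start, end, neg_label in negs:
            if neg_label == label and start == b0 and end == b0 + 1:
                cost += 1               # this single-token span is known to be incorrect
                break
        return cost
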
@@ -41,6 +41,7 @@ cdef class TransitionSystem:
     cdef public attr_t root_label
     cdef public freqs
     cdef public object labels
+    cdef public object cfg
     cdef init_state_t init_beam_state
     cdef del_state_t del_beam_state

@@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
+    def __init__(
+        self,
+        StringStore string_table,
+        labels_by_action=None,
+        min_freq=None,
+        incorrect_spans_key=None
+    ):
+        self.cfg = {"neg_key": incorrect_spans_key}
         self.mem = Pool()
         self.strings = string_table
         self.n_moves = 0
@@ -49,8 +56,13 @@ cdef class TransitionSystem:
         self.del_beam_state = _del_state

     def __reduce__(self):
+        # TODO: This loses the 'cfg'
         return (self.__class__, (self.strings, self.labels), None, None)

+    @property
+    def neg_key(self):
+        return self.cfg.get("neg_key")
+
     def init_batch(self, docs):
         cdef StateClass state
         states = []
@@ -220,16 +232,21 @@ cdef class TransitionSystem:
         transitions = []
         serializers = {
             'moves': lambda: srsly.json_dumps(self.labels),
-            'strings': lambda: self.strings.to_bytes()
+            'strings': lambda: self.strings.to_bytes(),
+            'cfg': lambda: self.cfg
         }
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, bytes_data, exclude=tuple()):
+        # We're adding a new field, 'cfg', here and we don't want to break
+        # previous models that don't have it.
+        msg = srsly.msgpack_loads(bytes_data)
         labels = {}
-        deserializers = {
-            'moves': lambda b: labels.update(srsly.json_loads(b)),
-            'strings': lambda b: self.strings.from_bytes(b)
-        }
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        if 'moves' not in exclude:
+            labels.update(srsly.json_loads(msg['moves']))
+        if 'strings' not in exclude:
+            self.strings.from_bytes(msg['strings'])
+        if 'cfg' not in exclude and 'cfg' in msg:
+            self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self

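The new from_bytes reads the msgpack payload directly so that older serialized transition systems without a 'cfg' entry still load. A hedged, stand-alone sketch of that backward-compatible pattern; the `read_cfg_compat` helper is illustrative, not spaCy API:

    import srsly

    def read_cfg_compat(cfg, bytes_data, exclude=()):
        # Older payloads have no "cfg" entry; only update when it is present.
        msg = srsly.msgpack_loads(bytes_data)
        if "cfg" not in exclude and "cfg" in msg:
            cfg.update(msg["cfg"])
        return cfg
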
@@ -106,7 +106,7 @@ class AttributeRuler(Pipe):

     def match(self, doc: Doc):
         matches = self.matcher(doc, allow_missing=True)
-        # Sort by the attribute ID, so that later rules have precendence
+        # Sort by the attribute ID, so that later rules have precedence
         matches = [
             (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
         ]

@@ -3,6 +3,7 @@ from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

+from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser cimport Parser
 from ._parser_internals.arc_eager cimport ArcEager

@@ -59,7 +60,7 @@ def make_parser(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
     min_action_freq: int
@@ -85,13 +86,13 @@ def make_parser(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (List[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     learn_tokens (bool): Whether to learn to merge subtokens that are split
         relative to the gold standard. Experimental.
     min_action_freq (int): The minimum frequency of labelled actions to retain.
@@ -112,6 +113,9 @@ def make_parser(
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,
+        # At some point in the future we can try to implement support for
+        # partial annotations, perhaps only in the beam objective.
+        incorrect_spans_key=None
     )

 @Language.factory(
@@ -140,7 +144,7 @@ def make_beam_parser(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
     min_action_freq: int,
@@ -165,8 +169,13 @@ def make_beam_parser(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (List[str]): A list of transition names. Inferred from the data if not
-        provided.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     beam_width (int): The number of candidate analyses to maintain.
     beam_density (float): The minimum ratio between the scores of the first and
         last candidates in the beam. This allows the parser to avoid exploring
@@ -195,7 +204,10 @@ def make_beam_parser(
         beam_update_prob=beam_update_prob,
         multitasks=[],
         learn_tokens=learn_tokens,
-        min_action_freq=min_action_freq
+        min_action_freq=min_action_freq,
+        # At some point in the future we can try to implement support for
+        # partial annotations, perhaps only in the beam objective.
+        incorrect_spans_key=None
     )


@@ -206,6 +218,39 @@ cdef class DependencyParser(Parser):
     """
     TransitionSystem = ArcEager

+    def __init__(
+        self,
+        vocab,
+        model,
+        name="parser",
+        moves=None,
+        *,
+        update_with_oracle_cut_size=100,
+        min_action_freq=30,
+        learn_tokens=False,
+        beam_width=1,
+        beam_density=0.0,
+        beam_update_prob=0.0,
+        multitasks=tuple(),
+        incorrect_spans_key=None,
+    ):
+        """Create a DependencyParser.
+        """
+        super().__init__(
+            vocab,
+            model,
+            name,
+            moves,
+            update_with_oracle_cut_size=update_with_oracle_cut_size,
+            min_action_freq=min_action_freq,
+            learn_tokens=learn_tokens,
+            beam_width=beam_width,
+            beam_density=beam_density,
+            beam_update_prob=beam_update_prob,
+            multitasks=multitasks,
+            incorrect_spans_key=incorrect_spans_key,
+        )
+
     @property
     def postprocesses(self):
         output = [nonproj.deprojectivize]

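With `moves` typed as an optional TransitionSystem, passing None lets the component build its own ArcEager instance. A hedged usage sketch, using the "parser" factory registered above:

    import spacy

    nlp = spacy.blank("en")
    parser = nlp.add_pipe("parser")   # moves=None: an ArcEager system is created internally
    print(type(parser).__name__)      # DependencyParser
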
@@ -9,7 +9,7 @@ import warnings

 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
-from ..tokens import Doc
+from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
     entity_vector_length: int,
-    get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.

@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
         incl_prior: bool,
         incl_context: bool,
         entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.

@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
         entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.

         DOCS: https://spacy.io/api/entitylinker#init
@@ -142,7 +142,7 @@ class EntityLinker(TrainablePipe):
         self.get_candidates = get_candidates
         self.cfg = {}
         self.distance = CosineDistance(normalize=False)
-        # how many neightbour sentences to take into account
+        # how many neighbour sentences to take into account
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)

@@ -156,6 +156,8 @@ class EntityLinker(TrainablePipe):

     def validate_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
+        if self.kb is None:
+            raise ValueError(Errors.E1018.format(name=self.name))
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

@@ -305,11 +307,9 @@ class EntityLinker(TrainablePipe):
                     sent = ent.sent
                     sent_index = sentences.index(sent)
                     assert sent_index >= 0
-                    # get n_neightbour sentences, clipped to the length of the document
+                    # get n_neighbour sentences, clipped to the length of the document
                     start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(
-                        len(sentences) - 1, sent_index + self.n_sents
-                    )
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
                     start_token = sentences[start_sentence].start
                     end_token = sentences[end_sentence].end
                     sent_doc = doc[start_token:end_token].as_doc()
@@ -335,22 +335,16 @@ class EntityLinker(TrainablePipe):
                     else:
                         random.shuffle(candidates)
                         # set all prior probabilities to 0 if incl_prior=False
-                        prior_probs = xp.asarray(
-                            [c.prior_prob for c in candidates]
-                        )
+                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
                         if not self.incl_prior:
-                            prior_probs = xp.asarray(
-                                [0.0 for _ in candidates]
-                            )
+                            prior_probs = xp.asarray([0.0 for _ in candidates])
                         scores = prior_probs
                         # add in similarity from the context
                         if self.incl_context:
                             entity_encodings = xp.asarray(
                                 [c.entity_vector for c in candidates]
                             )
-                            entity_norm = xp.linalg.norm(
-                                entity_encodings, axis=1
-                            )
+                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
                             if len(entity_encodings) != len(prior_probs):
                                 raise RuntimeError(
                                     Errors.E147.format(
@@ -359,14 +353,12 @@ class EntityLinker(TrainablePipe):
                                     )
                                 )
                             # cosine similarity
-                            sims = xp.dot(
-                                entity_encodings, sentence_encoding_t
-                            ) / (sentence_norm * entity_norm)
+                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+                                sentence_norm * entity_norm
+                            )
                             if sims.shape != prior_probs.shape:
                                 raise ValueError(Errors.E161)
-                            scores = (
-                                prior_probs + sims - (prior_probs * sims)
-                            )
+                            scores = prior_probs + sims - (prior_probs * sims)
                         # TODO: thresholding
                         best_index = scores.argmax().item()
                         best_candidate = candidates[best_index]
@@ -408,6 +400,48 @@ class EntityLinker(TrainablePipe):
         validate_examples(examples, "EntityLinker.score")
         return Scorer.score_links(examples, negative_labels=[self.NIL])

+    def to_bytes(self, *, exclude=tuple()):
+        """Serialize the pipe to a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
+
+        DOCS: https://spacy.io/api/entitylinker#to_bytes
+        """
+        self._validate_serialization_attrs()
+        serialize = {}
+        if hasattr(self, "cfg") and self.cfg is not None:
+            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["vocab"] = self.vocab.to_bytes
+        serialize["kb"] = self.kb.to_bytes
+        serialize["model"] = self.model.to_bytes
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, *, exclude=tuple()):
+        """Load the pipe from a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (TrainablePipe): The loaded object.
+
+        DOCS: https://spacy.io/api/entitylinker#from_bytes
+        """
+        self._validate_serialization_attrs()
+
+        def load_model(b):
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149) from None
+
+        deserialize = {}
+        if hasattr(self, "cfg") and self.cfg is not None:
+            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
+        deserialize["model"] = load_model
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
@@ -439,7 +473,8 @@ class EntityLinker(TrainablePipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None

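The added to_bytes/from_bytes mirror the existing to_disk/from_disk and serialize cfg, vocab, KB and model together. A hedged sketch of a byte-level round trip; it assumes `nlp` and `nlp2` are compatible pipelines that already contain an initialized entity_linker component:

    # Assumed: nlp and nlp2 both hold an initialized "entity_linker" pipe.
    linker_bytes = nlp.get_pipe("entity_linker").to_bytes()
    nlp2.get_pipe("entity_linker").from_bytes(linker_bytes)
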
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
@@ -6,7 +7,7 @@ import srsly
 from .pipe import Pipe
 from ..training import Example
 from ..language import Language
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
@@ -101,17 +102,12 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -144,7 +140,10 @@ class EntityRuler(Pipe):
             error_handler(self.name, self, [doc], e)

     def match(self, doc: Doc):
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        self._require_patterns()
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
         )
@@ -278,9 +277,7 @@ class EntityRuler(Pipe):
                     if self == pipe:
                         current_index = i
                         break
-                subsequent_pipes = [
-                    pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
-                ]
+                subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
             except ValueError:
                 subsequent_pipes = []
             with self.nlp.select_pipes(disable=subsequent_pipes):
@@ -301,7 +298,7 @@ class EntityRuler(Pipe):
                 self.nlp.pipe(phrase_pattern_texts),
                 phrase_pattern_ids,
             ):
-                phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id}
+                phrase_pattern = {"label": label, "pattern": pattern}
                 if ent_id:
                     phrase_pattern["id"] = ent_id
                 phrase_patterns.append(phrase_pattern)
@@ -315,20 +312,27 @@ class EntityRuler(Pipe):
                 pattern = entry["pattern"]
                 if isinstance(pattern, Doc):
                     self.phrase_patterns[label].append(pattern)
+                    self.phrase_matcher.add(label, [pattern])
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
+                    self.matcher.add(label, [pattern])
                 else:
                     raise ValueError(Errors.E097.format(pattern=pattern))
-            for label, patterns in self.token_patterns.items():
-                self.matcher.add(label, patterns)
-            for label, patterns in self.phrase_patterns.items():
-                self.phrase_matcher.add(label, patterns)

     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )
+
+    def _require_patterns(self) -> None:
+        """Raise a warning if this component has no patterns defined."""
+        if len(self) == 0:
+            warnings.warn(Warnings.W036.format(name=self.name))

     def _split_label(self, label: str) -> Tuple[str, str]:
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
@@ -374,10 +378,9 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         else:
             self.add_patterns(cfg)
@@ -428,10 +431,9 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             from_disk(path, deserializers_patterns, {})
         return self

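Patterns are now pushed into the matchers as they are added, and clear() rebuilds empty matchers, so the ruler can be reset and refilled at any time. A hedged usage sketch with an illustrative pattern:

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
    doc = nlp("spaCy is written in Python")
    print([(ent.text, ent.label_) for ent in doc.ents])  # [('spaCy', 'ORG')]
    ruler.clear()   # removes all patterns and resets both matchers
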
@@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
 from pathlib import Path

+import warnings
+
 from .pipe import Pipe
 from ..errors import Errors, Warnings
 from ..language import Language
@@ -182,7 +184,7 @@ class Lemmatizer(Pipe):
         univ_pos = token.pos_.lower()
         if univ_pos in ("", "eol", "space"):
             if univ_pos == "":
-                logger.warning(Warnings.W108.format(text=string))
+                warnings.warn(Warnings.W108.format(text=string))
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(token):

@@ -3,6 +3,7 @@ from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

+from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown

@@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "moves": None,
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
+        "incorrect_spans_key": None
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},

@@ -48,8 +50,9 @@ def make_ner(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
+    incorrect_spans_key: Optional[str]=None
 ):
     """Create a transition-based EntityRecognizer component. The entity recognizer
     identifies non-overlapping labelled spans of tokens.
@@ -67,13 +70,16 @@ def make_ner(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (list[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
+    incorrect_spans_key (Optional[str]): Identifies spans that are known
+        to be incorrect entity annotations. The incorrect entity annotations
+        can be stored in the span group, under this key.
     """
     return EntityRecognizer(
         nlp.vocab,
@@ -81,9 +87,8 @@ def make_ner(
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
+        incorrect_spans_key=incorrect_spans_key,
         multitasks=[],
-        min_action_freq=1,
-        learn_tokens=False,
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,
@@ -98,7 +103,8 @@ def make_ner(
         "model": DEFAULT_NER_MODEL,
         "beam_density": 0.01,
         "beam_update_prob": 0.5,
-        "beam_width": 32
+        "beam_width": 32,
+        "incorrect_spans_key": None
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
@@ -106,11 +112,12 @@ def make_beam_ner(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
+    incorrect_spans_key: Optional[str]=None
 ):
     """Create a transition-based EntityRecognizer component that uses beam-search.
     The entity recognizer identifies non-overlapping labelled spans of tokens.
@@ -128,13 +135,13 @@ def make_beam_ner(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (list[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     beam_width (int): The number of candidate analyses to maintain.
     beam_density (float): The minimum ratio between the scores of the first and
         last candidates in the beam. This allows the parser to avoid exploring
@@ -144,6 +151,8 @@ def make_beam_ner(
     beam_update_prob (float): The chance of making a beam update, instead of a
         greedy update. Greedy updates are an approximation for the beam updates,
         and are faster to compute.
+    incorrect_spans_key (Optional[str]): Optional key into span groups of
+        entities known to be non-entities.
     """
     return EntityRecognizer(
         nlp.vocab,
@@ -152,11 +161,10 @@ def make_beam_ner(
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
         multitasks=[],
-        min_action_freq=1,
-        learn_tokens=False,
         beam_width=beam_width,
         beam_density=beam_density,
         beam_update_prob=beam_update_prob,
+        incorrect_spans_key=incorrect_spans_key
     )


@@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser):
     """
     TransitionSystem = BiluoPushDown

+    def __init__(
+        self,
+        vocab,
+        model,
+        name="ner",
+        moves=None,
+        *,
+        update_with_oracle_cut_size=100,
+        beam_width=1,
+        beam_density=0.0,
+        beam_update_prob=0.0,
+        multitasks=tuple(),
+        incorrect_spans_key=None,
+    ):
+        """Create an EntityRecognizer.
+        """
+        super().__init__(
+            vocab,
+            model,
+            name,
+            moves,
+            update_with_oracle_cut_size=update_with_oracle_cut_size,
+            min_action_freq=1,  # not relevant for NER
+            learn_tokens=False,  # not relevant for NER
+            beam_width=beam_width,
+            beam_density=beam_density,
+            beam_update_prob=beam_update_prob,
+            multitasks=multitasks,
+            incorrect_spans_key=incorrect_spans_key,
+        )
+
     def add_multitask_objective(self, mt_component):
         """Register another component as a multi-task objective. Experimental."""
         self._multitasks.append(mt_component)

412 spacy/pipeline/spancat.py Normal file

@@ -0,0 +1,412 @@
import numpy
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d

from ..scorer import Scorer
from ..language import Language
from .trainable_pipe import TrainablePipe
from ..tokens import Doc, SpanGroup, Span
from ..vocab import Vocab
from ..training import Example, validate_examples
from ..errors import Errors
from ..util import registry


spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"
scorer = {"@layers": "spacy.LinearLogistic.v1"}

[model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
"""

DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
@registry.misc("spacy.ngram_suggester.v1")
def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
    """Suggest all spans of the given lengths. Spans are returned as a ragged
    array of integers. The array has two columns, indicating the start and end
    position."""

    def ngram_suggester(docs: List[Doc], *, ops: Optional[Ops] = None) -> Ragged:
        if ops is None:
            ops = get_current_ops()
        spans = []
        lengths = []
        for doc in docs:
            starts = ops.xp.arange(len(doc), dtype="i")
            starts = starts.reshape((-1, 1))
            length = 0
            for size in sizes:
                if size <= len(doc):
                    starts_size = starts[: len(doc) - (size - 1)]
                    spans.append(ops.xp.hstack((starts_size, starts_size + size)))
                    length += spans[-1].shape[0]
                if spans:
                    assert spans[-1].ndim == 2, spans[-1].shape
            lengths.append(length)
        if len(spans) > 0:
            output = Ragged(ops.xp.vstack(spans), ops.asarray(lengths, dtype="i"))
        else:
            output = Ragged(ops.xp.zeros((0, 0)), ops.asarray(lengths, dtype="i"))

        assert output.dataXd.ndim == 2
        return output

    return ngram_suggester
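The suggester registered as "spacy.ngram_suggester.v1" simply enumerates every n-gram of the requested sizes, per doc, as (start, end) token offsets. A small usage sketch (assumes this module ships with the installed spaCy; the expected values in the comments follow from the code above):

import spacy
from spacy.util import registry

nlp = spacy.blank("en")
doc = nlp("London is big")
suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
candidates = suggester([doc])
print(candidates.dataXd)   # (start, end) rows: [0 1] [1 2] [2 3] [0 2] [1 3]
print(candidates.lengths)  # [5] -> five candidate spans for the single doc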
@Language.factory(
    "spancat",
    assigns=["doc.spans"],
    default_config={
        "threshold": 0.5,
        "spans_key": "sc",
        "max_positive": None,
        "model": DEFAULT_SPANCAT_MODEL,
        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
    },
    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
def make_spancat(
    nlp: Language,
    name: str,
    suggester: Callable[[List[Doc]], Ragged],
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    threshold: float = 0.5,
    max_positive: Optional[int] = None,
) -> "SpanCategorizer":
    """Create a SpanCategorizer component. The span categorizer consists of two
    parts: a suggester function that proposes candidate spans, and a labeller
    model that predicts one or more labels for each span.

    suggester (Callable[List[Doc], Ragged]): A function that suggests spans.
        Spans are returned as a ragged array with two integer columns, for the
        start and end positions.
    model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
        is given a list of documents and (start, end) indices representing
        candidate span offsets. The model predicts a probability for each category
        for each span.
    spans_key (str): Key of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
    threshold (float): Minimum probability to consider a prediction positive.
        Spans with a positive prediction will be saved on the Doc. Defaults to
        0.5.
    max_positive (Optional[int]): Maximum number of labels to consider positive
        per span. Defaults to None, indicating no limit.
    """
    return SpanCategorizer(
        nlp.vocab,
        suggester=suggester,
        model=model,
        spans_key=spans_key,
        threshold=threshold,
        max_positive=max_positive,
        name=name,
    )
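The factory above is what `nlp.add_pipe("spancat")` resolves to. A hedged end-to-end sketch (assumes spaCy 3.1+ with this component available; the text, character offsets and label are made-up training data):

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat", config={"spans_key": "sc"})
train_data = [("Who is Shaka Khan?", {"spans": {"sc": [(7, 17, "PERSON")]}})]
examples = [Example.from_dict(nlp.make_doc(t), ann) for t, ann in train_data]
nlp.initialize(lambda: examples)  # labels are read off the reference spans
doc = nlp("Who is Shaka Khan?")
print(doc.spans["sc"])            # predictions land under the configured key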
class SpanCategorizer(TrainablePipe):
    """Pipeline component to label spans of text.

    DOCS: https://spacy.io/api/spancategorizer
    """

    def __init__(
        self,
        vocab: Vocab,
        model: Model[Tuple[List[Doc], Ragged], Floats2d],
        suggester: Callable[[List[Doc]], Ragged],
        name: str = "spancat",
        *,
        spans_key: str = "spans",
        threshold: float = 0.5,
        max_positive: Optional[int] = None,
    ) -> None:
        """Initialize the span categorizer.

        DOCS: https://spacy.io/api/spancategorizer#init
        """
        self.cfg = {
            "labels": [],
            "spans_key": spans_key,
            "threshold": threshold,
            "max_positive": max_positive,
        }
        self.vocab = vocab
        self.suggester = suggester
        self.model = model
        self.name = name

    @property
    def key(self) -> str:
        """Key of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
        """
        return self.cfg["spans_key"]
    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.

        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.

        DOCS: https://spacy.io/api/spancategorizer#add_label
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    @property
    def labels(self) -> Tuple[str]:
        """RETURNS (Tuple[str]): The labels currently added to the component.

        DOCS: https://spacy.io/api/spancategorizer#labels
        """
        return tuple(self.cfg["labels"])

    @property
    def label_data(self) -> List[str]:
        """RETURNS (List[str]): Information about the component's labels.

        DOCS: https://spacy.io/api/spancategorizer#label_data
        """
        return list(self.labels)
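For reference, a tiny sketch of the add_label contract documented above (illustrative only; in practice labels are usually read from the training data during initialization):

import spacy

nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat")
assert spancat.add_label("PERSON") == 1   # newly added
assert spancat.add_label("PERSON") == 0   # already present
print(spancat.labels)      # ("PERSON",)
print(spancat.label_data)  # ["PERSON"]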
    def predict(self, docs: Iterable[Doc]):
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The models prediction for each document.

        DOCS: https://spacy.io/api/spancategorizer#predict
        """
        indices = self.suggester(docs, ops=self.model.ops)
        scores = self.model.predict((docs, indices))
        return (indices, scores)

    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        scores: The scores to set, produced by SpanCategorizer.predict.

        DOCS: https://spacy.io/api/spancategorizer#set_annotations
        """
        labels = self.labels
        indices, scores = indices_scores
        offset = 0
        for i, doc in enumerate(docs):
            indices_i = indices[i].dataXd
            doc.spans[self.key] = self._make_span_group(
                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels
            )
            offset += indices.lengths[i]
    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/spancategorizer#update
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "SpanCategorizer.update")
        self._validate_categories(examples)
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        docs = [eg.predicted for eg in examples]
        spans = self.suggester(docs, ops=self.model.ops)
        if spans.lengths.sum() == 0:
            return losses
        set_dropout_rate(self.model, drop)
        scores, backprop_scores = self.model.begin_update((docs, spans))
        loss, d_scores = self.get_loss(examples, (spans, scores))
        backprop_scores(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses
    def get_loss(
        self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Ragged]
    ) -> Tuple[float, float]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Examples]): The batch of examples.
        spans_scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.

        DOCS: https://spacy.io/api/spancategorizer#get_loss
        """
        spans, scores = spans_scores
        spans = Ragged(
            self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
        )
        label_map = {label: i for i, label in enumerate(self.labels)}
        target = numpy.zeros(scores.shape, dtype=scores.dtype)
        offset = 0
        for i, eg in enumerate(examples):
            # Map (start, end) offset of spans to the row in the d_scores array,
            # so that we can adjust the gradient for predictions that were
            # in the gold standard.
            spans_index = {}
            spans_i = spans[i].dataXd
            for j in range(spans.lengths[i]):
                start = int(spans_i[j, 0])
                end = int(spans_i[j, 1])
                spans_index[(start, end)] = offset + j
            for gold_span in self._get_aligned_spans(eg):
                key = (gold_span.start, gold_span.end)
                if key in spans_index:
                    row = spans_index[key]
                    k = label_map[gold_span.label_]
                    target[row, k] = 1.0
            # The target is a flat array for all docs. Track the position
            # we're at within the flat array.
            offset += spans.lengths[i]
        target = self.model.ops.asarray(target, dtype="f")
        # The target will have the values 0 (for untrue predictions) or 1
        # (for true predictions).
        # The scores should be in the range [0, 1].
        # If the prediction is 0.9 and it's true, the gradient
        # will be -0.1 (0.9 - 1.0).
        # If the prediction is 0.9 and it's false, the gradient will be
        # 0.9 (0.9 - 0.0)
        d_scores = scores - target
        loss = float((d_scores ** 2).sum())
        return loss, d_scores
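The comments in get_loss describe a plain squared-error over per-label probabilities. A self-contained numpy check of the same arithmetic (the numbers are illustrative only):

import numpy

scores = numpy.asarray([[0.9, 0.2]])  # model output for one candidate span
target = numpy.asarray([[1.0, 0.0]])  # gold: first label applies, second does not
d_scores = scores - target            # [[-0.1, 0.2]]
loss = float((d_scores ** 2).sum())   # 0.01 + 0.04 = 0.05
print(d_scores, loss)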
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Language = None,
        labels: Optional[List[str]] = None,
    ) -> None:
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels: The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

        DOCS: https://spacy.io/api/spancategorizer#initialize
        """
        subbatch = []
        if labels is not None:
            for label in labels:
                self.add_label(label)
        for eg in get_examples():
            if labels is None:
                for span in eg.reference.spans.get(self.key, []):
                    self.add_label(span.label_)
            if len(subbatch) < 10:
                subbatch.append(eg)
        self._require_labels()
        if subbatch:
            docs = [eg.x for eg in subbatch]
            spans = self.suggester(docs)
            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
            self.model.initialize(X=(docs, spans), Y=Y)
        else:
            self.model.initialize()
    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

        DOCS: https://spacy.io/api/spancategorizer#score
        """
        validate_examples(examples, "SpanCategorizer.score")
        self._validate_categories(examples)
        kwargs = dict(kwargs)
        attr_prefix = "spans_"
        kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
        kwargs.setdefault("labels", self.labels)
        kwargs.setdefault("multi_label", True)
        kwargs.setdefault("threshold", self.cfg["threshold"])
        kwargs.setdefault(
            "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
        )
        kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
        return Scorer.score_spans(examples, **kwargs)

    def _validate_categories(self, examples):
        # TODO
        pass

    def _get_aligned_spans(self, eg: Example):
        return eg.get_aligned_spans_y2x(eg.reference.spans.get(self.key, []))
    def _make_span_group(
        self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
    ) -> SpanGroup:
        spans = SpanGroup(doc, name=self.key)
        max_positive = self.cfg["max_positive"]
        threshold = self.cfg["threshold"]
        for i in range(indices.shape[0]):
            start = int(indices[i, 0])
            end = int(indices[i, 1])
            positives = []
            for j, score in enumerate(scores[i]):
                if score >= threshold:
                    positives.append((score, start, end, labels[j]))
            positives.sort(reverse=True)
            if max_positive:
                positives = positives[:max_positive]
            for score, start, end, label in positives:
                spans.append(Span(doc, start, end, label=label))
        return spans
@@ -35,7 +35,7 @@ maxout_pieces = 3
 depth = 2

 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -44,7 +44,7 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["model"]

 single_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -52,7 +52,7 @@ no_output_layer = false

 single_label_cnn_config = """
 [model]
-@architectures = "spacy.TextCatCNN.v1"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true

 [model.tok2vec]
@@ -298,6 +298,10 @@ class TextCategorizer(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
+        if self.model and "resize_output" in self.model.attrs:
+            self.model = self.model.attrs["resize_output"](
+                self.model, len(self.cfg["labels"])
+            )
         self.vocab.strings.add(label)
         return 1
@@ -332,6 +336,8 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
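The hunks above move the single-label textcat defaults to the v2 BOW/CNN architectures, let add_label grow the output layer through the model's resize_output attribute, and require at least two labels at initialization (E867). A hedged sketch of requesting the v2 architecture explicitly (assumes spaCy 3.1+; the label names are illustrative):

import spacy

config = {
    "model": {
        "@architectures": "spacy.TextCatBOW.v2",
        "exclusive_classes": True,
        "ngram_size": 1,
        "no_output_layer": False,
    }
}
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat", config=config)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")  # at least two labels are required at initialization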
@@ -35,7 +35,7 @@ maxout_pieces = 3
 depth = 2

 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false
@@ -44,7 +44,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["model"]

 multi_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v1"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false
@@ -52,7 +52,7 @@ no_output_layer = false

 multi_label_cnn_config = """
 [model]
-@architectures = "spacy.TextCatCNN.v1"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false

 [model.tok2vec]
Some files were not shown because too many files have changed in this diff.