mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge pull request #9612 from adrianeboyd/chore/switch-to-master-v3.2.0
Switch v3.2.0 to master
This commit is contained in:
commit
2bf52c44b1
10
.github/azure-steps.yml
vendored
10
.github/azure-steps.yml
vendored
|
@ -65,8 +65,11 @@ steps:
|
||||||
condition: eq(${{ parameters.gpu }}, true)
|
condition: eq(${{ parameters.gpu }}, true)
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m spacy download ca_core_news_sm
|
#python -m spacy download ca_core_news_sm
|
||||||
python -m spacy download ca_core_news_md
|
#python -m spacy download ca_core_news_md
|
||||||
|
# temporarily install the v3.1.0 models
|
||||||
|
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl
|
||||||
|
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl
|
||||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||||
displayName: 'Test download CLI'
|
displayName: 'Test download CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
@ -95,7 +98,8 @@ steps:
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
# temporarily ignore W095
|
||||||
|
PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||||
displayName: 'Test assemble CLI'
|
displayName: 'Test assemble CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
|
|
106
.github/contributors/avi197.md
vendored
Normal file
106
.github/contributors/avi197.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Son Pham |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 09/10/2021 |
|
||||||
|
| GitHub username | Avi197 |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/fgaim.md
vendored
Normal file
106
.github/contributors/fgaim.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Fitsum Gaim |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021-08-07 |
|
||||||
|
| GitHub username | fgaim |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/syrull.md
vendored
Normal file
106
.github/contributors/syrull.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Dimitar Ganev |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 2021/8/2 |
|
||||||
|
| GitHub username | syrull |
|
||||||
|
| Website (optional) | |
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,6 +9,7 @@ keys/
|
||||||
spacy/tests/package/setup.cfg
|
spacy/tests/package/setup.cfg
|
||||||
spacy/tests/package/pyproject.toml
|
spacy/tests/package/pyproject.toml
|
||||||
spacy/tests/package/requirements.txt
|
spacy/tests/package/requirements.txt
|
||||||
|
spacy/tests/universe/universe.json
|
||||||
|
|
||||||
# Website
|
# Website
|
||||||
website/.cache/
|
website/.cache/
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.8,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.12,<8.1.0
|
thinc>=8.0.12,<8.1.0
|
||||||
|
@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||||
jinja2
|
jinja2
|
||||||
|
langcodes>=3.2.0,<4.0.0
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
|
|
|
@ -42,6 +42,7 @@ setup_requires =
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.8,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
|
@ -62,6 +63,7 @@ install_requires =
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
||||||
|
langcodes>=3.2.0,<4.0.0
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
|
@ -69,9 +71,9 @@ console_scripts =
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=1.0.2,<1.1.0
|
spacy_lookups_data>=1.0.3,<1.1.0
|
||||||
transformers =
|
transformers =
|
||||||
spacy_transformers>=1.0.1,<1.2.0
|
spacy_transformers>=1.1.2,<1.2.0
|
||||||
ray =
|
ray =
|
||||||
spacy_ray>=0.1.0,<1.0.0
|
spacy_ray>=0.1.0,<1.0.0
|
||||||
cuda =
|
cuda =
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -81,6 +81,7 @@ COPY_FILES = {
|
||||||
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||||
|
ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.1.4"
|
__version__ = "3.2.0"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
for name, value in stringy_attrs.items():
|
for name, value in stringy_attrs.items():
|
||||||
int_key = intify_attr(name)
|
int_key = intify_attr(name)
|
||||||
if int_key is not None:
|
if int_key is not None:
|
||||||
if strings_map is not None and isinstance(value, basestring):
|
if strings_map is not None and isinstance(value, str):
|
||||||
if hasattr(strings_map, 'add'):
|
if hasattr(strings_map, 'add'):
|
||||||
value = strings_map.add(value)
|
value = strings_map.add(value)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -20,6 +20,7 @@ def init_vectors_cli(
|
||||||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
|
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||||
|
@ -34,7 +35,14 @@ def init_vectors_cli(
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
if jsonl_loc is not None:
|
if jsonl_loc is not None:
|
||||||
update_lexemes(nlp, jsonl_loc)
|
update_lexemes(nlp, jsonl_loc)
|
||||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
convert_vectors(
|
||||||
|
nlp,
|
||||||
|
vectors_loc,
|
||||||
|
truncate=truncate,
|
||||||
|
prune=prune,
|
||||||
|
name=name,
|
||||||
|
mode=mode,
|
||||||
|
)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
msg.good(
|
msg.good(
|
||||||
|
|
|
@ -5,6 +5,7 @@ raw_text = null
|
||||||
max_epochs = 1000
|
max_epochs = 1000
|
||||||
dropout = 0.2
|
dropout = 0.2
|
||||||
n_save_every = null
|
n_save_every = null
|
||||||
|
n_save_epoch = null
|
||||||
component = "tok2vec"
|
component = "tok2vec"
|
||||||
layer = ""
|
layer = ""
|
||||||
corpus = "corpora.pretrain"
|
corpus = "corpora.pretrain"
|
||||||
|
|
|
@ -22,6 +22,9 @@ def setup_default_warnings():
|
||||||
# warn once about lemmatizer without required POS
|
# warn once about lemmatizer without required POS
|
||||||
filter_warning("once", error_msg=Warnings.W108)
|
filter_warning("once", error_msg=Warnings.W108)
|
||||||
|
|
||||||
|
# floret vector table cannot be modified
|
||||||
|
filter_warning("once", error_msg="[W114]")
|
||||||
|
|
||||||
|
|
||||||
def filter_warning(action: str, error_msg: str):
|
def filter_warning(action: str, error_msg: str):
|
||||||
"""Customize how spaCy should handle a certain warning.
|
"""Customize how spaCy should handle a certain warning.
|
||||||
|
@ -186,6 +189,8 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
"vectors are not identical to current pipeline vectors.")
|
"vectors are not identical to current pipeline vectors.")
|
||||||
W114 = ("Using multiprocessing with GPU models is not recommended and may "
|
W114 = ("Using multiprocessing with GPU models is not recommended and may "
|
||||||
"lead to errors.")
|
"lead to errors.")
|
||||||
|
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
|
||||||
|
"Vectors are calculated from character ngrams.")
|
||||||
|
|
||||||
|
|
||||||
class Errors(metaclass=ErrorsWithCodes):
|
class Errors(metaclass=ErrorsWithCodes):
|
||||||
|
@ -277,7 +282,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"you forget to call the `set_extension` method?")
|
"you forget to call the `set_extension` method?")
|
||||||
E047 = ("Can't assign a value to unregistered extension attribute "
|
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||||
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
|
||||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||||
"package or a valid path to a data directory.")
|
"package or a valid path to a data directory.")
|
||||||
E052 = ("Can't find model directory: {path}")
|
E052 = ("Can't find model directory: {path}")
|
||||||
|
@ -511,13 +516,24 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
E866 = ("A SpanGroup is not functional after the corresponding Doc has "
|
E858 = ("The {mode} vector table does not support this operation. "
|
||||||
|
"{alternative}")
|
||||||
|
E859 = ("The floret vector table cannot be modified.")
|
||||||
|
E860 = ("Can't truncate fasttext-bloom vectors.")
|
||||||
|
E861 = ("No 'keys' should be provided when initializing floret vectors "
|
||||||
|
"with 'minn' and 'maxn'.")
|
||||||
|
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
|
||||||
|
E863 = ("'maxn' must be greater than or equal to 'minn'.")
|
||||||
|
E864 = ("The complete vector table 'data' is required to initialize floret "
|
||||||
|
"vectors.")
|
||||||
|
E865 = ("A SpanGroup is not functional after the corresponding Doc has "
|
||||||
"been garbage collected. To keep using the spans, make sure that "
|
"been garbage collected. To keep using the spans, make sure that "
|
||||||
"the corresponding Doc object is still available in the scope of "
|
"the corresponding Doc object is still available in the scope of "
|
||||||
"your function.")
|
"your function.")
|
||||||
|
E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
|
||||||
E867 = ("The 'textcat' component requires at least two labels because it "
|
E867 = ("The 'textcat' component requires at least two labels because it "
|
||||||
"uses mutually exclusive classes where exactly one label is True "
|
"uses mutually exclusive classes where exactly one label is True "
|
||||||
"for each doc. For binary classification tasks, you can use two "
|
"for each doc. For binary classification tasks, you can use two "
|
||||||
|
|
20
spacy/kb.pyx
20
spacy/kb.pyx
|
@ -124,7 +124,7 @@ cdef class KnowledgeBase:
|
||||||
def get_alias_strings(self):
|
def get_alias_strings(self):
|
||||||
return [self.vocab.strings[x] for x in self._alias_index]
|
return [self.vocab.strings[x] for x in self._alias_index]
|
||||||
|
|
||||||
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
|
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||||
"""
|
"""
|
||||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
||||||
Return the hash of the entity ID/name at the end.
|
Return the hash of the entity ID/name at the end.
|
||||||
|
@ -185,15 +185,15 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
def contains_entity(self, unicode entity):
|
def contains_entity(self, str entity):
|
||||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||||
return entity_hash in self._entry_index
|
return entity_hash in self._entry_index
|
||||||
|
|
||||||
def contains_alias(self, unicode alias):
|
def contains_alias(self, str alias):
|
||||||
cdef hash_t alias_hash = self.vocab.strings.add(alias)
|
cdef hash_t alias_hash = self.vocab.strings.add(alias)
|
||||||
return alias_hash in self._alias_index
|
return alias_hash in self._alias_index
|
||||||
|
|
||||||
def add_alias(self, unicode alias, entities, probabilities):
|
def add_alias(self, str alias, entities, probabilities):
|
||||||
"""
|
"""
|
||||||
For a given alias, add its potential entities and prior probabilies to the KB.
|
For a given alias, add its potential entities and prior probabilies to the KB.
|
||||||
Return the alias_hash at the end
|
Return the alias_hash at the end
|
||||||
|
@ -239,7 +239,7 @@ cdef class KnowledgeBase:
|
||||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||||
return alias_hash
|
return alias_hash
|
||||||
|
|
||||||
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
|
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
||||||
"""
|
"""
|
||||||
For an alias already existing in the KB, extend its potential entities with one more.
|
For an alias already existing in the KB, extend its potential entities with one more.
|
||||||
Throw a warning if either the alias or the entity is unknown,
|
Throw a warning if either the alias or the entity is unknown,
|
||||||
|
@ -286,7 +286,7 @@ cdef class KnowledgeBase:
|
||||||
alias_entry.probs = probs
|
alias_entry.probs = probs
|
||||||
self._aliases_table[alias_index] = alias_entry
|
self._aliases_table[alias_index] = alias_entry
|
||||||
|
|
||||||
def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
|
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||||
and the prior probability of that alias resolving to that entity.
|
and the prior probability of that alias resolving to that entity.
|
||||||
|
@ -307,7 +307,7 @@ cdef class KnowledgeBase:
|
||||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||||
if entry_index != 0]
|
if entry_index != 0]
|
||||||
|
|
||||||
def get_vector(self, unicode entity):
|
def get_vector(self, str entity):
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
|
|
||||||
# Return an empty list if this entity is unknown in this KB
|
# Return an empty list if this entity is unknown in this KB
|
||||||
|
@ -317,7 +317,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
return self._vectors_table[self._entries[entry_index].vector_index]
|
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||||
|
|
||||||
def get_prior_prob(self, unicode entity, unicode alias):
|
def get_prior_prob(self, str entity, str alias):
|
||||||
""" Return the prior probability of a given alias being linked to a given entity,
|
""" Return the prior probability of a given alias being linked to a given entity,
|
||||||
or return 0.0 when this combination is not known in the knowledge base"""
|
or return 0.0 when this combination is not known in the knowledge base"""
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
|
@ -587,7 +587,7 @@ cdef class Writer:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
assert isinstance(path, Path)
|
assert isinstance(path, Path)
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
raise IOError(Errors.E146.format(path=path))
|
raise IOError(Errors.E146.format(path=path))
|
||||||
|
@ -629,7 +629,7 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
PyErr_SetFromErrno(IOError)
|
PyErr_SetFromErrno(IOError)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
|
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
_list_punct
|
_list_punct
|
||||||
|
|
|
@ -1,265 +1,79 @@
|
||||||
# Source: https://github.com/Alir3z4/stop-words
|
"""
|
||||||
|
References:
|
||||||
|
https://github.com/Alir3z4/stop-words - Original list, serves as a base.
|
||||||
|
https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
|
||||||
|
"""
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
а
|
а автентичен аз ако ала
|
||||||
автентичен
|
|
||||||
аз
|
бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
|
||||||
ако
|
бъде бъда бяха
|
||||||
ала
|
|
||||||
бе
|
в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
|
||||||
без
|
всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
|
||||||
беше
|
вътре веднъж
|
||||||
би
|
|
||||||
бивш
|
г ги главен главна главно глас го годно година години годишен
|
||||||
бивша
|
|
||||||
бившо
|
д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
|
||||||
бил
|
добро добър достатъчно докато докога дори досега доста друг друга другаде други
|
||||||
била
|
|
||||||
били
|
е евтин едва един една еднаква еднакви еднакъв едно екип ето
|
||||||
било
|
|
||||||
благодаря
|
живот жив
|
||||||
близо
|
|
||||||
бъдат
|
за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
|
||||||
бъде
|
затова запазва започвам защо защото завинаги
|
||||||
бяха
|
|
||||||
в
|
и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
|
||||||
вас
|
извън имайки
|
||||||
ваш
|
|
||||||
ваша
|
й йо
|
||||||
вероятно
|
|
||||||
вече
|
каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
|
||||||
взема
|
когато когото което които кой който колко която къде където към край кратък
|
||||||
ви
|
кръгъл
|
||||||
вие
|
|
||||||
винаги
|
лесен лесно ли летя летиш летим лош
|
||||||
внимава
|
|
||||||
време
|
м май малко макар малцина междувременно минус ме между мек мен месец ми мис
|
||||||
все
|
мисля много мнозина мога могат може мой можем мокър моля момента му
|
||||||
всеки
|
|
||||||
всички
|
н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
|
||||||
всичко
|
например наопаки наполовина напоследък нека независимо нас насам наскоро
|
||||||
всяка
|
настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
|
||||||
във
|
нови новина някои някой някога някъде няколко няма
|
||||||
въпреки
|
|
||||||
върху
|
о обаче около описан опитах опитва опитвайки опитвам определен определено освен
|
||||||
г
|
обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
|
||||||
ги
|
отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
|
||||||
главен
|
относно още
|
||||||
главна
|
|
||||||
главно
|
п пак по повече повечето под поне просто пряко поради после последен последно
|
||||||
глас
|
посочен почти прави прав прави правя пред преди през при пък първата първи първо
|
||||||
го
|
път пъти плюс
|
||||||
година
|
|
||||||
години
|
равен равна различен различни разумен разумно
|
||||||
годишен
|
|
||||||
д
|
с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
|
||||||
да
|
следващ следващия следва следното следователно случва сме смях собствен
|
||||||
дали
|
сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
|
||||||
два
|
съответен съответно сте съм със също
|
||||||
двама
|
|
||||||
двамата
|
т така техен техни такива такъв твърде там трета твой те тези ти то това
|
||||||
две
|
тогава този той търси толкова точно три трябва тук тъй тя тях
|
||||||
двете
|
|
||||||
ден
|
у утре ужасно употреба успоредно уточнен уточняване
|
||||||
днес
|
|
||||||
дни
|
харесва харесали хиляди
|
||||||
до
|
|
||||||
добра
|
ч часа ценя цяло цялостен че често чрез чудя
|
||||||
добре
|
|
||||||
добро
|
ще щеше щом щяха
|
||||||
добър
|
|
||||||
докато
|
|
||||||
докога
|
|
||||||
дори
|
|
||||||
досега
|
|
||||||
доста
|
|
||||||
друг
|
|
||||||
друга
|
|
||||||
други
|
|
||||||
е
|
|
||||||
евтин
|
|
||||||
едва
|
|
||||||
един
|
|
||||||
една
|
|
||||||
еднаква
|
|
||||||
еднакви
|
|
||||||
еднакъв
|
|
||||||
едно
|
|
||||||
екип
|
|
||||||
ето
|
|
||||||
живот
|
|
||||||
за
|
|
||||||
забавям
|
|
||||||
зад
|
|
||||||
заедно
|
|
||||||
заради
|
|
||||||
засега
|
|
||||||
заспал
|
|
||||||
затова
|
|
||||||
защо
|
|
||||||
защото
|
|
||||||
и
|
|
||||||
из
|
|
||||||
или
|
|
||||||
им
|
|
||||||
има
|
|
||||||
имат
|
|
||||||
иска
|
|
||||||
й
|
|
||||||
каза
|
|
||||||
как
|
|
||||||
каква
|
|
||||||
какво
|
|
||||||
както
|
|
||||||
какъв
|
|
||||||
като
|
|
||||||
кога
|
|
||||||
когато
|
|
||||||
което
|
|
||||||
които
|
|
||||||
кой
|
|
||||||
който
|
|
||||||
колко
|
|
||||||
която
|
|
||||||
къде
|
|
||||||
където
|
|
||||||
към
|
|
||||||
лесен
|
|
||||||
лесно
|
|
||||||
ли
|
|
||||||
лош
|
|
||||||
м
|
|
||||||
май
|
|
||||||
малко
|
|
||||||
ме
|
|
||||||
между
|
|
||||||
мек
|
|
||||||
мен
|
|
||||||
месец
|
|
||||||
ми
|
|
||||||
много
|
|
||||||
мнозина
|
|
||||||
мога
|
|
||||||
могат
|
|
||||||
може
|
|
||||||
мокър
|
|
||||||
моля
|
|
||||||
момента
|
|
||||||
му
|
|
||||||
н
|
|
||||||
на
|
|
||||||
над
|
|
||||||
назад
|
|
||||||
най
|
|
||||||
направи
|
|
||||||
напред
|
|
||||||
например
|
|
||||||
нас
|
|
||||||
не
|
|
||||||
него
|
|
||||||
нещо
|
|
||||||
нея
|
|
||||||
ни
|
|
||||||
ние
|
|
||||||
никой
|
|
||||||
нито
|
|
||||||
нищо
|
|
||||||
но
|
|
||||||
нов
|
|
||||||
нова
|
|
||||||
нови
|
|
||||||
новина
|
|
||||||
някои
|
|
||||||
някой
|
|
||||||
няколко
|
|
||||||
няма
|
|
||||||
обаче
|
|
||||||
около
|
|
||||||
освен
|
|
||||||
особено
|
|
||||||
от
|
|
||||||
отгоре
|
|
||||||
отново
|
|
||||||
още
|
|
||||||
пак
|
|
||||||
по
|
|
||||||
повече
|
|
||||||
повечето
|
|
||||||
под
|
|
||||||
поне
|
|
||||||
поради
|
|
||||||
после
|
|
||||||
почти
|
|
||||||
прави
|
|
||||||
пред
|
|
||||||
преди
|
|
||||||
през
|
|
||||||
при
|
|
||||||
пък
|
|
||||||
първата
|
|
||||||
първи
|
|
||||||
първо
|
|
||||||
пъти
|
|
||||||
равен
|
|
||||||
равна
|
|
||||||
с
|
|
||||||
са
|
|
||||||
сам
|
|
||||||
само
|
|
||||||
се
|
|
||||||
сега
|
|
||||||
си
|
|
||||||
син
|
|
||||||
скоро
|
|
||||||
след
|
|
||||||
следващ
|
|
||||||
сме
|
|
||||||
смях
|
|
||||||
според
|
|
||||||
сред
|
|
||||||
срещу
|
|
||||||
сте
|
|
||||||
съм
|
|
||||||
със
|
|
||||||
също
|
|
||||||
т
|
|
||||||
тази
|
|
||||||
така
|
|
||||||
такива
|
|
||||||
такъв
|
|
||||||
там
|
|
||||||
твой
|
|
||||||
те
|
|
||||||
тези
|
|
||||||
ти
|
|
||||||
т.н.
|
|
||||||
то
|
|
||||||
това
|
|
||||||
тогава
|
|
||||||
този
|
|
||||||
той
|
|
||||||
толкова
|
|
||||||
точно
|
|
||||||
три
|
|
||||||
трябва
|
|
||||||
тук
|
|
||||||
тъй
|
|
||||||
тя
|
|
||||||
тях
|
|
||||||
у
|
|
||||||
утре
|
|
||||||
харесва
|
|
||||||
хиляди
|
|
||||||
ч
|
|
||||||
часа
|
|
||||||
че
|
|
||||||
често
|
|
||||||
чрез
|
|
||||||
ще
|
|
||||||
щом
|
|
||||||
юмрук
|
юмрук
|
||||||
я
|
|
||||||
як
|
я як
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,10 +1,16 @@
|
||||||
|
"""
|
||||||
|
References:
|
||||||
|
https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
|
||||||
|
(countries, occupations, fields of studies and more).
|
||||||
|
"""
|
||||||
|
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
# measurements
|
||||||
_abbr_exc = [
|
for abbr in [
|
||||||
{ORTH: "м", NORM: "метър"},
|
{ORTH: "м", NORM: "метър"},
|
||||||
{ORTH: "мм", NORM: "милиметър"},
|
{ORTH: "мм", NORM: "милиметър"},
|
||||||
{ORTH: "см", NORM: "сантиметър"},
|
{ORTH: "см", NORM: "сантиметър"},
|
||||||
|
@ -17,51 +23,191 @@ _abbr_exc = [
|
||||||
{ORTH: "хл", NORM: "хектолиър"},
|
{ORTH: "хл", NORM: "хектолиър"},
|
||||||
{ORTH: "дкл", NORM: "декалитър"},
|
{ORTH: "дкл", NORM: "декалитър"},
|
||||||
{ORTH: "л", NORM: "литър"},
|
{ORTH: "л", NORM: "литър"},
|
||||||
]
|
]:
|
||||||
for abbr in _abbr_exc:
|
|
||||||
_exc[abbr[ORTH]] = [abbr]
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
_abbr_line_exc = [
|
# line abbreviations
|
||||||
|
for abbr in [
|
||||||
{ORTH: "г-жа", NORM: "госпожа"},
|
{ORTH: "г-жа", NORM: "госпожа"},
|
||||||
{ORTH: "г-н", NORM: "господин"},
|
{ORTH: "г-н", NORM: "господин"},
|
||||||
{ORTH: "г-ца", NORM: "госпожица"},
|
{ORTH: "г-ца", NORM: "госпожица"},
|
||||||
{ORTH: "д-р", NORM: "доктор"},
|
{ORTH: "д-р", NORM: "доктор"},
|
||||||
{ORTH: "о-в", NORM: "остров"},
|
{ORTH: "о-в", NORM: "остров"},
|
||||||
{ORTH: "п-в", NORM: "полуостров"},
|
{ORTH: "п-в", NORM: "полуостров"},
|
||||||
]
|
{ORTH: "с-у", NORM: "срещу"},
|
||||||
|
{ORTH: "в-у", NORM: "върху"},
|
||||||
for abbr in _abbr_line_exc:
|
{ORTH: "м-у", NORM: "между"},
|
||||||
|
]:
|
||||||
_exc[abbr[ORTH]] = [abbr]
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
_abbr_dot_exc = [
|
# foreign language related abbreviations
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "англ.", NORM: "английски"},
|
||||||
|
{ORTH: "ан.", NORM: "английски термин"},
|
||||||
|
{ORTH: "араб.", NORM: "арабски"},
|
||||||
|
{ORTH: "афр.", NORM: "африкански"},
|
||||||
|
{ORTH: "гр.", NORM: "гръцки"},
|
||||||
|
{ORTH: "лат.", NORM: "латински"},
|
||||||
|
{ORTH: "рим.", NORM: "римски"},
|
||||||
|
{ORTH: "старогр.", NORM: "старогръцки"},
|
||||||
|
{ORTH: "староевр.", NORM: "староеврейски"},
|
||||||
|
{ORTH: "фр.", NORM: "френски"},
|
||||||
|
{ORTH: "хол.", NORM: "холандски"},
|
||||||
|
{ORTH: "швед.", NORM: "шведски"},
|
||||||
|
{ORTH: "шотл.", NORM: "шотландски"},
|
||||||
|
{ORTH: "яп.", NORM: "японски"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# profession and academic titles abbreviations
|
||||||
|
for abbr in [
|
||||||
{ORTH: "акад.", NORM: "академик"},
|
{ORTH: "акад.", NORM: "академик"},
|
||||||
{ORTH: "ал.", NORM: "алинея"},
|
|
||||||
{ORTH: "арх.", NORM: "архитект"},
|
{ORTH: "арх.", NORM: "архитект"},
|
||||||
|
{ORTH: "инж.", NORM: "инженер"},
|
||||||
|
{ORTH: "канц.", NORM: "канцлер"},
|
||||||
|
{ORTH: "проф.", NORM: "професор"},
|
||||||
|
{ORTH: "св.", NORM: "свети"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# fields of studies
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "агр.", NORM: "агрономия"},
|
||||||
|
{ORTH: "ав.", NORM: "авиация"},
|
||||||
|
{ORTH: "агр.", NORM: "агрономия"},
|
||||||
|
{ORTH: "археол.", NORM: "археология"},
|
||||||
|
{ORTH: "астр.", NORM: "астрономия"},
|
||||||
|
{ORTH: "геод.", NORM: "геодезия"},
|
||||||
|
{ORTH: "геол.", NORM: "геология"},
|
||||||
|
{ORTH: "геом.", NORM: "геометрия"},
|
||||||
|
{ORTH: "гимн.", NORM: "гимнастика"},
|
||||||
|
{ORTH: "грам.", NORM: "граматика"},
|
||||||
|
{ORTH: "жур.", NORM: "журналистика"},
|
||||||
|
{ORTH: "журн.", NORM: "журналистика"},
|
||||||
|
{ORTH: "зем.", NORM: "земеделие"},
|
||||||
|
{ORTH: "икон.", NORM: "икономика"},
|
||||||
|
{ORTH: "лит.", NORM: "литература"},
|
||||||
|
{ORTH: "мат.", NORM: "математика"},
|
||||||
|
{ORTH: "мед.", NORM: "медицина"},
|
||||||
|
{ORTH: "муз.", NORM: "музика"},
|
||||||
|
{ORTH: "печ.", NORM: "печатарство"},
|
||||||
|
{ORTH: "пол.", NORM: "политика"},
|
||||||
|
{ORTH: "псих.", NORM: "психология"},
|
||||||
|
{ORTH: "соц.", NORM: "социология"},
|
||||||
|
{ORTH: "стат.", NORM: "статистика"},
|
||||||
|
{ORTH: "стил.", NORM: "стилистика"},
|
||||||
|
{ORTH: "топогр.", NORM: "топография"},
|
||||||
|
{ORTH: "търг.", NORM: "търговия"},
|
||||||
|
{ORTH: "фарм.", NORM: "фармацевтика"},
|
||||||
|
{ORTH: "фехт.", NORM: "фехтовка"},
|
||||||
|
{ORTH: "физиол.", NORM: "физиология"},
|
||||||
|
{ORTH: "физ.", NORM: "физика"},
|
||||||
|
{ORTH: "фил.", NORM: "философия"},
|
||||||
|
{ORTH: "фин.", NORM: "финанси"},
|
||||||
|
{ORTH: "фолкл.", NORM: "фолклор"},
|
||||||
|
{ORTH: "фон.", NORM: "фонетика"},
|
||||||
|
{ORTH: "фот.", NORM: "фотография"},
|
||||||
|
{ORTH: "футб.", NORM: "футбол"},
|
||||||
|
{ORTH: "хим.", NORM: "химия"},
|
||||||
|
{ORTH: "хир.", NORM: "хирургия"},
|
||||||
|
{ORTH: "ел.", NORM: "електротехника"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "ал.", NORM: "алинея"},
|
||||||
|
{ORTH: "авт.", NORM: "автоматично"},
|
||||||
|
{ORTH: "адм.", NORM: "администрация"},
|
||||||
|
{ORTH: "арт.", NORM: "артилерия"},
|
||||||
{ORTH: "бл.", NORM: "блок"},
|
{ORTH: "бл.", NORM: "блок"},
|
||||||
{ORTH: "бр.", NORM: "брой"},
|
{ORTH: "бр.", NORM: "брой"},
|
||||||
{ORTH: "бул.", NORM: "булевард"},
|
{ORTH: "бул.", NORM: "булевард"},
|
||||||
|
{ORTH: "букв.", NORM: "буквално"},
|
||||||
{ORTH: "в.", NORM: "век"},
|
{ORTH: "в.", NORM: "век"},
|
||||||
|
{ORTH: "вр.", NORM: "време"},
|
||||||
|
{ORTH: "вм.", NORM: "вместо"},
|
||||||
|
{ORTH: "воен.", NORM: "военен термин"},
|
||||||
{ORTH: "г.", NORM: "година"},
|
{ORTH: "г.", NORM: "година"},
|
||||||
{ORTH: "гр.", NORM: "град"},
|
{ORTH: "гр.", NORM: "град"},
|
||||||
|
{ORTH: "гл.", NORM: "глагол"},
|
||||||
|
{ORTH: "др.", NORM: "други"},
|
||||||
|
{ORTH: "ез.", NORM: "езеро"},
|
||||||
{ORTH: "ж.р.", NORM: "женски род"},
|
{ORTH: "ж.р.", NORM: "женски род"},
|
||||||
{ORTH: "инж.", NORM: "инженер"},
|
{ORTH: "жп.", NORM: "железопът"},
|
||||||
|
{ORTH: "застр.", NORM: "застрахователно дело"},
|
||||||
|
{ORTH: "знач.", NORM: "значение"},
|
||||||
|
{ORTH: "и др.", NORM: "и други"},
|
||||||
|
{ORTH: "и под.", NORM: "и подобни"},
|
||||||
|
{ORTH: "и пр.", NORM: "и прочие"},
|
||||||
|
{ORTH: "изр.", NORM: "изречение"},
|
||||||
|
{ORTH: "изт.", NORM: "източен"},
|
||||||
|
{ORTH: "конкр.", NORM: "конкретно"},
|
||||||
{ORTH: "лв.", NORM: "лев"},
|
{ORTH: "лв.", NORM: "лев"},
|
||||||
|
{ORTH: "л.", NORM: "лице"},
|
||||||
{ORTH: "м.р.", NORM: "мъжки род"},
|
{ORTH: "м.р.", NORM: "мъжки род"},
|
||||||
{ORTH: "мат.", NORM: "математика"},
|
{ORTH: "мин.вр.", NORM: "минало време"},
|
||||||
{ORTH: "мед.", NORM: "медицина"},
|
{ORTH: "мн.ч.", NORM: "множествено число"},
|
||||||
|
{ORTH: "напр.", NORM: "например"},
|
||||||
|
{ORTH: "нар.", NORM: "наречие"},
|
||||||
|
{ORTH: "науч.", NORM: "научен термин"},
|
||||||
|
{ORTH: "непр.", NORM: "неправилно"},
|
||||||
|
{ORTH: "обик.", NORM: "обикновено"},
|
||||||
|
{ORTH: "опред.", NORM: "определение"},
|
||||||
|
{ORTH: "особ.", NORM: "особено"},
|
||||||
|
{ORTH: "ост.", NORM: "остаряло"},
|
||||||
|
{ORTH: "относ.", NORM: "относително"},
|
||||||
|
{ORTH: "отр.", NORM: "отрицателно"},
|
||||||
{ORTH: "пл.", NORM: "площад"},
|
{ORTH: "пл.", NORM: "площад"},
|
||||||
{ORTH: "проф.", NORM: "професор"},
|
{ORTH: "пад.", NORM: "падеж"},
|
||||||
|
{ORTH: "парл.", NORM: "парламентарен"},
|
||||||
|
{ORTH: "погов.", NORM: "поговорка"},
|
||||||
|
{ORTH: "пон.", NORM: "понякога"},
|
||||||
|
{ORTH: "правосл.", NORM: "православен"},
|
||||||
|
{ORTH: "прибл.", NORM: "приблизително"},
|
||||||
|
{ORTH: "прил.", NORM: "прилагателно име"},
|
||||||
|
{ORTH: "пр.", NORM: "прочие"},
|
||||||
{ORTH: "с.", NORM: "село"},
|
{ORTH: "с.", NORM: "село"},
|
||||||
{ORTH: "с.р.", NORM: "среден род"},
|
{ORTH: "с.р.", NORM: "среден род"},
|
||||||
{ORTH: "св.", NORM: "свети"},
|
|
||||||
{ORTH: "сп.", NORM: "списание"},
|
{ORTH: "сп.", NORM: "списание"},
|
||||||
{ORTH: "стр.", NORM: "страница"},
|
{ORTH: "стр.", NORM: "страница"},
|
||||||
|
{ORTH: "сз.", NORM: "съюз"},
|
||||||
|
{ORTH: "сег.", NORM: "сегашно"},
|
||||||
|
{ORTH: "сп.", NORM: "спорт"},
|
||||||
|
{ORTH: "срв.", NORM: "сравни"},
|
||||||
|
{ORTH: "с.ст.", NORM: "селскостопанска техника"},
|
||||||
|
{ORTH: "счет.", NORM: "счетоводство"},
|
||||||
|
{ORTH: "съкр.", NORM: "съкратено"},
|
||||||
|
{ORTH: "съобщ.", NORM: "съобщение"},
|
||||||
|
{ORTH: "същ.", NORM: "съществително"},
|
||||||
|
{ORTH: "текст.", NORM: "текстилен"},
|
||||||
|
{ORTH: "телев.", NORM: "телевизия"},
|
||||||
|
{ORTH: "тел.", NORM: "телефон"},
|
||||||
|
{ORTH: "т.е.", NORM: "тоест"},
|
||||||
|
{ORTH: "т.н.", NORM: "така нататък"},
|
||||||
|
{ORTH: "т.нар.", NORM: "така наречен"},
|
||||||
|
{ORTH: "търж.", NORM: "тържествено"},
|
||||||
{ORTH: "ул.", NORM: "улица"},
|
{ORTH: "ул.", NORM: "улица"},
|
||||||
|
{ORTH: "уч.", NORM: "училище"},
|
||||||
|
{ORTH: "унив.", NORM: "университет"},
|
||||||
|
{ORTH: "харт.", NORM: "хартия"},
|
||||||
|
{ORTH: "хидр.", NORM: "хидравлика"},
|
||||||
|
{ORTH: "хран.", NORM: "хранителна"},
|
||||||
|
{ORTH: "църк.", NORM: "църковен термин"},
|
||||||
|
{ORTH: "числ.", NORM: "числително"},
|
||||||
{ORTH: "чл.", NORM: "член"},
|
{ORTH: "чл.", NORM: "член"},
|
||||||
]
|
{ORTH: "ч.", NORM: "число"},
|
||||||
|
{ORTH: "числ.", NORM: "числително"},
|
||||||
for abbr in _abbr_dot_exc:
|
{ORTH: "шахм.", NORM: "шахмат"},
|
||||||
|
{ORTH: "шах.", NORM: "шахмат"},
|
||||||
|
{ORTH: "юр.", NORM: "юридически"},
|
||||||
|
]:
|
||||||
_exc[abbr[ORTH]] = [abbr]
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
# slash abbreviations
|
||||||
|
for abbr in [
|
||||||
|
{ORTH: "м/у", NORM: "между"},
|
||||||
|
{ORTH: "с/у", NORM: "срещу"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
|
@ -23,13 +23,25 @@ class Bengali(Language):
|
||||||
@Bengali.factory(
|
@Bengali.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return Lemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Bengali"]
|
__all__ = ["Bengali"]
|
||||||
|
|
23
spacy/lang/ca/__init__.py
Normal file → Executable file
23
spacy/lang/ca/__init__.py
Normal file → Executable file
|
@ -1,9 +1,9 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
@ -15,6 +15,7 @@ class CatalanDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
@ -28,13 +29,25 @@ class Catalan(Language):
|
||||||
@Catalan.factory(
|
@Catalan.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return CatalanLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Catalan"]
|
__all__ = ["Catalan"]
|
||||||
|
|
11
spacy/lang/ca/punctuation.py
Normal file → Executable file
11
spacy/lang/ca/punctuation.py
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||||
|
from ..char_classes import LIST_CURRENCY
|
||||||
from ..char_classes import CURRENCY
|
from ..char_classes import CURRENCY
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||||
from ..char_classes import merge_chars, _units
|
from ..char_classes import merge_chars, _units
|
||||||
|
@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
_prefixes = (
|
||||||
|
["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
|
||||||
|
+ LIST_PUNCT
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ LIST_QUOTES
|
||||||
|
+ LIST_CURRENCY
|
||||||
|
+ LIST_ICONS
|
||||||
|
)
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
LIST_ELLIPSES
|
LIST_ELLIPSES
|
||||||
|
@ -18,6 +27,7 @@ _infixes = (
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
|
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
|
||||||
|
r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -44,3 +54,4 @@ _suffixes = (
|
||||||
|
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
|
|
21
spacy/lang/ca/tokenizer_exceptions.py
Normal file → Executable file
21
spacy/lang/ca/tokenizer_exceptions.py
Normal file → Executable file
|
@ -18,12 +18,21 @@ for exc_data in [
|
||||||
{ORTH: "nov.", NORM: "novembre"},
|
{ORTH: "nov.", NORM: "novembre"},
|
||||||
{ORTH: "dec.", NORM: "desembre"},
|
{ORTH: "dec.", NORM: "desembre"},
|
||||||
{ORTH: "Dr.", NORM: "doctor"},
|
{ORTH: "Dr.", NORM: "doctor"},
|
||||||
|
{ORTH: "Dra.", NORM: "doctora"},
|
||||||
{ORTH: "Sr.", NORM: "senyor"},
|
{ORTH: "Sr.", NORM: "senyor"},
|
||||||
{ORTH: "Sra.", NORM: "senyora"},
|
{ORTH: "Sra.", NORM: "senyora"},
|
||||||
{ORTH: "Srta.", NORM: "senyoreta"},
|
{ORTH: "Srta.", NORM: "senyoreta"},
|
||||||
{ORTH: "núm", NORM: "número"},
|
{ORTH: "núm", NORM: "número"},
|
||||||
{ORTH: "St.", NORM: "sant"},
|
{ORTH: "St.", NORM: "sant"},
|
||||||
{ORTH: "Sta.", NORM: "santa"},
|
{ORTH: "Sta.", NORM: "santa"},
|
||||||
|
{ORTH: "pl.", NORM: "plaça"},
|
||||||
|
{ORTH: "à."},
|
||||||
|
{ORTH: "è."},
|
||||||
|
{ORTH: "é."},
|
||||||
|
{ORTH: "í."},
|
||||||
|
{ORTH: "ò."},
|
||||||
|
{ORTH: "ó."},
|
||||||
|
{ORTH: "ú."},
|
||||||
{ORTH: "'l"},
|
{ORTH: "'l"},
|
||||||
{ORTH: "'ls"},
|
{ORTH: "'ls"},
|
||||||
{ORTH: "'m"},
|
{ORTH: "'m"},
|
||||||
|
@ -34,6 +43,18 @@ for exc_data in [
|
||||||
]:
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
|
||||||
|
_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
|
||||||
|
|
||||||
|
_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
|
||||||
|
_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
|
||||||
|
|
||||||
|
_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
|
||||||
|
_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
|
||||||
|
|
||||||
|
_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
|
||||||
|
|
||||||
|
|
||||||
# Times
|
# Times
|
||||||
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
|
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
@ -28,13 +28,25 @@ class Greek(Language):
|
||||||
@Greek.factory(
|
@Greek.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return GreekLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Greek"]
|
__all__ = ["Greek"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
@ -26,13 +26,25 @@ class English(Language):
|
||||||
@English.factory(
|
@English.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return EnglishLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["English"]
|
__all__ = ["English"]
|
||||||
|
|
|
@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
|
||||||
Check whether we're dealing with an uninflected paradigm, so we can
|
Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
avoid lemmatization entirely.
|
avoid lemmatization entirely.
|
||||||
|
|
||||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||||
morphology (dict): The token's morphological features following the
|
morphology (dict): The token's morphological features following the
|
||||||
Universal Dependencies scheme.
|
Universal Dependencies scheme.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
@ -26,13 +26,25 @@ class Spanish(Language):
|
||||||
@Spanish.factory(
|
@Spanish.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return SpanishLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Spanish"]
|
__all__ = ["Spanish"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -26,13 +26,25 @@ class Persian(Language):
|
||||||
@Persian.factory(
|
@Persian.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return Lemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Persian"]
|
__all__ = ["Persian"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
|
@ -31,13 +31,25 @@ class French(Language):
|
||||||
@French.factory(
|
@French.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return FrenchLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["French"]
|
__all__ = ["French"]
|
||||||
|
|
|
@ -1,6 +1,11 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
|
from .lemmatizer import IrishLemmatizer
|
||||||
|
|
||||||
|
|
||||||
class IrishDefaults(BaseDefaults):
|
class IrishDefaults(BaseDefaults):
|
||||||
|
@ -13,4 +18,16 @@ class Irish(Language):
|
||||||
Defaults = IrishDefaults
|
Defaults = IrishDefaults
|
||||||
|
|
||||||
|
|
||||||
|
@Irish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "pos_lookup",
        "overwrite": False,
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
):
    """Build the "lemmatizer" pipeline component for Irish.

    nlp (Language): The pipeline; only its vocab is passed on.
    model (Optional[Model]): Forwarded unchanged to the lemmatizer.
    name (str): The component instance name.
    mode (str): The lemmatizer mode ("pos_lookup" by default).
    overwrite (bool): Forwarded unchanged to the lemmatizer.
    RETURNS (IrishLemmatizer): The constructed component.
    """
    # NOTE(review): the other language lemmatizer factories in this changeset
    # also expose a "scorer" setting; this one does not pass one on. Confirm
    # that relying on the base class default is intended.
    options = {"mode": mode, "overwrite": overwrite}
    return IrishLemmatizer(nlp.vocab, model, name, **options)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Irish"]
|
__all__ = ["Irish"]
|
||||||
|
|
|
@ -1,35 +0,0 @@
|
||||||
# Letter classes used by the helper functions in this module.
consonants = list("bcdfghjklmnpqrstvwxz")
broad_vowels = list("aáoóuú")
slender_vowels = list("eéií")
# All vowels: the broad ones first, then the slender ones.
vowels = [*broad_vowels, *slender_vowels]
|
|
||||||
|
|
||||||
|
|
||||||
def ends_dentals(word):
    """Return True if `word` is non-empty and ends in "d", "n", "t" or "s".

    word (str): The word to inspect.
    RETURNS (bool): Whether the final letter is one of the four listed.
    """
    # The emptiness check comes first so that indexing never fails on "".
    return word != "" and word[-1] in ("d", "n", "t", "s")
|
|
||||||
|
|
||||||
|
|
||||||
def devoice(word):
    """Turn a final "sd" into "st" for words longer than two characters.

    word (str): The word to adjust.
    RETURNS (str): The adjusted word, or `word` unchanged if the rule does
        not apply.
    """
    if len(word) <= 2:
        return word
    penultimate, final = word[-2], word[-1]
    if penultimate == "s" and final == "d":
        return word[:-1] + "t"
    return word
|
|
||||||
|
|
||||||
|
|
||||||
def ends_with_vowel(word):
    """Return True if `word` is non-empty and its last letter is in `vowels`."""
    if word == "":
        return False
    return word[-1] in vowels
|
|
||||||
|
|
||||||
|
|
||||||
def starts_with_vowel(word):
    """Return True if `word` is non-empty and its first letter is in `vowels`."""
    if word == "":
        return False
    return word[0] in vowels
|
|
||||||
|
|
||||||
|
|
||||||
def deduplicate(word):
    """Drop the last letter of a word that ends in a doubled consonant.

    Only words longer than two characters are changed.

    word (str): The word to adjust.
    RETURNS (str): The word without its final letter, or `word` unchanged.
    """
    if len(word) <= 2:
        return word
    final = word[-1]
    if word[-2] == final and final in consonants:
        return word[:-1]
    return word
|
|
162
spacy/lang/ga/lemmatizer.py
Normal file
162
spacy/lang/ga/lemmatizer.py
Normal file
|
@ -0,0 +1,162 @@
|
||||||
|
from typing import List, Dict, Tuple
|
||||||
|
|
||||||
|
from ...pipeline import Lemmatizer
|
||||||
|
from ...tokens import Token
|
||||||
|
|
||||||
|
|
||||||
|
class IrishLemmatizer(Lemmatizer):
    """Lookup-based lemmatizer for Irish.

    The lookup data is extracted from BuNaMo
    (https://github.com/michmech/BuNaMo).
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the lookup table names for a lemmatizer mode.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): For "pos_lookup", the four
            per-POS table names and an empty second list; any other mode is
            delegated to the base class.
        """
        if mode != "pos_lookup":
            return super().get_lookups_config(mode)
        # fmt: off
        table_names = [
            "lemma_lookup_adj", "lemma_lookup_adp",
            "lemma_lookup_noun", "lemma_lookup_verb"
        ]
        # fmt: on
        return (table_names, [])

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token using the lookup table for its coarse POS.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): The candidate lemmas; falls back to the
            lower-cased (dot-free) token text when nothing is found.
        """
        pos = token.pos_
        text = unponc(token.text)
        if pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
            return [text.lower()]

        base_form = demutate(text)
        # Second candidate for words that start with "h" + vowel: the same
        # word with that leading "h" removed.
        h_stripped = ""
        if text[0:1].lower() == "h" and text[1:2].lower() in "aáeéiíoóuú":
            h_stripped = text[1:]

        # Proper nouns share the noun table.
        table_pos = "noun" if pos == "PROPN" else pos.lower()

        if token.has_morph():
            morph = token.morph
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if pos == "NOUN" and (
                "VerbForm=Vnoun" in morph or "VerbForm=Inf" in morph
            ):
                return [demutate(text, "Form=HPref" in morph).lower()]
            if pos == "ADJ" and "VerbForm=Part" in morph:
                return [demutate(text).lower()]

        table = self.lookups.get_table("lemma_lookup_" + table_pos, {})

        def to_list(value):
            # Table values may be missing, a single entry or a list.
            if value is None:
                return []
            if isinstance(value, list):
                return value
            return [value]

        if pos == "ADP":
            return to_list(table.get(text, text.lower()))

        # Proper nouns are looked up with their case preserved.
        if pos == "PROPN":
            keys = (base_form, h_stripped)
        else:
            keys = (base_form.lower(), h_stripped.lower())
        lemmas = []
        for key in keys:
            lemmas.extend(to_list(table.get(key)))
        if not lemmas:
            lemmas = [text.lower()]
        return lemmas
|
||||||
|
|
||||||
|
|
||||||
|
def demutate(word: str, is_hpref: bool = False) -> str:
    """Strip an initial mutation from an Irish word form.

    Handles eclipsis (standard and hyphenated), t-prothesis, h-prothesis
    and lenition. A non-empty input never yields an empty string.

    word (str): The (possibly mutated) word form.
    is_hpref (bool): Whether the word is known to carry an h-prefix, in
        which case a leading "h" is removed regardless of what follows.
    RETURNS (str): The word with the mutation removed.
    """
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    # (lower-cased prefix, number of leading characters to drop). No entry is
    # a prefix of another, so at most one of them can match.
    PREFIXES = (
        # eclipsis
        ("bhf", 2),
        ("mb", 1),
        ("gc", 1),
        ("nd", 1),
        ("ng", 1),
        ("bp", 1),
        ("dt", 1),
        # non-standard eclipsis
        ("bh-f", 3),
        ("m-b", 2),
        ("g-c", 2),
        ("n-d", 2),
        ("n-g", 2),
        ("b-p", 2),
        ("d-t", 2),
        # t-prothesis
        ("ts", 1),
        ("t-s", 2),
    )
    lc = word.lower()
    first, second, third = word[0:1], word[1:2], word[2:3]
    for prefix, drop in PREFIXES:
        if lc.startswith(prefix):
            word = word[drop:]
            break
    else:
        # The slices may be empty for very short words, and "" is "in" every
        # string, so each vowel test also requires a non-empty slice.
        if first == "n" and second and second in UVOWELS:
            # eclipsis of a capitalised vowel-initial word (nAthair)
            word = word[1:]
        elif lc.startswith("n-") and third and third in LVOWELS:
            # eclipsis of a lower-case vowel-initial word (n-athair)
            word = word[2:]
        elif is_hpref and first == "h" and second:
            # h-prothesis, if known to be present
            word = word[1:]
        elif first == "h" and second and second in UVOWELS:
            # h-prothesis, simple case: words can also begin with 'h', but
            # unlike eclipsis a hyphen is not used, so lower-case cases need
            # to be handled elsewhere
            word = word[1:]

    # lenition
    # This is deliberately a separate step, to handle super-non-standard text
    # where both eclipsis and lenition were used. It must look at the word as
    # it is *now*: testing the original spelling would turn "bhfear" into
    # "far" and would never see the "bh" left over from "mbhean".
    lc = word.lower()
    if lc[0:1] and lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
        word = word[0:1] + word[2:]

    return word
|
||||||
|
|
||||||
|
|
||||||
|
def unponc(word: str) -> str:
    """Replace dotted consonants with their "consonant + h" spelling.

    word (str): The text to convert.
    RETURNS (str): The text with each dotted letter expanded; every other
        character is copied through unchanged.
    """
    # fmt: off
    PONC = {
        "ḃ": "bh",
        "ċ": "ch",
        "ḋ": "dh",
        "ḟ": "fh",
        "ġ": "gh",
        "ṁ": "mh",
        "ṗ": "ph",
        "ṡ": "sh",
        "ṫ": "th",
        "Ḃ": "BH",
        "Ċ": "CH",
        "Ḋ": "DH",
        "Ḟ": "FH",
        "Ġ": "GH",
        "Ṁ": "MH",
        "Ṗ": "PH",
        "Ṡ": "SH",
        "Ṫ": "TH"
    }
    # fmt: on
    return "".join(PONC.get(letter, letter) for letter in word)
|
|
@ -9,6 +9,8 @@ _exc = {
|
||||||
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
|
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
|
||||||
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
|
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
|
||||||
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
|
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
|
||||||
|
"théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
|
||||||
|
"tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
|
||||||
}
|
}
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
|
|
|
@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Temperature units: keep "°" plus the unit letter together as one token, and
# split off a directly following period as a separate token.
for u in "cfkCFK":
    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
|
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
@ -23,13 +23,25 @@ class Italian(Language):
|
||||||
@Italian.factory(
|
@Italian.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "pos_lookup",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return ItalianLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Italian"]
|
__all__ = ["Italian"]
|
||||||
|
|
|
@ -1,21 +1,25 @@
|
||||||
from typing import Optional, Union, Dict, Any
|
from typing import Optional, Union, Dict, Any, Callable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
from thinc.api import Model
|
||||||
|
import re
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .tag_orth_map import TAG_ORTH_MAP
|
from .tag_orth_map import TAG_ORTH_MAP
|
||||||
from .tag_bigram_map import TAG_BIGRAM_MAP
|
from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||||
from ...compat import copy_reg
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
|
from ...pipeline import Morphologizer
|
||||||
|
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
||||||
from ...scorer import Scorer
|
from ...scorer import Scorer
|
||||||
from ...symbols import POS
|
from ...symbols import POS
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc, MorphAnalysis
|
||||||
from ...training import validate_examples
|
from ...training import validate_examples
|
||||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
from ...vocab import Vocab
|
||||||
from ... import util
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,16 +35,21 @@ split_mode = null
|
||||||
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
|
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
|
||||||
def create_tokenizer(split_mode: Optional[str] = None):
|
def create_tokenizer(split_mode: Optional[str] = None):
|
||||||
def japanese_tokenizer_factory(nlp):
|
def japanese_tokenizer_factory(nlp):
|
||||||
return JapaneseTokenizer(nlp, split_mode=split_mode)
|
return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
|
||||||
|
|
||||||
return japanese_tokenizer_factory
|
return japanese_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class JapaneseTokenizer(DummyTokenizer):
|
class JapaneseTokenizer(DummyTokenizer):
|
||||||
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
|
def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
|
||||||
self.vocab = nlp.vocab
|
self.vocab = vocab
|
||||||
self.split_mode = split_mode
|
self.split_mode = split_mode
|
||||||
self.tokenizer = try_sudachi_import(self.split_mode)
|
self.tokenizer = try_sudachi_import(self.split_mode)
|
||||||
|
# if we're using split mode A we don't need subtokens
|
||||||
|
self.need_subtokens = not (split_mode is None or split_mode == "A")
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return JapaneseTokenizer, (self.vocab, self.split_mode)
|
||||||
|
|
||||||
def __call__(self, text: str) -> Doc:
|
def __call__(self, text: str) -> Doc:
|
||||||
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
|
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
|
||||||
|
@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
|
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
|
||||||
|
|
||||||
# create Doc with tag bi-gram based part-of-speech identification rules
|
# create Doc with tag bi-gram based part-of-speech identification rules
|
||||||
words, tags, inflections, lemmas, readings, sub_tokens_list = (
|
words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
|
||||||
zip(*dtokens) if dtokens else [[]] * 6
|
zip(*dtokens) if dtokens else [[]] * 7
|
||||||
)
|
)
|
||||||
sub_tokens_list = list(sub_tokens_list)
|
sub_tokens_list = list(sub_tokens_list)
|
||||||
doc = Doc(self.vocab, words=words, spaces=spaces)
|
doc = Doc(self.vocab, words=words, spaces=spaces)
|
||||||
|
@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
)
|
)
|
||||||
# if there's no lemma info (it's an unk) just use the surface
|
# if there's no lemma info (it's an unk) just use the surface
|
||||||
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
|
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
|
||||||
doc.user_data["inflections"] = inflections
|
morph = {}
|
||||||
doc.user_data["reading_forms"] = readings
|
if dtoken.inf:
|
||||||
doc.user_data["sub_tokens"] = sub_tokens_list
|
# it's normal for this to be empty for non-inflecting types
|
||||||
|
morph["Inflection"] = dtoken.inf
|
||||||
|
token.norm_ = dtoken.norm
|
||||||
|
if dtoken.reading:
|
||||||
|
# punctuation is its own reading, but we don't want values like
|
||||||
|
# "=" here
|
||||||
|
morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
|
||||||
|
token.morph = MorphAnalysis(self.vocab, morph)
|
||||||
|
if self.need_subtokens:
|
||||||
|
doc.user_data["sub_tokens"] = sub_tokens_list
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
|
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
|
||||||
|
@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
DetailedToken(
|
DetailedToken(
|
||||||
token.surface(), # orth
|
token.surface(), # orth
|
||||||
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
|
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
|
||||||
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
|
";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
|
||||||
token.dictionary_form(), # lemma
|
token.dictionary_form(), # lemma
|
||||||
token.reading_form(), # user_data['reading_forms']
|
token.normalized_form(),
|
||||||
|
token.reading_form(),
|
||||||
sub_tokens_list[idx]
|
sub_tokens_list[idx]
|
||||||
if sub_tokens_list
|
if sub_tokens_list
|
||||||
else None, # user_data['sub_tokens']
|
else None, # user_data['sub_tokens']
|
||||||
|
@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
]
|
]
|
||||||
|
|
||||||
def _get_sub_tokens(self, sudachipy_tokens):
|
def _get_sub_tokens(self, sudachipy_tokens):
|
||||||
if (
|
# do nothing for default split mode
|
||||||
self.split_mode is None or self.split_mode == "A"
|
if not self.need_subtokens:
|
||||||
): # do nothing for default split mode
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
|
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
|
||||||
|
@ -176,9 +194,33 @@ class Japanese(Language):
|
||||||
Defaults = JapaneseDefaults
|
Defaults = JapaneseDefaults
|
||||||
|
|
||||||
|
|
||||||
|
@Japanese.factory(
    "morphologizer",
    assigns=["token.morph", "token.pos"],
    default_config={
        "model": DEFAULT_MORPH_MODEL,
        "overwrite": True,
        "extend": True,
        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
    },
    default_score_weights={
        "pos_acc": 0.5,
        "morph_micro_f": 0.5,
        "morph_per_feat": None,
    },
)
def make_morphologizer(
    nlp: Language,
    model: Model,
    name: str,
    overwrite: bool,
    extend: bool,
    scorer: Optional[Callable],
):
    """Build the "morphologizer" pipeline component for Japanese.

    It differs from a plain `Morphologizer` call only in the defaults
    registered above ("overwrite" and "extend" are both True).

    nlp (Language): The pipeline; only its vocab is passed on.
    model (Model): Forwarded unchanged to the morphologizer.
    name (str): The component instance name.
    overwrite (bool): Forwarded unchanged to the morphologizer.
    extend (bool): Forwarded unchanged to the morphologizer.
    scorer (Optional[Callable]): Forwarded unchanged to the morphologizer.
    RETURNS (Morphologizer): The constructed component.
    """
    settings = {"overwrite": overwrite, "extend": extend, "scorer": scorer}
    return Morphologizer(nlp.vocab, model, name, **settings)
|
||||||
|
|
||||||
|
|
||||||
# Hold the attributes we need with convenient names
|
# Hold the attributes we need with convenient names
|
||||||
DetailedToken = namedtuple(
|
DetailedToken = namedtuple(
|
||||||
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
|
"DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -254,7 +296,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
||||||
return text_dtokens, text_spaces
|
return text_dtokens, text_spaces
|
||||||
elif len([word for word in words if not word.isspace()]) == 0:
|
elif len([word for word in words if not word.isspace()]) == 0:
|
||||||
assert text.isspace()
|
assert text.isspace()
|
||||||
text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)]
|
text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
|
||||||
text_spaces = [False]
|
text_spaces = [False]
|
||||||
return text_dtokens, text_spaces
|
return text_dtokens, text_spaces
|
||||||
|
|
||||||
|
@ -271,7 +313,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
||||||
# space token
|
# space token
|
||||||
if word_start > 0:
|
if word_start > 0:
|
||||||
w = text[text_pos : text_pos + word_start]
|
w = text[text_pos : text_pos + word_start]
|
||||||
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
|
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
|
||||||
text_spaces.append(False)
|
text_spaces.append(False)
|
||||||
text_pos += word_start
|
text_pos += word_start
|
||||||
|
|
||||||
|
@ -287,16 +329,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
||||||
# trailing space token
|
# trailing space token
|
||||||
if text_pos < len(text):
|
if text_pos < len(text):
|
||||||
w = text[text_pos:]
|
w = text[text_pos:]
|
||||||
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
|
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
|
||||||
text_spaces.append(False)
|
text_spaces.append(False)
|
||||||
|
|
||||||
return text_dtokens, text_spaces
|
return text_dtokens, text_spaces
|
||||||
|
|
||||||
|
|
||||||
def pickle_japanese(instance):
|
|
||||||
return Japanese, tuple()
|
|
||||||
|
|
||||||
|
|
||||||
copy_reg.pickle(Japanese, pickle_japanese)
|
|
||||||
|
|
||||||
__all__ = ["Japanese"]
|
__all__ = ["Japanese"]
|
||||||
|
|
|
@ -5,11 +5,11 @@ from .tag_map import TAG_MAP
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...compat import copy_reg
|
|
||||||
from ...scorer import Scorer
|
from ...scorer import Scorer
|
||||||
from ...symbols import POS
|
from ...symbols import POS
|
||||||
from ...training import validate_examples
|
from ...training import validate_examples
|
||||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
|
||||||
@registry.tokenizers("spacy.ko.KoreanTokenizer")
|
@registry.tokenizers("spacy.ko.KoreanTokenizer")
|
||||||
def create_tokenizer():
|
def create_tokenizer():
|
||||||
def korean_tokenizer_factory(nlp):
|
def korean_tokenizer_factory(nlp):
|
||||||
return KoreanTokenizer(nlp)
|
return KoreanTokenizer(nlp.vocab)
|
||||||
|
|
||||||
return korean_tokenizer_factory
|
return korean_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class KoreanTokenizer(DummyTokenizer):
|
class KoreanTokenizer(DummyTokenizer):
|
||||||
def __init__(self, nlp: Language):
|
def __init__(self, vocab: Vocab):
|
||||||
self.vocab = nlp.vocab
|
self.vocab = vocab
|
||||||
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
||||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return KoreanTokenizer, (self.vocab,)
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
self.mecab_tokenizer.__del__()
|
self.mecab_tokenizer.__del__()
|
||||||
|
|
||||||
|
@ -106,10 +109,4 @@ def check_spaces(text, tokens):
|
||||||
yield False
|
yield False
|
||||||
|
|
||||||
|
|
||||||
def pickle_korean(instance):
|
|
||||||
return Korean, tuple()
|
|
||||||
|
|
||||||
|
|
||||||
copy_reg.pickle(Korean, pickle_korean)
|
|
||||||
|
|
||||||
__all__ = ["Korean"]
|
__all__ = ["Korean"]
|
||||||
|
|
|
@ -3,6 +3,7 @@ import unicodedata
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .. import attrs
|
from .. import attrs
|
||||||
|
from .tokenizer_exceptions import URL_MATCH
|
||||||
|
|
||||||
|
|
||||||
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
||||||
|
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
|
||||||
return True
|
return True
|
||||||
if tld.isalpha() and tld in _tlds:
|
if tld.isalpha() and tld in _tlds:
|
||||||
return True
|
return True
|
||||||
|
if URL_MATCH(text):
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .lemmatizer import MacedonianLemmatizer
|
from .lemmatizer import MacedonianLemmatizer
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
@ -38,13 +38,25 @@ class Macedonian(Language):
|
||||||
@Macedonian.factory(
|
@Macedonian.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return MacedonianLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Macedonian"]
|
__all__ = ["Macedonian"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
@ -26,13 +26,25 @@ class Norwegian(Language):
|
||||||
@Norwegian.factory(
|
@Norwegian.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return Lemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Norwegian"]
|
__all__ = ["Norwegian"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
|
@ -30,13 +30,25 @@ class Dutch(Language):
|
||||||
@Dutch.factory(
|
@Dutch.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return DutchLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Dutch"]
|
__all__ = ["Dutch"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
|
@ -33,13 +33,25 @@ class Polish(Language):
|
||||||
@Polish.factory(
|
@Polish.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "pos_lookup",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return PolishLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Polish"]
|
__all__ = ["Polish"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
@ -22,7 +22,12 @@ class Russian(Language):
|
||||||
@Russian.factory(
|
@Russian.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "pymorphy2",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
|
@ -31,8 +36,11 @@ def make_lemmatizer(
|
||||||
name: str,
|
name: str,
|
||||||
mode: str,
|
mode: str,
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return RussianLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Russian"]
|
__all__ = ["Russian"]
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
from typing import Optional, List, Dict, Tuple
|
from typing import Optional, List, Dict, Tuple, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
from ...pipeline.lemmatizer import lemmatizer_score
|
||||||
from ...symbols import POS
|
from ...symbols import POS
|
||||||
from ...tokens import Token
|
from ...tokens import Token
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
|
@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
*,
|
*,
|
||||||
mode: str = "pymorphy2",
|
mode: str = "pymorphy2",
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
|
scorer: Optional[Callable] = lemmatizer_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
if mode == "pymorphy2":
|
if mode == "pymorphy2":
|
||||||
try:
|
try:
|
||||||
|
@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer()
|
self._morph = MorphAnalyzer()
|
||||||
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
|
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
|
||||||
|
|
||||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||||
string = token.text
|
string = token.text
|
||||||
|
|
|
@ -1,47 +1,195 @@
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
අතර
|
සහ
|
||||||
එච්චර
|
සමග
|
||||||
එපමණ
|
සමඟ
|
||||||
එලෙස
|
අහා
|
||||||
එවිට
|
ආහ්
|
||||||
ඒ
|
ආ
|
||||||
කට
|
ඕහෝ
|
||||||
කදී
|
අනේ
|
||||||
කින්
|
අඳෝ
|
||||||
ක්
|
අපොයි
|
||||||
ට
|
අපෝ
|
||||||
තුර
|
අයියෝ
|
||||||
ත්
|
ආයි
|
||||||
ද
|
ඌයි
|
||||||
නමුත්
|
චී
|
||||||
නොහොත්
|
චිහ්
|
||||||
පමණ
|
චික්
|
||||||
පමණි
|
හෝ
|
||||||
ම
|
දෝ
|
||||||
මෙච්චර
|
දෝහෝ
|
||||||
මෙපමණ
|
මෙන්
|
||||||
මෙලෙස
|
සේ
|
||||||
මෙවිට
|
වැනි
|
||||||
මේ
|
බඳු
|
||||||
ය
|
වන්
|
||||||
යි
|
අයුරු
|
||||||
ලදී
|
අයුරින්
|
||||||
ලෙස
|
ලෙස
|
||||||
වගේ
|
වැඩි
|
||||||
|
ශ්රී
|
||||||
|
හා
|
||||||
|
ය
|
||||||
|
නිසා
|
||||||
|
නිසාවෙන්
|
||||||
|
බවට
|
||||||
|
බව
|
||||||
|
බවෙන්
|
||||||
|
නම්
|
||||||
|
වැඩි
|
||||||
|
සිට
|
||||||
|
දී
|
||||||
|
මහා
|
||||||
|
මහ
|
||||||
|
පමණ
|
||||||
|
පමණින්
|
||||||
|
පමන
|
||||||
වන
|
වන
|
||||||
විට
|
විට
|
||||||
විටෙක
|
විටින්
|
||||||
විතර
|
මේ
|
||||||
විය
|
මෙලෙස
|
||||||
වුව
|
මෙයින්
|
||||||
වුවත්
|
ඇති
|
||||||
වුවද
|
ලෙස
|
||||||
වූ
|
සිදු
|
||||||
සමඟ
|
වශයෙන්
|
||||||
|
යන
|
||||||
|
සඳහා
|
||||||
|
මගින්
|
||||||
|
හෝ
|
||||||
|
ඉතා
|
||||||
|
ඒ
|
||||||
|
එම
|
||||||
|
ද
|
||||||
|
අතර
|
||||||
|
විසින්
|
||||||
|
සමග
|
||||||
|
පිළිබඳව
|
||||||
|
පිළිබඳ
|
||||||
|
තුළ
|
||||||
|
බව
|
||||||
|
වැනි
|
||||||
|
මහ
|
||||||
|
මෙම
|
||||||
|
මෙහි
|
||||||
|
මේ
|
||||||
|
වෙත
|
||||||
|
වෙතින්
|
||||||
|
වෙතට
|
||||||
|
වෙනුවෙන්
|
||||||
|
වෙනුවට
|
||||||
|
වෙන
|
||||||
|
ගැන
|
||||||
|
නෑ
|
||||||
|
අනුව
|
||||||
|
නව
|
||||||
|
පිළිබඳ
|
||||||
|
විශේෂ
|
||||||
|
දැනට
|
||||||
|
එහෙන්
|
||||||
|
මෙහෙන්
|
||||||
|
එහේ
|
||||||
|
මෙහේ
|
||||||
|
ම
|
||||||
|
තවත්
|
||||||
|
තව
|
||||||
සහ
|
සහ
|
||||||
හා
|
දක්වා
|
||||||
|
ට
|
||||||
|
ගේ
|
||||||
|
එ
|
||||||
|
ක
|
||||||
|
ක්
|
||||||
|
බවත්
|
||||||
|
බවද
|
||||||
|
මත
|
||||||
|
ඇතුලු
|
||||||
|
ඇතුළු
|
||||||
|
මෙසේ
|
||||||
|
වඩා
|
||||||
|
වඩාත්ම
|
||||||
|
නිති
|
||||||
|
නිතිත්
|
||||||
|
නිතොර
|
||||||
|
නිතර
|
||||||
|
ඉක්බිති
|
||||||
|
දැන්
|
||||||
|
යලි
|
||||||
|
පුන
|
||||||
|
ඉතින්
|
||||||
|
සිට
|
||||||
|
සිටන්
|
||||||
|
පටන්
|
||||||
|
තෙක්
|
||||||
|
දක්වා
|
||||||
|
සා
|
||||||
|
තාක්
|
||||||
|
තුවක්
|
||||||
|
පවා
|
||||||
|
ද
|
||||||
|
හෝ
|
||||||
|
වත්
|
||||||
|
විනා
|
||||||
|
හැර
|
||||||
|
මිස
|
||||||
|
මුත්
|
||||||
|
කිම
|
||||||
|
කිම්
|
||||||
|
ඇයි
|
||||||
|
මන්ද
|
||||||
හෙවත්
|
හෙවත්
|
||||||
හෝ
|
නොහොත්
|
||||||
|
පතා
|
||||||
|
පාසා
|
||||||
|
ගානෙ
|
||||||
|
තව
|
||||||
|
ඉතා
|
||||||
|
බොහෝ
|
||||||
|
වහා
|
||||||
|
සෙද
|
||||||
|
සැනින්
|
||||||
|
හනික
|
||||||
|
එම්බා
|
||||||
|
එම්බල
|
||||||
|
බොල
|
||||||
|
නම්
|
||||||
|
වනාහි
|
||||||
|
කලී
|
||||||
|
ඉඳුරා
|
||||||
|
අන්න
|
||||||
|
ඔන්න
|
||||||
|
මෙන්න
|
||||||
|
උදෙසා
|
||||||
|
පිණිස
|
||||||
|
සඳහා
|
||||||
|
අරබයා
|
||||||
|
නිසා
|
||||||
|
එනිසා
|
||||||
|
එබැවින්
|
||||||
|
බැවින්
|
||||||
|
හෙයින්
|
||||||
|
සේක්
|
||||||
|
සේක
|
||||||
|
ගැන
|
||||||
|
අනුව
|
||||||
|
පරිදි
|
||||||
|
විට
|
||||||
|
තෙක්
|
||||||
|
මෙතෙක්
|
||||||
|
මේතාක්
|
||||||
|
තුරු
|
||||||
|
තුරා
|
||||||
|
තුරාවට
|
||||||
|
තුලින්
|
||||||
|
නමුත්
|
||||||
|
එනමුත්
|
||||||
|
වස්
|
||||||
|
මෙන්
|
||||||
|
ලෙස
|
||||||
|
පරිදි
|
||||||
|
එහෙත්
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
@ -29,13 +29,25 @@ class Swedish(Language):
|
||||||
@Swedish.factory(
|
@Swedish.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return Lemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Swedish"]
|
__all__ = ["Swedish"]
|
||||||
|
|
|
@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
|
||||||
@registry.tokenizers("spacy.th.ThaiTokenizer")
|
@registry.tokenizers("spacy.th.ThaiTokenizer")
|
||||||
def create_thai_tokenizer():
|
def create_thai_tokenizer():
|
||||||
def thai_tokenizer_factory(nlp):
|
def thai_tokenizer_factory(nlp):
|
||||||
return ThaiTokenizer(nlp)
|
return ThaiTokenizer(nlp.vocab)
|
||||||
|
|
||||||
return thai_tokenizer_factory
|
return thai_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class ThaiTokenizer(DummyTokenizer):
|
class ThaiTokenizer(DummyTokenizer):
|
||||||
def __init__(self, nlp: Language) -> None:
|
def __init__(self, vocab: Vocab) -> None:
|
||||||
try:
|
try:
|
||||||
from pythainlp.tokenize import word_tokenize
|
from pythainlp.tokenize import word_tokenize
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
|
||||||
"https://github.com/PyThaiNLP/pythainlp"
|
"https://github.com/PyThaiNLP/pythainlp"
|
||||||
) from None
|
) from None
|
||||||
self.word_tokenize = word_tokenize
|
self.word_tokenize = word_tokenize
|
||||||
self.vocab = nlp.vocab
|
self.vocab = vocab
|
||||||
|
|
||||||
def __call__(self, text: str) -> Doc:
|
def __call__(self, text: str) -> Doc:
|
||||||
words = list(self.word_tokenize(text))
|
words = list(self.word_tokenize(text))
|
||||||
|
|
|
@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"ዜሮ",
|
"ዜሮ",
|
||||||
"ሐደ",
|
"ሓደ",
|
||||||
"ክልተ",
|
"ክልተ",
|
||||||
"ሰለስተ",
|
"ሰለስተ",
|
||||||
"ኣርባዕተ",
|
"ኣርባዕተ",
|
||||||
|
@ -11,66 +11,37 @@ _num_words = [
|
||||||
"ሸውዓተ",
|
"ሸውዓተ",
|
||||||
"ሽሞንተ",
|
"ሽሞንተ",
|
||||||
"ትሽዓተ",
|
"ትሽዓተ",
|
||||||
"ኣሰርተ",
|
"ዓሰርተ",
|
||||||
"ኣሰርተ ሐደ",
|
|
||||||
"ኣሰርተ ክልተ",
|
|
||||||
"ኣሰርተ ሰለስተ",
|
|
||||||
"ኣሰርተ ኣርባዕተ",
|
|
||||||
"ኣሰርተ ሓሙሽተ",
|
|
||||||
"ኣሰርተ ሽድሽተ",
|
|
||||||
"ኣሰርተ ሸውዓተ",
|
|
||||||
"ኣሰርተ ሽሞንተ",
|
|
||||||
"ኣሰርተ ትሽዓተ",
|
|
||||||
"ዕስራ",
|
"ዕስራ",
|
||||||
"ሰላሳ",
|
"ሰላሳ",
|
||||||
"ኣርብዓ",
|
"ኣርብዓ",
|
||||||
"ሃምሳ",
|
"ሓምሳ",
|
||||||
"ስልሳ",
|
"ሱሳ",
|
||||||
"ሰብዓ",
|
"ሰብዓ",
|
||||||
"ሰማንያ",
|
"ሰማንያ",
|
||||||
"ተስዓ",
|
"ቴስዓ",
|
||||||
"ሚእቲ",
|
"ሚእቲ",
|
||||||
"ሺሕ",
|
"ሺሕ",
|
||||||
"ሚልዮን",
|
"ሚልዮን",
|
||||||
"ቢልዮን",
|
"ቢልዮን",
|
||||||
"ትሪልዮን",
|
"ትሪልዮን",
|
||||||
"ኳድሪልዮን",
|
"ኳድሪልዮን",
|
||||||
"ገጅልዮን",
|
"ጋዚልዮን",
|
||||||
"ባዝልዮን",
|
"ባዚልዮን"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
|
||||||
_ordinal_words = [
|
_ordinal_words = [
|
||||||
"ቀዳማይ",
|
"ቀዳማይ",
|
||||||
"ካልኣይ",
|
"ካልኣይ",
|
||||||
"ሳልሳይ",
|
"ሳልሳይ",
|
||||||
"ራብኣይ",
|
"ራብዓይ",
|
||||||
"ሓምሻይ",
|
"ሓምሻይ",
|
||||||
"ሻድሻይ",
|
"ሻድሻይ",
|
||||||
"ሻውዓይ",
|
"ሻውዓይ",
|
||||||
"ሻምናይ",
|
"ሻምናይ",
|
||||||
"ዘጠነኛ",
|
"ታሽዓይ",
|
||||||
"አስረኛ",
|
"ዓስራይ"
|
||||||
"ኣሰርተ አንደኛ",
|
|
||||||
"ኣሰርተ ሁለተኛ",
|
|
||||||
"ኣሰርተ ሶስተኛ",
|
|
||||||
"ኣሰርተ አራተኛ",
|
|
||||||
"ኣሰርተ አምስተኛ",
|
|
||||||
"ኣሰርተ ስድስተኛ",
|
|
||||||
"ኣሰርተ ሰባተኛ",
|
|
||||||
"ኣሰርተ ስምንተኛ",
|
|
||||||
"ኣሰርተ ዘጠነኛ",
|
|
||||||
"ሃያኛ",
|
|
||||||
"ሰላሳኛ" "አርባኛ",
|
|
||||||
"አምሳኛ",
|
|
||||||
"ስድሳኛ",
|
|
||||||
"ሰባኛ",
|
|
||||||
"ሰማንያኛ",
|
|
||||||
"ዘጠናኛ",
|
|
||||||
"መቶኛ",
|
|
||||||
"ሺኛ",
|
|
||||||
"ሚሊዮንኛ",
|
|
||||||
"ቢሊዮንኛ",
|
|
||||||
"ትሪሊዮንኛ",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -92,7 +63,7 @@ def like_num(text):
|
||||||
# Check ordinal number
|
# Check ordinal number
|
||||||
if text_lower in _ordinal_words:
|
if text_lower in _ordinal_words:
|
||||||
return True
|
return True
|
||||||
if text_lower.endswith("ኛ"):
|
if text_lower.endswith("ይ"):
|
||||||
if text_lower[:-2].isdigit():
|
if text_lower[:-2].isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
|
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
_list_punct
|
_list_punct
|
||||||
|
|
|
@ -1,6 +1,27 @@
|
||||||
|
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
|
||||||
|
|
||||||
# Stop words
|
# Stop words
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም
|
'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን
|
||||||
|
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
|
||||||
|
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
|
||||||
|
ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ
|
||||||
|
ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም
|
||||||
|
ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን
|
||||||
|
ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ
|
||||||
|
ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ
|
||||||
|
ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን
|
||||||
|
እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ
|
||||||
|
ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም
|
||||||
|
እየ እየን እዩ እያ እዮም
|
||||||
|
ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ
|
||||||
|
ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል
|
||||||
|
ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ
|
||||||
|
ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ
|
||||||
|
ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ
|
||||||
|
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
|
||||||
|
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
|
||||||
|
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -250,3 +250,9 @@ o.0
|
||||||
|
|
||||||
for orth in emoticons:
|
for orth in emoticons:
|
||||||
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
# Moved from a suffix setting due to #9155 removing prefixes from consideration
|
||||||
|
# for lookbehinds
|
||||||
|
for u in "cfkCFK":
|
||||||
|
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
|
@ -23,13 +23,25 @@ class Ukrainian(Language):
|
||||||
@Ukrainian.factory(
|
@Ukrainian.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "pymorphy2",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return UkrainianLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Ukrainian"]
|
__all__ = ["Ukrainian"]
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
from typing import Optional
|
from typing import Optional, Callable
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from ..ru.lemmatizer import RussianLemmatizer
|
from ..ru.lemmatizer import RussianLemmatizer
|
||||||
|
from ...pipeline.lemmatizer import lemmatizer_score
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
||||||
*,
|
*,
|
||||||
mode: str = "pymorphy2",
|
mode: str = "pymorphy2",
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
|
scorer: Optional[Callable] = lemmatizer_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
if mode == "pymorphy2":
|
if mode == "pymorphy2":
|
||||||
try:
|
try:
|
||||||
|
@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer(lang="uk")
|
self._morph = MorphAnalyzer(lang="uk")
|
||||||
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
|
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
|
||||||
|
|
|
@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
from ...vocab import Vocab
|
||||||
from ... import util
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,14 +25,14 @@ use_pyvi = true
|
||||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||||
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||||
def vietnamese_tokenizer_factory(nlp):
|
def vietnamese_tokenizer_factory(nlp):
|
||||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
|
||||||
|
|
||||||
return vietnamese_tokenizer_factory
|
return vietnamese_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class VietnameseTokenizer(DummyTokenizer):
|
class VietnameseTokenizer(DummyTokenizer):
|
||||||
def __init__(self, nlp: Language, use_pyvi: bool = False):
|
def __init__(self, vocab: Vocab, use_pyvi: bool = False):
|
||||||
self.vocab = nlp.vocab
|
self.vocab = vocab
|
||||||
self.use_pyvi = use_pyvi
|
self.use_pyvi = use_pyvi
|
||||||
if self.use_pyvi:
|
if self.use_pyvi:
|
||||||
try:
|
try:
|
||||||
|
@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
|
||||||
)
|
)
|
||||||
raise ImportError(msg) from None
|
raise ImportError(msg) from None
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return VietnameseTokenizer, (self.vocab, self.use_pyvi)
|
||||||
|
|
||||||
def __call__(self, text: str) -> Doc:
|
def __call__(self, text: str) -> Doc:
|
||||||
if self.use_pyvi:
|
if self.use_pyvi:
|
||||||
words = self.pyvi_tokenize(text)
|
words = self.pyvi_tokenize(text)
|
||||||
|
|
18
spacy/lang/vi/examples.py
Normal file
18
spacy/lang/vi/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
>>> from spacy.lang.vi.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Đây là đâu, tôi là ai?",
|
||||||
|
"Căn phòng có nhiều cửa sổ nên nó khá sáng",
|
||||||
|
"Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.",
|
||||||
|
"Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.",
|
||||||
|
"Ông bạn đang ở đâu thế?",
|
||||||
|
"Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?",
|
||||||
|
"Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?",
|
||||||
|
"Làm việc nhiều chán quá, đi chơi đâu đi?",
|
||||||
|
]
|
|
@ -9,11 +9,14 @@ _num_words = [
|
||||||
"bốn",
|
"bốn",
|
||||||
"năm",
|
"năm",
|
||||||
"sáu",
|
"sáu",
|
||||||
|
"bảy",
|
||||||
"bẩy",
|
"bẩy",
|
||||||
"tám",
|
"tám",
|
||||||
"chín",
|
"chín",
|
||||||
"mười",
|
"mười",
|
||||||
|
"chục",
|
||||||
"trăm",
|
"trăm",
|
||||||
|
"nghìn",
|
||||||
"tỷ",
|
"tỷ",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,7 @@ from ...scorer import Scorer
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...training import validate_examples, Example
|
from ...training import validate_examples, Example
|
||||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
from ...vocab import Vocab
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ... import util
|
from ... import util
|
||||||
|
@ -48,14 +49,14 @@ class Segmenter(str, Enum):
|
||||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||||
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
|
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
|
||||||
def chinese_tokenizer_factory(nlp):
|
def chinese_tokenizer_factory(nlp):
|
||||||
return ChineseTokenizer(nlp, segmenter=segmenter)
|
return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
|
||||||
|
|
||||||
return chinese_tokenizer_factory
|
return chinese_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class ChineseTokenizer(DummyTokenizer):
|
class ChineseTokenizer(DummyTokenizer):
|
||||||
def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
|
def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
|
||||||
self.vocab = nlp.vocab
|
self.vocab = vocab
|
||||||
self.segmenter = (
|
self.segmenter = (
|
||||||
segmenter.value if isinstance(segmenter, Segmenter) else segmenter
|
segmenter.value if isinstance(segmenter, Segmenter) else segmenter
|
||||||
)
|
)
|
||||||
|
|
|
@ -115,7 +115,7 @@ class Language:
|
||||||
|
|
||||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (str): Two-letter language ID, i.e. ISO code.
|
lang (str): IETF language code, such as 'en'.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
@ -228,6 +228,7 @@ class Language:
|
||||||
"vectors": len(self.vocab.vectors),
|
"vectors": len(self.vocab.vectors),
|
||||||
"keys": self.vocab.vectors.n_keys,
|
"keys": self.vocab.vectors.n_keys,
|
||||||
"name": self.vocab.vectors.name,
|
"name": self.vocab.vectors.name,
|
||||||
|
"mode": self.vocab.vectors.mode,
|
||||||
}
|
}
|
||||||
self._meta["labels"] = dict(self.pipe_labels)
|
self._meta["labels"] = dict(self.pipe_labels)
|
||||||
# TODO: Adding this back to prevent breaking people's code etc., but
|
# TODO: Adding this back to prevent breaking people's code etc., but
|
||||||
|
@ -978,7 +979,7 @@ class Language:
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
text: str,
|
text: Union[str, Doc],
|
||||||
*,
|
*,
|
||||||
disable: Iterable[str] = SimpleFrozenList(),
|
disable: Iterable[str] = SimpleFrozenList(),
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||||
|
@ -987,7 +988,9 @@ class Language:
|
||||||
and can contain arbitrary whitespace. Alignment into the original string
|
and can contain arbitrary whitespace. Alignment into the original string
|
||||||
is preserved.
|
is preserved.
|
||||||
|
|
||||||
text (str): The text to be processed.
|
text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
|
||||||
|
the doc will be passed directly to the pipeline, skipping
|
||||||
|
`Language.make_doc`.
|
||||||
disable (List[str]): Names of the pipeline components to disable.
|
disable (List[str]): Names of the pipeline components to disable.
|
||||||
component_cfg (Dict[str, dict]): An optional dictionary with extra
|
component_cfg (Dict[str, dict]): An optional dictionary with extra
|
||||||
keyword arguments for specific components.
|
keyword arguments for specific components.
|
||||||
|
@ -995,7 +998,7 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#call
|
DOCS: https://spacy.io/api/language#call
|
||||||
"""
|
"""
|
||||||
doc = self.make_doc(text)
|
doc = self._ensure_doc(text)
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
|
@ -1080,6 +1083,20 @@ class Language:
|
||||||
)
|
)
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
|
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
|
||||||
|
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
|
||||||
|
if isinstance(doc_like, Doc):
|
||||||
|
return doc_like
|
||||||
|
if isinstance(doc_like, str):
|
||||||
|
return self.make_doc(doc_like)
|
||||||
|
raise ValueError(Errors.E866.format(type=type(doc_like)))
|
||||||
|
|
||||||
|
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
|
||||||
|
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
|
||||||
|
doc = self._ensure_doc(doc_like)
|
||||||
|
doc._context = context
|
||||||
|
return doc
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
|
@ -1450,7 +1467,7 @@ class Language:
|
||||||
@overload
|
@overload
|
||||||
def pipe(
|
def pipe(
|
||||||
self,
|
self,
|
||||||
texts: Iterable[str],
|
texts: Iterable[Union[str, Doc]],
|
||||||
*,
|
*,
|
||||||
as_tuples: Literal[False] = ...,
|
as_tuples: Literal[False] = ...,
|
||||||
batch_size: Optional[int] = ...,
|
batch_size: Optional[int] = ...,
|
||||||
|
@ -1463,7 +1480,7 @@ class Language:
|
||||||
@overload
|
@overload
|
||||||
def pipe( # noqa: F811
|
def pipe( # noqa: F811
|
||||||
self,
|
self,
|
||||||
texts: Iterable[Tuple[str, _AnyContext]],
|
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
|
||||||
*,
|
*,
|
||||||
as_tuples: Literal[True] = ...,
|
as_tuples: Literal[True] = ...,
|
||||||
batch_size: Optional[int] = ...,
|
batch_size: Optional[int] = ...,
|
||||||
|
@ -1475,7 +1492,9 @@ class Language:
|
||||||
|
|
||||||
def pipe( # noqa: F811
|
def pipe( # noqa: F811
|
||||||
self,
|
self,
|
||||||
texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]],
|
texts: Union[
|
||||||
|
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
|
||||||
|
],
|
||||||
*,
|
*,
|
||||||
as_tuples: bool = False,
|
as_tuples: bool = False,
|
||||||
batch_size: Optional[int] = None,
|
batch_size: Optional[int] = None,
|
||||||
|
@ -1485,7 +1504,8 @@ class Language:
|
||||||
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
|
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
|
||||||
"""Process texts as a stream, and yield `Doc` objects in order.
|
"""Process texts as a stream, and yield `Doc` objects in order.
|
||||||
|
|
||||||
texts (Iterable[str]): A sequence of texts to process.
|
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
|
||||||
|
process.
|
||||||
as_tuples (bool): If set to True, inputs should be a sequence of
|
as_tuples (bool): If set to True, inputs should be a sequence of
|
||||||
(text, context) tuples. Output will then be a sequence of
|
(text, context) tuples. Output will then be a sequence of
|
||||||
(doc, context) tuples. Defaults to False.
|
(doc, context) tuples. Defaults to False.
|
||||||
|
@ -1500,23 +1520,24 @@ class Language:
|
||||||
"""
|
"""
|
||||||
# Handle texts with context as tuples
|
# Handle texts with context as tuples
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
texts = cast(Iterable[Tuple[str, _AnyContext]], texts)
|
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
|
||||||
text_context1, text_context2 = itertools.tee(texts)
|
docs_with_contexts = (
|
||||||
texts = (tc[0] for tc in text_context1)
|
self._ensure_doc_with_context(text, context) for text, context in texts
|
||||||
contexts = (tc[1] for tc in text_context2)
|
)
|
||||||
docs = self.pipe(
|
docs = self.pipe(
|
||||||
texts,
|
docs_with_contexts,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
disable=disable,
|
disable=disable,
|
||||||
n_process=n_process,
|
n_process=n_process,
|
||||||
component_cfg=component_cfg,
|
component_cfg=component_cfg,
|
||||||
)
|
)
|
||||||
for doc, context in zip(docs, contexts):
|
for doc in docs:
|
||||||
|
context = doc._context
|
||||||
|
doc._context = None
|
||||||
yield (doc, context)
|
yield (doc, context)
|
||||||
return
|
return
|
||||||
|
|
||||||
# At this point, we know that we're dealing with an iterable of plain texts
|
texts = cast(Iterable[Union[str, Doc]], texts)
|
||||||
texts = cast(Iterable[str], texts)
|
|
||||||
|
|
||||||
# Set argument defaults
|
# Set argument defaults
|
||||||
if n_process == -1:
|
if n_process == -1:
|
||||||
|
@ -1551,7 +1572,7 @@ class Language:
|
||||||
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
|
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
|
||||||
else:
|
else:
|
||||||
# if n_process == 1, no processes are forked.
|
# if n_process == 1, no processes are forked.
|
||||||
docs = (self.make_doc(text) for text in texts)
|
docs = (self._ensure_doc(text) for text in texts)
|
||||||
for pipe in pipes:
|
for pipe in pipes:
|
||||||
docs = pipe(docs)
|
docs = pipe(docs)
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
@ -1570,7 +1591,7 @@ class Language:
|
||||||
|
|
||||||
def _multiprocessing_pipe(
|
def _multiprocessing_pipe(
|
||||||
self,
|
self,
|
||||||
texts: Iterable[str],
|
texts: Iterable[Union[str, Doc]],
|
||||||
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
||||||
n_process: int,
|
n_process: int,
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
|
@ -1596,7 +1617,7 @@ class Language:
|
||||||
procs = [
|
procs = [
|
||||||
mp.Process(
|
mp.Process(
|
||||||
target=_apply_pipes,
|
target=_apply_pipes,
|
||||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
|
args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
|
||||||
)
|
)
|
||||||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||||||
]
|
]
|
||||||
|
@ -1609,11 +1630,12 @@ class Language:
|
||||||
recv.recv() for recv in cycle(bytedocs_recv_ch)
|
recv.recv() for recv in cycle(bytedocs_recv_ch)
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
for i, (_, (byte_doc, byte_error)) in enumerate(
|
for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
|
||||||
zip(raw_texts, byte_tuples), 1
|
zip(raw_texts, byte_tuples), 1
|
||||||
):
|
):
|
||||||
if byte_doc is not None:
|
if byte_doc is not None:
|
||||||
doc = Doc(self.vocab).from_bytes(byte_doc)
|
doc = Doc(self.vocab).from_bytes(byte_doc)
|
||||||
|
doc._context = byte_context
|
||||||
yield doc
|
yield doc
|
||||||
elif byte_error is not None:
|
elif byte_error is not None:
|
||||||
error = srsly.msgpack_loads(byte_error)
|
error = srsly.msgpack_loads(byte_error)
|
||||||
|
@ -2138,7 +2160,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]:
|
||||||
|
|
||||||
|
|
||||||
def _apply_pipes(
|
def _apply_pipes(
|
||||||
make_doc: Callable[[str], Doc],
|
ensure_doc: Callable[[Union[str, Doc]], Doc],
|
||||||
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
||||||
receiver,
|
receiver,
|
||||||
sender,
|
sender,
|
||||||
|
@ -2146,7 +2168,8 @@ def _apply_pipes(
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Worker for Language.pipe
|
"""Worker for Language.pipe
|
||||||
|
|
||||||
make_doc (Callable[[str,] Doc]): Function to create Doc from text.
|
ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
|
||||||
|
or raise an error if the input is neither a Doc nor a string.
|
||||||
pipes (Iterable[Pipe]): The components to apply.
|
pipes (Iterable[Pipe]): The components to apply.
|
||||||
receiver (multiprocessing.Connection): Pipe to receive text. Usually
|
receiver (multiprocessing.Connection): Pipe to receive text. Usually
|
||||||
created by `multiprocessing.Pipe()`
|
created by `multiprocessing.Pipe()`
|
||||||
|
@ -2159,16 +2182,16 @@ def _apply_pipes(
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
texts = receiver.get()
|
texts = receiver.get()
|
||||||
docs = (make_doc(text) for text in texts)
|
docs = (ensure_doc(text) for text in texts)
|
||||||
for pipe in pipes:
|
for pipe in pipes:
|
||||||
docs = pipe(docs) # type: ignore[arg-type, assignment]
|
docs = pipe(docs) # type: ignore[arg-type, assignment]
|
||||||
# Connection does not accept unpickable objects, so send list.
|
# Connection does not accept unpickable objects, so send list.
|
||||||
byte_docs = [(doc.to_bytes(), None) for doc in docs]
|
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
|
||||||
padding = [(None, None)] * (len(texts) - len(byte_docs))
|
padding = [(None, None, None)] * (len(texts) - len(byte_docs))
|
||||||
sender.send(byte_docs + padding) # type: ignore[operator]
|
sender.send(byte_docs + padding) # type: ignore[operator]
|
||||||
except Exception:
|
except Exception:
|
||||||
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
|
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||||||
padding = [(None, None)] * (len(texts) - 1)
|
padding = [(None, None, None)] * (len(texts) - 1)
|
||||||
sender.send(error_msg + padding)
|
sender.send(error_msg + padding)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -284,7 +284,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lower]
|
return self.vocab.strings[self.c.lower]
|
||||||
|
|
||||||
def __set__(self, unicode x):
|
def __set__(self, str x):
|
||||||
self.c.lower = self.vocab.strings.add(x)
|
self.c.lower = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property norm_:
|
property norm_:
|
||||||
|
@ -294,7 +294,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.norm]
|
return self.vocab.strings[self.c.norm]
|
||||||
|
|
||||||
def __set__(self, unicode x):
|
def __set__(self, str x):
|
||||||
self.norm = self.vocab.strings.add(x)
|
self.norm = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
|
@ -304,7 +304,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.shape]
|
return self.vocab.strings[self.c.shape]
|
||||||
|
|
||||||
def __set__(self, unicode x):
|
def __set__(self, str x):
|
||||||
self.c.shape = self.vocab.strings.add(x)
|
self.c.shape = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property prefix_:
|
property prefix_:
|
||||||
|
@ -314,7 +314,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.prefix]
|
return self.vocab.strings[self.c.prefix]
|
||||||
|
|
||||||
def __set__(self, unicode x):
|
def __set__(self, str x):
|
||||||
self.c.prefix = self.vocab.strings.add(x)
|
self.c.prefix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property suffix_:
|
property suffix_:
|
||||||
|
@ -324,7 +324,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.suffix]
|
return self.vocab.strings[self.c.suffix]
|
||||||
|
|
||||||
def __set__(self, unicode x):
|
def __set__(self, str x):
|
||||||
self.c.suffix = self.vocab.strings.add(x)
|
self.c.suffix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property lang_:
|
property lang_:
|
||||||
|
@ -332,7 +332,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lang]
|
return self.vocab.strings[self.c.lang]
|
||||||
|
|
||||||
def __set__(self, unicode x):
|
def __set__(self, str x):
|
||||||
self.c.lang = self.vocab.strings.add(x)
|
self.c.lang = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property flags:
|
property flags:
|
||||||
|
|
|
@ -148,9 +148,9 @@ cdef class DependencyMatcher:
|
||||||
Creates a token key to be used by the matcher
|
Creates a token key to be used by the matcher
|
||||||
"""
|
"""
|
||||||
return self._normalize_key(
|
return self._normalize_key(
|
||||||
unicode(key) + DELIMITER +
|
str(key) + DELIMITER +
|
||||||
unicode(pattern_idx) + DELIMITER +
|
str(pattern_idx) + DELIMITER +
|
||||||
unicode(token_idx)
|
str(token_idx)
|
||||||
)
|
)
|
||||||
|
|
||||||
def add(self, key, patterns, *, on_match=None):
|
def add(self, key, patterns, *, on_match=None):
|
||||||
|
@ -424,7 +424,7 @@ cdef class DependencyMatcher:
|
||||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||||
|
|
||||||
def _normalize_key(self, key):
|
def _normalize_key(self, key):
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, str):
|
||||||
return self.vocab.strings.add(key)
|
return self.vocab.strings.add(key)
|
||||||
else:
|
else:
|
||||||
return key
|
return key
|
||||||
|
|
|
@ -312,7 +312,7 @@ cdef class Matcher:
|
||||||
return final_results
|
return final_results
|
||||||
|
|
||||||
def _normalize_key(self, key):
|
def _normalize_key(self, key):
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, str):
|
||||||
return self.vocab.strings.add(key)
|
return self.vocab.strings.add(key)
|
||||||
else:
|
else:
|
||||||
return key
|
return key
|
||||||
|
@ -360,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
for i, token in enumerate(doclike):
|
for i, token in enumerate(doclike):
|
||||||
for name, index in extensions.items():
|
for name, index in extensions.items():
|
||||||
value = token._.get(name)
|
value = token._.get(name)
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, str):
|
||||||
value = token.vocab.strings[value]
|
value = token.vocab.strings[value]
|
||||||
extra_attr_values[i * nr_extra_attr + index] = value
|
extra_attr_values[i * nr_extra_attr + index] = value
|
||||||
# Main loop
|
# Main loop
|
||||||
|
@ -786,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
||||||
def _get_attr_values(spec, string_store):
|
def _get_attr_values(spec, string_store):
|
||||||
attr_values = []
|
attr_values = []
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
if isinstance(attr, basestring):
|
if isinstance(attr, str):
|
||||||
attr = attr.upper()
|
attr = attr.upper()
|
||||||
if attr == '_':
|
if attr == '_':
|
||||||
continue
|
continue
|
||||||
|
@ -797,7 +797,7 @@ def _get_attr_values(spec, string_store):
|
||||||
if attr == "IS_SENT_START":
|
if attr == "IS_SENT_START":
|
||||||
attr = "SENT_START"
|
attr = "SENT_START"
|
||||||
attr = IDS.get(attr)
|
attr = IDS.get(attr)
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, str):
|
||||||
value = string_store.add(value)
|
value = string_store.add(value)
|
||||||
elif isinstance(value, bool):
|
elif isinstance(value, bool):
|
||||||
value = int(value)
|
value = int(value)
|
||||||
|
@ -938,7 +938,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||||
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
|
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
|
||||||
output = []
|
output = []
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
if isinstance(attr, basestring):
|
if isinstance(attr, str):
|
||||||
if attr == "_":
|
if attr == "_":
|
||||||
output.extend(
|
output.extend(
|
||||||
_get_extension_extra_predicates(
|
_get_extension_extra_predicates(
|
||||||
|
@ -995,7 +995,7 @@ def _get_operators(spec):
|
||||||
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
|
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
|
||||||
# Fix casing
|
# Fix casing
|
||||||
spec = {key.upper(): values for key, values in spec.items()
|
spec = {key.upper(): values for key, values in spec.items()
|
||||||
if isinstance(key, basestring)}
|
if isinstance(key, str)}
|
||||||
if "OP" not in spec:
|
if "OP" not in spec:
|
||||||
return (ONE,)
|
return (ONE,)
|
||||||
elif spec["OP"] in lookup:
|
elif spec["OP"] in lookup:
|
||||||
|
@ -1013,7 +1013,7 @@ def _get_extensions(spec, string_store, name2index):
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
# Handle predicates (e.g. "IN", in the extra_predicates, not here.
|
# Handle predicates (e.g. "IN", in the extra_predicates, not here.
|
||||||
continue
|
continue
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, str):
|
||||||
value = string_store.add(value)
|
value = string_store.add(value)
|
||||||
if name not in name2index:
|
if name not in name2index:
|
||||||
name2index[name] = len(name2index)
|
name2index[name] = len(name2index)
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
from typing import List, Tuple, Callable, Optional, cast
|
from typing import List, Tuple, Callable, Optional, Sequence, cast
|
||||||
from thinc.initializers import glorot_uniform_init
|
from thinc.initializers import glorot_uniform_init
|
||||||
from thinc.util import partial
|
from thinc.util import partial
|
||||||
from thinc.types import Ragged, Floats2d, Floats1d
|
from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
|
||||||
from thinc.api import Model, Ops, registry
|
from thinc.api import Model, Ops, registry
|
||||||
|
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..vectors import Mode
|
||||||
|
from ..vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
@registry.layers("spacy.StaticVectors.v2")
|
@registry.layers("spacy.StaticVectors.v2")
|
||||||
|
@ -34,20 +36,32 @@ def StaticVectors(
|
||||||
def forward(
|
def forward(
|
||||||
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
|
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
|
||||||
) -> Tuple[Ragged, Callable]:
|
) -> Tuple[Ragged, Callable]:
|
||||||
if not sum(len(doc) for doc in docs):
|
token_count = sum(len(doc) for doc in docs)
|
||||||
|
if not token_count:
|
||||||
return _handle_empty(model.ops, model.get_dim("nO"))
|
return _handle_empty(model.ops, model.get_dim("nO"))
|
||||||
key_attr = model.attrs["key_attr"]
|
key_attr: int = model.attrs["key_attr"]
|
||||||
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
keys: Ints1d = model.ops.flatten(
|
||||||
V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data))
|
cast(Sequence, [doc.to_array(key_attr) for doc in docs])
|
||||||
rows = model.ops.flatten(
|
|
||||||
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
|
|
||||||
)
|
)
|
||||||
|
vocab: Vocab = docs[0].vocab
|
||||||
|
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
||||||
|
if vocab.vectors.mode == Mode.default:
|
||||||
|
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
|
||||||
|
rows = vocab.vectors.find(keys=keys)
|
||||||
|
V = model.ops.as_contig(V[rows])
|
||||||
|
elif vocab.vectors.mode == Mode.floret:
|
||||||
|
V = cast(Floats2d, vocab.vectors.get_batch(keys))
|
||||||
|
V = model.ops.as_contig(V)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(Errors.E896)
|
||||||
try:
|
try:
|
||||||
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
|
vectors_data = model.ops.gemm(V, W, trans2=True)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise RuntimeError(Errors.E896)
|
raise RuntimeError(Errors.E896)
|
||||||
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
|
if vocab.vectors.mode == Mode.default:
|
||||||
vectors_data[rows < 0] = 0
|
# Convert negative indices to 0-vectors
|
||||||
|
# TODO: more options for UNK tokens
|
||||||
|
vectors_data[rows < 0] = 0
|
||||||
output = Ragged(
|
output = Ragged(
|
||||||
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
|
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -63,7 +77,7 @@ def forward(
|
||||||
model.inc_grad(
|
model.inc_grad(
|
||||||
"W",
|
"W",
|
||||||
model.ops.gemm(
|
model.ops.gemm(
|
||||||
cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True
|
cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
|
|
|
@ -17,7 +17,7 @@ from ...errors import Errors
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
|
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
|
||||||
|
|
||||||
DEF NON_MONOTONIC = True
|
DEF NON_MONOTONIC = True
|
||||||
|
|
||||||
|
|
|
@ -5,15 +5,15 @@ from pathlib import Path
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..training import validate_examples, Example
|
from ..training import Example
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..matcher import Matcher
|
from ..matcher import Matcher
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..symbols import IDS, TAG, POS, MORPH, LEMMA
|
from ..symbols import IDS
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList, registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
|
||||||
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||||
|
|
||||||
|
|
||||||
@Language.factory("attribute_ruler", default_config={"validate": False})
|
@Language.factory(
|
||||||
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
|
"attribute_ruler",
|
||||||
return AttributeRuler(nlp.vocab, name, validate=validate)
|
default_config={
|
||||||
|
"validate": False,
|
||||||
|
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def make_attribute_ruler(
|
||||||
|
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
|
||||||
|
):
|
||||||
|
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
|
||||||
|
|
||||||
|
|
||||||
|
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    """Score the token attributes "tag", "pos", "morph" and "lemma".

    examples (Iterable[Example]): The examples to score.
    **kwargs: Passed through unchanged to every Scorer call.
    RETURNS (Dict[str, Any]): The merged scores, produced by
        Scorer.score_token_attr for "tag", "pos", "morph" and "lemma" and by
        Scorer.score_token_attr_per_feat for "morph". Later results
        overwrite earlier ones on identical keys.
    """

    def morph_key_getter(token, attr):
        # Score the `.key` of the attribute value instead of the value itself.
        return getattr(token, attr).key

    scores = {}
    for attr_name in ("tag", "pos"):
        scores.update(Scorer.score_token_attr(examples, attr_name, **kwargs))
    # "morph" is scored twice: as a whole and per feature, both via its key.
    morph_scores = Scorer.score_token_attr(
        examples, "morph", getter=morph_key_getter, **kwargs
    )
    scores.update(morph_scores)
    morph_feat_scores = Scorer.score_token_attr_per_feat(
        examples, "morph", getter=morph_key_getter, **kwargs
    )
    scores.update(morph_feat_scores)
    scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.attribute_ruler_scorer.v1")
def make_attribute_ruler_scorer():
    """Registered factory for the attribute ruler scoring function.

    RETURNS (Callable): The `attribute_ruler_score` function.
    """
    return attribute_ruler_score
|
||||||
|
|
||||||
|
|
||||||
class AttributeRuler(Pipe):
|
class AttributeRuler(Pipe):
|
||||||
|
@ -36,7 +68,12 @@ class AttributeRuler(Pipe):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
|
self,
|
||||||
|
vocab: Vocab,
|
||||||
|
name: str = "attribute_ruler",
|
||||||
|
*,
|
||||||
|
validate: bool = False,
|
||||||
|
scorer: Optional[Callable] = attribute_ruler_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create the AttributeRuler. After creation, you can add patterns
|
"""Create the AttributeRuler. After creation, you can add patterns
|
||||||
with the `.initialize()` or `.add_patterns()` methods, or load patterns
|
with the `.initialize()` or `.add_patterns()` methods, or load patterns
|
||||||
|
@ -45,6 +82,10 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
vocab (Vocab): The vocab.
|
vocab (Vocab): The vocab.
|
||||||
name (str): The pipe name. Defaults to "attribute_ruler".
|
name (str): The pipe name. Defaults to "attribute_ruler".
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
|
||||||
|
"lemma" and Scorer.score_token_attr_per_feat for the attribute
|
||||||
|
"morph".
|
||||||
|
|
||||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||||
|
|
||||||
|
@ -57,6 +98,7 @@ class AttributeRuler(Pipe):
|
||||||
self.attrs: List[Dict] = []
|
self.attrs: List[Dict] = []
|
||||||
self._attrs_unnormed: List[Dict] = [] # store for reference
|
self._attrs_unnormed: List[Dict] = [] # store for reference
|
||||||
self.indices: List[int] = []
|
self.indices: List[int] = []
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
def clear(self) -> None:
|
def clear(self) -> None:
|
||||||
"""Reset all patterns."""
|
"""Reset all patterns."""
|
||||||
|
@ -228,45 +270,6 @@ class AttributeRuler(Pipe):
|
||||||
all_patterns.append(p)
|
all_patterns.append(p)
|
||||||
return all_patterns # type: ignore[return-value]
|
return all_patterns # type: ignore[return-value]
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by
|
|
||||||
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
|
||||||
and "lemma" for the target token attributes.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#score
|
|
||||||
"""
|
|
||||||
|
|
||||||
def morph_key_getter(token, attr):
|
|
||||||
return getattr(token, attr).key
|
|
||||||
|
|
||||||
validate_examples(examples, "AttributeRuler.score")
|
|
||||||
results = {}
|
|
||||||
attrs = set() # type: ignore
|
|
||||||
for token_attrs in self.attrs:
|
|
||||||
attrs.update(token_attrs)
|
|
||||||
for attr in attrs:
|
|
||||||
if attr == TAG:
|
|
||||||
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
|
||||||
elif attr == POS:
|
|
||||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
|
||||||
elif attr == MORPH:
|
|
||||||
results.update(
|
|
||||||
Scorer.score_token_attr(
|
|
||||||
examples, "morph", getter=morph_key_getter, **kwargs
|
|
||||||
)
|
|
||||||
)
|
|
||||||
results.update(
|
|
||||||
Scorer.score_token_attr_per_feat(
|
|
||||||
examples, "morph", getter=morph_key_getter, **kwargs
|
|
||||||
)
|
|
||||||
)
|
|
||||||
elif attr == LEMMA:
|
|
||||||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
|
||||||
return results
|
|
||||||
|
|
||||||
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||||
"""Serialize the AttributeRuler to a bytestring.
|
"""Serialize the AttributeRuler to a bytestring.
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Optional, Iterable
|
from typing import Optional, Iterable, Callable
|
||||||
from thinc.api import Model, Config
|
from thinc.api import Model, Config
|
||||||
|
|
||||||
from ._parser_internals.transition_system import TransitionSystem
|
from ._parser_internals.transition_system import TransitionSystem
|
||||||
|
@ -12,7 +12,7 @@ from ..language import Language
|
||||||
from ._parser_internals import nonproj
|
from ._parser_internals import nonproj
|
||||||
from ._parser_internals.nonproj import DELIMITER
|
from ._parser_internals.nonproj import DELIMITER
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
|
@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"learn_tokens": False,
|
"learn_tokens": False,
|
||||||
"min_action_freq": 30,
|
"min_action_freq": 30,
|
||||||
"model": DEFAULT_PARSER_MODEL,
|
"model": DEFAULT_PARSER_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"dep_uas": 0.5,
|
"dep_uas": 0.5,
|
||||||
|
@ -63,7 +64,8 @@ def make_parser(
|
||||||
moves: Optional[TransitionSystem],
|
moves: Optional[TransitionSystem],
|
||||||
update_with_oracle_cut_size: int,
|
update_with_oracle_cut_size: int,
|
||||||
learn_tokens: bool,
|
learn_tokens: bool,
|
||||||
min_action_freq: int
|
min_action_freq: int,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
"""Create a transition-based DependencyParser component. The dependency parser
|
"""Create a transition-based DependencyParser component. The dependency parser
|
||||||
jointly learns sentence segmentation and labelled dependency parsing, and can
|
jointly learns sentence segmentation and labelled dependency parsing, and can
|
||||||
|
@ -100,6 +102,7 @@ def make_parser(
|
||||||
primarily affects the label accuracy, it can also affect the attachment
|
primarily affects the label accuracy, it can also affect the attachment
|
||||||
structure, as the labels are used to represent the pseudo-projectivity
|
structure, as the labels are used to represent the pseudo-projectivity
|
||||||
transformation.
|
transformation.
|
||||||
|
scorer (Optional[Callable]): The scoring method.
|
||||||
"""
|
"""
|
||||||
return DependencyParser(
|
return DependencyParser(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
|
@ -115,7 +118,8 @@ def make_parser(
|
||||||
beam_update_prob=0.0,
|
beam_update_prob=0.0,
|
||||||
# At some point in the future we can try to implement support for
|
# At some point in the future we can try to implement support for
|
||||||
# partial annotations, perhaps only in the beam objective.
|
# partial annotations, perhaps only in the beam objective.
|
||||||
incorrect_spans_key=None
|
incorrect_spans_key=None,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
|
@ -130,6 +134,7 @@ def make_parser(
|
||||||
"learn_tokens": False,
|
"learn_tokens": False,
|
||||||
"min_action_freq": 30,
|
"min_action_freq": 30,
|
||||||
"model": DEFAULT_PARSER_MODEL,
|
"model": DEFAULT_PARSER_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"dep_uas": 0.5,
|
"dep_uas": 0.5,
|
||||||
|
@ -151,6 +156,7 @@ def make_beam_parser(
|
||||||
beam_width: int,
|
beam_width: int,
|
||||||
beam_density: float,
|
beam_density: float,
|
||||||
beam_update_prob: float,
|
beam_update_prob: float,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
"""Create a transition-based DependencyParser component that uses beam-search.
|
"""Create a transition-based DependencyParser component that uses beam-search.
|
||||||
The dependency parser jointly learns sentence segmentation and labelled
|
The dependency parser jointly learns sentence segmentation and labelled
|
||||||
|
@ -207,10 +213,41 @@ def make_beam_parser(
|
||||||
min_action_freq=min_action_freq,
|
min_action_freq=min_action_freq,
|
||||||
# At some point in the future we can try to implement support for
|
# At some point in the future we can try to implement support for
|
||||||
# partial annotations, perhaps only in the beam objective.
|
# partial annotations, perhaps only in the beam objective.
|
||||||
incorrect_spans_key=None
|
incorrect_spans_key=None,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parser_score(examples, **kwargs):
    """Score sentence spans and dependencies for a batch of examples.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Extra scorer settings. Scorer.score_spans receives them as
        given; for Scorer.score_deps, "getter" and "ignore_labels" are
        filled in with defaults when the caller did not provide them.
    RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
        and Scorer.score_deps, with the "sents_per_type" entry removed.

    DOCS: https://spacy.io/api/dependencyparser#score
    """

    def has_sents(doc):
        return doc.has_annotation("SENT_START")

    def dep_getter(token, attr):
        # Resolve the attribute to its string form and lowercase it.
        label = token.vocab.strings.as_string(getattr(token, attr))
        return label.lower()

    scores = {}
    sent_scores = Scorer.score_spans(
        examples, "sents", has_annotation=has_sents, **kwargs
    )
    scores.update(sent_scores)
    # The defaults are added only now, so the span scoring above never sees them.
    kwargs.setdefault("getter", dep_getter)
    kwargs.setdefault("ignore_labels", ("p", "punct"))
    dep_scores = Scorer.score_deps(examples, "dep", **kwargs)
    scores.update(dep_scores)
    del scores["sents_per_type"]
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.parser_scorer.v1")
|
||||||
|
def make_parser_scorer():
|
||||||
|
return parser_score
|
||||||
|
|
||||||
|
|
||||||
cdef class DependencyParser(Parser):
|
cdef class DependencyParser(Parser):
|
||||||
"""Pipeline component for dependency parsing.
|
"""Pipeline component for dependency parsing.
|
||||||
|
|
||||||
|
@ -233,6 +270,7 @@ cdef class DependencyParser(Parser):
|
||||||
beam_update_prob=0.0,
|
beam_update_prob=0.0,
|
||||||
multitasks=tuple(),
|
multitasks=tuple(),
|
||||||
incorrect_spans_key=None,
|
incorrect_spans_key=None,
|
||||||
|
scorer=parser_score,
|
||||||
):
|
):
|
||||||
"""Create a DependencyParser.
|
"""Create a DependencyParser.
|
||||||
"""
|
"""
|
||||||
|
@ -249,6 +287,7 @@ cdef class DependencyParser(Parser):
|
||||||
beam_update_prob=beam_update_prob,
|
beam_update_prob=beam_update_prob,
|
||||||
multitasks=multitasks,
|
multitasks=multitasks,
|
||||||
incorrect_spans_key=incorrect_spans_key,
|
incorrect_spans_key=incorrect_spans_key,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -281,31 +320,6 @@ cdef class DependencyParser(Parser):
|
||||||
labels.add(label)
|
labels.add(label)
|
||||||
return tuple(sorted(labels))
|
return tuple(sorted(labels))
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
|
||||||
and Scorer.score_deps.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/dependencyparser#score
|
|
||||||
"""
|
|
||||||
def has_sents(doc):
|
|
||||||
return doc.has_annotation("SENT_START")
|
|
||||||
|
|
||||||
validate_examples(examples, "DependencyParser.score")
|
|
||||||
def dep_getter(token, attr):
|
|
||||||
dep = getattr(token, attr)
|
|
||||||
dep = token.vocab.strings.as_string(dep).lower()
|
|
||||||
return dep
|
|
||||||
results = {}
|
|
||||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
|
||||||
kwargs.setdefault("getter", dep_getter)
|
|
||||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
|
||||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
|
||||||
del results["sents_per_type"]
|
|
||||||
return results
|
|
||||||
|
|
||||||
def scored_parses(self, beams):
|
def scored_parses(self, beams):
|
||||||
"""Return two dictionaries with scores for each beam/doc that was processed:
|
"""Return two dictionaries with scores for each beam/doc that was processed:
|
||||||
one containing (i, head) keys, and another containing (i, label) keys.
|
one containing (i, head) keys, and another containing (i, label) keys.
|
||||||
|
|
|
@ -17,10 +17,12 @@ from ..language import Language
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..training import Example, validate_examples, validate_get_examples
|
from ..training import Example, validate_examples, validate_get_examples
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList, registry
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
|
||||||
|
# See #9050
|
||||||
|
BACKWARD_OVERWRITE = True
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
|
@ -51,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"incl_context": True,
|
"incl_context": True,
|
||||||
"entity_vector_length": 64,
|
"entity_vector_length": 64,
|
||||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
|
"overwrite": True,
|
||||||
|
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"nel_micro_f": 1.0,
|
"nel_micro_f": 1.0,
|
||||||
|
@ -69,6 +73,8 @@ def make_entity_linker(
|
||||||
incl_context: bool,
|
incl_context: bool,
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
"""Construct an EntityLinker component.
|
"""Construct an EntityLinker component.
|
||||||
|
|
||||||
|
@ -82,6 +88,7 @@ def make_entity_linker(
|
||||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||||
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
|
scorer (Optional[Callable]): The scoring method.
|
||||||
"""
|
"""
|
||||||
return EntityLinker(
|
return EntityLinker(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
|
@ -93,9 +100,20 @@ def make_entity_linker(
|
||||||
incl_context=incl_context,
|
incl_context=incl_context,
|
||||||
entity_vector_length=entity_vector_length,
|
entity_vector_length=entity_vector_length,
|
||||||
get_candidates=get_candidates,
|
get_candidates=get_candidates,
|
||||||
|
overwrite=overwrite,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def entity_linker_score(examples, **kwargs):
|
||||||
|
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.entity_linker_scorer.v1")
|
||||||
|
def make_entity_linker_scorer():
|
||||||
|
return entity_linker_score
|
||||||
|
|
||||||
|
|
||||||
class EntityLinker(TrainablePipe):
|
class EntityLinker(TrainablePipe):
|
||||||
"""Pipeline component for named entity linking.
|
"""Pipeline component for named entity linking.
|
||||||
|
|
||||||
|
@ -116,6 +134,8 @@ class EntityLinker(TrainablePipe):
|
||||||
incl_context: bool,
|
incl_context: bool,
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||||
|
overwrite: bool = BACKWARD_OVERWRITE,
|
||||||
|
scorer: Optional[Callable] = entity_linker_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize an entity linker.
|
"""Initialize an entity linker.
|
||||||
|
|
||||||
|
@ -130,6 +150,8 @@ class EntityLinker(TrainablePipe):
|
||||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_links.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#init
|
DOCS: https://spacy.io/api/entitylinker#init
|
||||||
"""
|
"""
|
||||||
|
@ -141,11 +163,12 @@ class EntityLinker(TrainablePipe):
|
||||||
self.incl_prior = incl_prior
|
self.incl_prior = incl_prior
|
||||||
self.incl_context = incl_context
|
self.incl_context = incl_context
|
||||||
self.get_candidates = get_candidates
|
self.get_candidates = get_candidates
|
||||||
self.cfg: Dict[str, Any] = {}
|
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||||
self.distance = CosineDistance(normalize=False)
|
self.distance = CosineDistance(normalize=False)
|
||||||
# how many neighbour sentences to take into account
|
# how many neighbour sentences to take into account
|
||||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||||
"""Define the KB of this pipe by providing a function that will
|
"""Define the KB of this pipe by providing a function that will
|
||||||
|
@ -384,23 +407,14 @@ class EntityLinker(TrainablePipe):
|
||||||
if count_ents != len(kb_ids):
|
if count_ents != len(kb_ids):
|
||||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||||
i = 0
|
i = 0
|
||||||
|
overwrite = self.cfg["overwrite"]
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
kb_id = kb_ids[i]
|
kb_id = kb_ids[i]
|
||||||
i += 1
|
i += 1
|
||||||
for token in ent:
|
for token in ent:
|
||||||
token.ent_kb_id_ = kb_id
|
if token.ent_kb_id == 0 or overwrite:
|
||||||
|
token.ent_kb_id_ = kb_id
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores.
|
|
||||||
|
|
||||||
DOCS TODO: https://spacy.io/api/entity_linker#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "EntityLinker.score")
|
|
||||||
return Scorer.score_links(examples, negative_labels=[self.NIL])
|
|
||||||
|
|
||||||
def to_bytes(self, *, exclude=tuple()):
|
def to_bytes(self, *, exclude=tuple()):
|
||||||
"""Serialize the pipe to a bytestring.
|
"""Serialize the pipe to a bytestring.
|
||||||
|
|
|
@ -9,11 +9,10 @@ from .pipe import Pipe
|
||||||
from ..training import Example
|
from ..training import Example
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
|
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
from ..scorer import get_ner_prf
|
from ..scorer import get_ner_prf
|
||||||
from ..training import validate_examples
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_ENT_ID_SEP = "||"
|
DEFAULT_ENT_ID_SEP = "||"
|
||||||
|
@ -28,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||||
"validate": False,
|
"validate": False,
|
||||||
"overwrite_ents": False,
|
"overwrite_ents": False,
|
||||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||||
|
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"ents_f": 1.0,
|
"ents_f": 1.0,
|
||||||
|
@ -43,6 +43,7 @@ def make_entity_ruler(
|
||||||
validate: bool,
|
validate: bool,
|
||||||
overwrite_ents: bool,
|
overwrite_ents: bool,
|
||||||
ent_id_sep: str,
|
ent_id_sep: str,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return EntityRuler(
|
return EntityRuler(
|
||||||
nlp,
|
nlp,
|
||||||
|
@ -51,9 +52,19 @@ def make_entity_ruler(
|
||||||
validate=validate,
|
validate=validate,
|
||||||
overwrite_ents=overwrite_ents,
|
overwrite_ents=overwrite_ents,
|
||||||
ent_id_sep=ent_id_sep,
|
ent_id_sep=ent_id_sep,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def entity_ruler_score(examples, **kwargs):
|
||||||
|
return get_ner_prf(examples)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.entity_ruler_scorer.v1")
|
||||||
|
def make_entity_ruler_scorer():
|
||||||
|
return entity_ruler_score
|
||||||
|
|
||||||
|
|
||||||
class EntityRuler(Pipe):
|
class EntityRuler(Pipe):
|
||||||
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
|
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
|
||||||
rules or exact phrase matches. It can be combined with the statistical
|
rules or exact phrase matches. It can be combined with the statistical
|
||||||
|
@ -75,6 +86,7 @@ class EntityRuler(Pipe):
|
||||||
overwrite_ents: bool = False,
|
overwrite_ents: bool = False,
|
||||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||||
patterns: Optional[List[PatternType]] = None,
|
patterns: Optional[List[PatternType]] = None,
|
||||||
|
scorer: Optional[Callable] = entity_ruler_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the entity ruler. If patterns are supplied here, they
|
"""Initialize the entity ruler. If patterns are supplied here, they
|
||||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||||
|
@ -95,6 +107,8 @@ class EntityRuler(Pipe):
|
||||||
overwrite_ents (bool): If existing entities are present, e.g. entities
|
overwrite_ents (bool): If existing entities are present, e.g. entities
|
||||||
added by the model, overwrite them by matches if necessary.
|
added by the model, overwrite them by matches if necessary.
|
||||||
ent_id_sep (str): Separator used internally for entity IDs.
|
ent_id_sep (str): Separator used internally for entity IDs.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
spacy.scorer.get_ner_prf.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#init
|
DOCS: https://spacy.io/api/entityruler#init
|
||||||
"""
|
"""
|
||||||
|
@ -113,6 +127,7 @@ class EntityRuler(Pipe):
|
||||||
self._ent_ids = defaultdict(tuple) # type: ignore
|
self._ent_ids = defaultdict(tuple) # type: ignore
|
||||||
if patterns is not None:
|
if patterns is not None:
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
"""The number of all patterns added to the entity ruler."""
|
"""The number of all patterns added to the entity ruler."""
|
||||||
|
@ -363,10 +378,6 @@ class EntityRuler(Pipe):
|
||||||
label = f"{label}{self.ent_id_sep}{ent_id}"
|
label = f"{label}{self.ent_id_sep}{ent_id}"
|
||||||
return label
|
return label
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
validate_examples(examples, "EntityRuler.score")
|
|
||||||
return get_ner_prf(examples)
|
|
||||||
|
|
||||||
def from_bytes(
|
def from_bytes(
|
||||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||||
) -> "EntityRuler":
|
) -> "EntityRuler":
|
||||||
|
|
|
@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..tokens import Doc, Token
|
from ..tokens import Doc, Token
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..training import validate_examples
|
from ..util import logger, SimpleFrozenList, registry
|
||||||
from ..util import logger, SimpleFrozenList
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={"model": None, "mode": "lookup", "overwrite": False},
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "lookup",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_lemmatizer(
|
def make_lemmatizer(
|
||||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
return Lemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
|
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.lemmatizer_scorer.v1")
|
||||||
|
def make_lemmatizer_scorer():
|
||||||
|
return lemmatizer_score
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(Pipe):
|
class Lemmatizer(Pipe):
|
||||||
|
@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
|
||||||
*,
|
*,
|
||||||
mode: str = "lookup",
|
mode: str = "lookup",
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
|
scorer: Optional[Callable] = lemmatizer_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize a Lemmatizer.
|
"""Initialize a Lemmatizer.
|
||||||
|
|
||||||
|
@ -69,6 +90,8 @@ class Lemmatizer(Pipe):
|
||||||
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
|
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
|
||||||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||||
`False`.
|
`False`.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_token_attr for the attribute "lemma".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#init
|
DOCS: https://spacy.io/api/lemmatizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -89,6 +112,7 @@ class Lemmatizer(Pipe):
|
||||||
raise ValueError(Errors.E1003.format(mode=mode))
|
raise ValueError(Errors.E1003.format(mode=mode))
|
||||||
self.lemmatize = getattr(self, mode_attr)
|
self.lemmatize = getattr(self, mode_attr)
|
||||||
self.cache = {} # type: ignore[var-annotated]
|
self.cache = {} # type: ignore[var-annotated]
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def mode(self):
|
def mode(self):
|
||||||
|
@ -247,17 +271,6 @@ class Lemmatizer(Pipe):
|
||||||
"""
|
"""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "Lemmatizer.score")
|
|
||||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
|
||||||
|
|
||||||
def to_disk(
|
def to_disk(
|
||||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||||
):
|
):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from typing import Optional, Union, Dict
|
from typing import Optional, Union, Dict, Callable
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
@ -17,7 +17,11 @@ from .tagger import Tagger
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
|
from ..util import registry
|
||||||
|
|
||||||
|
# See #9050
|
||||||
|
BACKWARD_OVERWRITE = True
|
||||||
|
BACKWARD_EXTEND = False
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
|
@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"morphologizer",
|
"morphologizer",
|
||||||
assigns=["token.morph", "token.pos"],
|
assigns=["token.morph", "token.pos"],
|
||||||
default_config={"model": DEFAULT_MORPH_MODEL},
|
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
||||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||||
)
|
)
|
||||||
def make_morphologizer(
|
def make_morphologizer(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
model: Model,
|
model: Model,
|
||||||
name: str,
|
name: str,
|
||||||
|
overwrite: bool,
|
||||||
|
extend: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Morphologizer(nlp.vocab, model, name)
|
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
|
||||||
|
|
||||||
|
|
||||||
|
def morphologizer_score(examples, **kwargs):
|
||||||
|
def morph_key_getter(token, attr):
|
||||||
|
return getattr(token, attr).key
|
||||||
|
|
||||||
|
results = {}
|
||||||
|
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||||
|
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||||
|
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||||
|
"morph", getter=morph_key_getter, **kwargs))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.morphologizer_scorer.v1")
|
||||||
|
def make_morphologizer_scorer():
|
||||||
|
return morphologizer_score
|
||||||
|
|
||||||
|
|
||||||
class Morphologizer(Tagger):
|
class Morphologizer(Tagger):
|
||||||
|
@ -67,6 +91,10 @@ class Morphologizer(Tagger):
|
||||||
vocab: Vocab,
|
vocab: Vocab,
|
||||||
model: Model,
|
model: Model,
|
||||||
name: str = "morphologizer",
|
name: str = "morphologizer",
|
||||||
|
*,
|
||||||
|
overwrite: bool = BACKWARD_OVERWRITE,
|
||||||
|
extend: bool = BACKWARD_EXTEND,
|
||||||
|
scorer: Optional[Callable] = morphologizer_score,
|
||||||
):
|
):
|
||||||
"""Initialize a morphologizer.
|
"""Initialize a morphologizer.
|
||||||
|
|
||||||
|
@ -74,6 +102,9 @@ class Morphologizer(Tagger):
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||||
|
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#init
|
DOCS: https://spacy.io/api/morphologizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -85,8 +116,14 @@ class Morphologizer(Tagger):
|
||||||
# store mappings from morph+POS labels to token-level annotations:
|
# store mappings from morph+POS labels to token-level annotations:
|
||||||
# 1) labels_morph stores a mapping from morph+POS->morph
|
# 1) labels_morph stores a mapping from morph+POS->morph
|
||||||
# 2) labels_pos stores a mapping from morph+POS->POS
|
# 2) labels_pos stores a mapping from morph+POS->POS
|
||||||
cfg = {"labels_morph": {}, "labels_pos": {}}
|
cfg = {
|
||||||
|
"labels_morph": {},
|
||||||
|
"labels_pos": {},
|
||||||
|
"overwrite": overwrite,
|
||||||
|
"extend": extend,
|
||||||
|
}
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -192,14 +229,34 @@ class Morphologizer(Tagger):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
|
cdef bint extend = self.cfg["extend"]
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
morph = self.labels[tag_id]
|
morph = self.labels[tag_id]
|
||||||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
|
# set morph
|
||||||
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
|
if doc.c[j].morph == 0 or overwrite or extend:
|
||||||
|
if overwrite and extend:
|
||||||
|
# morphologizer morph overwrites any existing features
|
||||||
|
# while extending
|
||||||
|
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
|
||||||
|
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
|
||||||
|
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
|
||||||
|
elif extend:
|
||||||
|
# existing features are preserved and any new features
|
||||||
|
# are added
|
||||||
|
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
|
||||||
|
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
|
||||||
|
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
|
||||||
|
else:
|
||||||
|
# clobber
|
||||||
|
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
|
||||||
|
# set POS
|
||||||
|
if doc.c[j].pos == 0 or overwrite:
|
||||||
|
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
"""Find the loss and gradient of loss for the batch of documents and
|
"""Find the loss and gradient of loss for the batch of documents and
|
||||||
|
@ -246,24 +303,3 @@ class Morphologizer(Tagger):
|
||||||
if self.model.ops.xp.isnan(loss):
|
if self.model.ops.xp.isnan(loss):
|
||||||
raise ValueError(Errors.E910.format(name=self.name))
|
raise ValueError(Errors.E910.format(name=self.name))
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by
|
|
||||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
|
||||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#score
|
|
||||||
"""
|
|
||||||
def morph_key_getter(token, attr):
|
|
||||||
return getattr(token, attr).key
|
|
||||||
|
|
||||||
validate_examples(examples, "Morphologizer.score")
|
|
||||||
results = {}
|
|
||||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
|
||||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
|
||||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
|
||||||
"morph", getter=morph_key_getter, **kwargs))
|
|
||||||
return results
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Optional, Iterable
|
from typing import Optional, Iterable, Callable
|
||||||
from thinc.api import Model, Config
|
from thinc.api import Model, Config
|
||||||
|
|
||||||
from ._parser_internals.transition_system import TransitionSystem
|
from ._parser_internals.transition_system import TransitionSystem
|
||||||
|
@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import get_ner_prf, PRFScore
|
from ..scorer import get_ner_prf, PRFScore
|
||||||
from ..training import validate_examples
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
|
@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"moves": None,
|
"moves": None,
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
"model": DEFAULT_NER_MODEL,
|
"model": DEFAULT_NER_MODEL,
|
||||||
"incorrect_spans_key": None
|
"incorrect_spans_key": None,
|
||||||
|
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||||
|
|
||||||
|
@ -52,7 +53,8 @@ def make_ner(
|
||||||
model: Model,
|
model: Model,
|
||||||
moves: Optional[TransitionSystem],
|
moves: Optional[TransitionSystem],
|
||||||
update_with_oracle_cut_size: int,
|
update_with_oracle_cut_size: int,
|
||||||
incorrect_spans_key: Optional[str]=None
|
incorrect_spans_key: Optional[str],
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
"""Create a transition-based EntityRecognizer component. The entity recognizer
|
"""Create a transition-based EntityRecognizer component. The entity recognizer
|
||||||
identifies non-overlapping labelled spans of tokens.
|
identifies non-overlapping labelled spans of tokens.
|
||||||
|
@ -80,6 +82,7 @@ def make_ner(
|
||||||
incorrect_spans_key (Optional[str]): Identifies spans that are known
|
incorrect_spans_key (Optional[str]): Identifies spans that are known
|
||||||
to be incorrect entity annotations. The incorrect entity annotations
|
to be incorrect entity annotations. The incorrect entity annotations
|
||||||
can be stored in the span group, under this key.
|
can be stored in the span group, under this key.
|
||||||
|
scorer (Optional[Callable]): The scoring method.
|
||||||
"""
|
"""
|
||||||
return EntityRecognizer(
|
return EntityRecognizer(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
|
@ -92,6 +95,7 @@ def make_ner(
|
||||||
beam_width=1,
|
beam_width=1,
|
||||||
beam_density=0.0,
|
beam_density=0.0,
|
||||||
beam_update_prob=0.0,
|
beam_update_prob=0.0,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
|
@ -104,7 +108,8 @@ def make_ner(
|
||||||
"beam_density": 0.01,
|
"beam_density": 0.01,
|
||||||
"beam_update_prob": 0.5,
|
"beam_update_prob": 0.5,
|
||||||
"beam_width": 32,
|
"beam_width": 32,
|
||||||
"incorrect_spans_key": None
|
"incorrect_spans_key": None,
|
||||||
|
"scorer": None,
|
||||||
},
|
},
|
||||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||||
)
|
)
|
||||||
|
@ -117,7 +122,8 @@ def make_beam_ner(
|
||||||
beam_width: int,
|
beam_width: int,
|
||||||
beam_density: float,
|
beam_density: float,
|
||||||
beam_update_prob: float,
|
beam_update_prob: float,
|
||||||
incorrect_spans_key: Optional[str]=None
|
incorrect_spans_key: Optional[str],
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
"""Create a transition-based EntityRecognizer component that uses beam-search.
|
"""Create a transition-based EntityRecognizer component that uses beam-search.
|
||||||
The entity recognizer identifies non-overlapping labelled spans of tokens.
|
The entity recognizer identifies non-overlapping labelled spans of tokens.
|
||||||
|
@ -153,6 +159,7 @@ def make_beam_ner(
|
||||||
and are faster to compute.
|
and are faster to compute.
|
||||||
incorrect_spans_key (Optional[str]): Optional key into span groups of
|
incorrect_spans_key (Optional[str]): Optional key into span groups of
|
||||||
entities known to be non-entities.
|
entities known to be non-entities.
|
||||||
|
scorer (Optional[Callable]): The scoring method.
|
||||||
"""
|
"""
|
||||||
return EntityRecognizer(
|
return EntityRecognizer(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
|
@ -164,10 +171,20 @@ def make_beam_ner(
|
||||||
beam_width=beam_width,
|
beam_width=beam_width,
|
||||||
beam_density=beam_density,
|
beam_density=beam_density,
|
||||||
beam_update_prob=beam_update_prob,
|
beam_update_prob=beam_update_prob,
|
||||||
incorrect_spans_key=incorrect_spans_key
|
incorrect_spans_key=incorrect_spans_key,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def ner_score(examples, **kwargs):
|
||||||
|
return get_ner_prf(examples, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.ner_scorer.v1")
|
||||||
|
def make_ner_scorer():
|
||||||
|
return ner_score
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(Parser):
|
cdef class EntityRecognizer(Parser):
|
||||||
"""Pipeline component for named entity recognition.
|
"""Pipeline component for named entity recognition.
|
||||||
|
|
||||||
|
@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser):
|
||||||
beam_update_prob=0.0,
|
beam_update_prob=0.0,
|
||||||
multitasks=tuple(),
|
multitasks=tuple(),
|
||||||
incorrect_spans_key=None,
|
incorrect_spans_key=None,
|
||||||
|
scorer=ner_score,
|
||||||
):
|
):
|
||||||
"""Create an EntityRecognizer.
|
"""Create an EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
|
@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser):
|
||||||
beam_update_prob=beam_update_prob,
|
beam_update_prob=beam_update_prob,
|
||||||
multitasks=multitasks,
|
multitasks=multitasks,
|
||||||
incorrect_spans_key=incorrect_spans_key,
|
incorrect_spans_key=incorrect_spans_key,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
def add_multitask_objective(self, mt_component):
|
def add_multitask_objective(self, mt_component):
|
||||||
|
@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser):
|
||||||
if move[0] in ("B", "I", "L", "U"))
|
if move[0] in ("B", "I", "L", "U"))
|
||||||
return tuple(sorted(labels))
|
return tuple(sorted(labels))
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityrecognizer#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "EntityRecognizer.score")
|
|
||||||
return get_ner_prf(examples)
|
|
||||||
|
|
||||||
def scored_ents(self, beams):
|
def scored_ents(self, beams):
|
||||||
"""Return a dictionary of (start, end, label) tuples with corresponding scores
|
"""Return a dictionary of (start, end, label) tuples with corresponding scores
|
||||||
for each beam/doc that was processed.
|
for each beam/doc that was processed.
|
||||||
|
|
|
@ -81,6 +81,17 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#score
|
DOCS: https://spacy.io/api/pipe#score
|
||||||
"""
|
"""
|
||||||
|
if hasattr(self, "scorer") and self.scorer is not None:
|
||||||
|
scorer_kwargs = {}
|
||||||
|
# use default settings from cfg (e.g., threshold)
|
||||||
|
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
|
||||||
|
scorer_kwargs.update(self.cfg)
|
||||||
|
# override self.cfg["labels"] with self.labels
|
||||||
|
if hasattr(self, "labels"):
|
||||||
|
scorer_kwargs["labels"] = self.labels
|
||||||
|
# override with kwargs settings
|
||||||
|
scorer_kwargs.update(kwargs)
|
||||||
|
return self.scorer(examples, **scorer_kwargs)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -1,26 +1,32 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from typing import Optional, List
|
from typing import Optional, List, Callable
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
|
from .senter import senter_score
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
# see #9050
|
||||||
|
BACKWARD_OVERWRITE = False
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"sentencizer",
|
"sentencizer",
|
||||||
assigns=["token.is_sent_start", "doc.sents"],
|
assigns=["token.is_sent_start", "doc.sents"],
|
||||||
default_config={"punct_chars": None},
|
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||||
)
|
)
|
||||||
def make_sentencizer(
|
def make_sentencizer(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
name: str,
|
name: str,
|
||||||
punct_chars: Optional[List[str]]
|
punct_chars: Optional[List[str]],
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
):
|
):
|
||||||
return Sentencizer(name, punct_chars=punct_chars)
|
return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
|
||||||
|
|
||||||
|
|
||||||
class Sentencizer(Pipe):
|
class Sentencizer(Pipe):
|
||||||
|
@ -41,12 +47,20 @@ class Sentencizer(Pipe):
|
||||||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||||
'。', '。']
|
'。', '。']
|
||||||
|
|
||||||
def __init__(self, name="sentencizer", *, punct_chars=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
name="sentencizer",
|
||||||
|
*,
|
||||||
|
punct_chars=None,
|
||||||
|
overwrite=BACKWARD_OVERWRITE,
|
||||||
|
scorer=senter_score,
|
||||||
|
):
|
||||||
"""Initialize the sentencizer.
|
"""Initialize the sentencizer.
|
||||||
|
|
||||||
punct_chars (list): Punctuation characters to split on. Will be
|
punct_chars (list): Punctuation characters to split on. Will be
|
||||||
serialized with the nlp object.
|
serialized with the nlp object.
|
||||||
RETURNS (Sentencizer): The sentencizer component.
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_spans for the attribute "sents".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#init
|
DOCS: https://spacy.io/api/sentencizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -55,6 +69,8 @@ class Sentencizer(Pipe):
|
||||||
self.punct_chars = set(punct_chars)
|
self.punct_chars = set(punct_chars)
|
||||||
else:
|
else:
|
||||||
self.punct_chars = set(self.default_punct_chars)
|
self.punct_chars = set(self.default_punct_chars)
|
||||||
|
self.overwrite = overwrite
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||||
|
@ -115,29 +131,12 @@ class Sentencizer(Pipe):
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
# Don't clobber existing sentence boundaries
|
if doc.c[j].sent_start == 0 or self.overwrite:
|
||||||
if doc.c[j].sent_start == 0:
|
|
||||||
if tag_id:
|
if tag_id:
|
||||||
doc.c[j].sent_start = 1
|
doc.c[j].sent_start = 1
|
||||||
else:
|
else:
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#score
|
|
||||||
"""
|
|
||||||
def has_sents(doc):
|
|
||||||
return doc.has_annotation("SENT_START")
|
|
||||||
|
|
||||||
validate_examples(examples, "Sentencizer.score")
|
|
||||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
|
||||||
del results["sents_per_type"]
|
|
||||||
return results
|
|
||||||
|
|
||||||
def to_bytes(self, *, exclude=tuple()):
|
def to_bytes(self, *, exclude=tuple()):
|
||||||
"""Serialize the sentencizer to a bytestring.
|
"""Serialize the sentencizer to a bytestring.
|
||||||
|
|
||||||
|
@ -145,7 +144,7 @@ class Sentencizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#to_bytes
|
DOCS: https://spacy.io/api/sentencizer#to_bytes
|
||||||
"""
|
"""
|
||||||
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
|
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||||
"""Load the sentencizer from a bytestring.
|
"""Load the sentencizer from a bytestring.
|
||||||
|
@ -157,6 +156,7 @@ class Sentencizer(Pipe):
|
||||||
"""
|
"""
|
||||||
cfg = srsly.msgpack_loads(bytes_data)
|
cfg = srsly.msgpack_loads(bytes_data)
|
||||||
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
||||||
|
self.overwrite = cfg.get("overwrite", self.overwrite)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, *, exclude=tuple()):
|
def to_disk(self, path, *, exclude=tuple()):
|
||||||
|
@ -166,7 +166,7 @@ class Sentencizer(Pipe):
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
path = path.with_suffix(".json")
|
path = path.with_suffix(".json")
|
||||||
srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
|
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||||
|
|
||||||
|
|
||||||
def from_disk(self, path, *, exclude=tuple()):
|
def from_disk(self, path, *, exclude=tuple()):
|
||||||
|
@ -178,4 +178,5 @@ class Sentencizer(Pipe):
|
||||||
path = path.with_suffix(".json")
|
path = path.with_suffix(".json")
|
||||||
cfg = srsly.read_json(path)
|
cfg = srsly.read_json(path)
|
||||||
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
||||||
|
self.overwrite = cfg.get("overwrite", self.overwrite)
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
from typing import Optional, Callable
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
||||||
|
@ -11,8 +12,11 @@ from ..language import Language
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
# See #9050
|
||||||
|
BACKWARD_OVERWRITE = False
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
|
@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"senter",
|
"senter",
|
||||||
assigns=["token.is_sent_start"],
|
assigns=["token.is_sent_start"],
|
||||||
default_config={"model": DEFAULT_SENTER_MODEL},
|
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||||
)
|
)
|
||||||
def make_senter(nlp: Language, name: str, model: Model):
|
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
|
||||||
return SentenceRecognizer(nlp.vocab, model, name)
|
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
|
||||||
|
|
||||||
|
|
||||||
|
def senter_score(examples, **kwargs):
|
||||||
|
def has_sents(doc):
|
||||||
|
return doc.has_annotation("SENT_START")
|
||||||
|
|
||||||
|
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||||
|
del results["sents_per_type"]
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.senter_scorer.v1")
|
||||||
|
def make_senter_scorer():
|
||||||
|
return senter_score
|
||||||
|
|
||||||
|
|
||||||
class SentenceRecognizer(Tagger):
|
class SentenceRecognizer(Tagger):
|
||||||
|
@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer
|
DOCS: https://spacy.io/api/sentencerecognizer
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="senter"):
|
def __init__(
|
||||||
|
self,
|
||||||
|
vocab,
|
||||||
|
model,
|
||||||
|
name="senter",
|
||||||
|
*,
|
||||||
|
overwrite=BACKWARD_OVERWRITE,
|
||||||
|
scorer=senter_score,
|
||||||
|
):
|
||||||
"""Initialize a sentence recognizer.
|
"""Initialize a sentence recognizer.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_spans for the attribute "sents".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#init
|
DOCS: https://spacy.io/api/sentencerecognizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
self.cfg = {}
|
self.cfg = {"overwrite": overwrite}
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
# Don't clobber existing sentence boundaries
|
if doc.c[j].sent_start == 0 or overwrite:
|
||||||
if doc.c[j].sent_start == 0:
|
|
||||||
if tag_id == 1:
|
if tag_id == 1:
|
||||||
doc.c[j].sent_start = 1
|
doc.c[j].sent_start = 1
|
||||||
else:
|
else:
|
||||||
|
@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
def add_label(self, label, values=None):
|
def add_label(self, label, values=None):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#score
|
|
||||||
"""
|
|
||||||
def has_sents(doc):
|
|
||||||
return doc.has_annotation("SENT_START")
|
|
||||||
|
|
||||||
validate_examples(examples, "SentenceRecognizer.score")
|
|
||||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
|
||||||
del results["sents_per_type"]
|
|
||||||
return results
|
|
||||||
|
|
|
@ -104,6 +104,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
|
||||||
"max_positive": None,
|
"max_positive": None,
|
||||||
"model": DEFAULT_SPANCAT_MODEL,
|
"model": DEFAULT_SPANCAT_MODEL,
|
||||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||||
|
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
)
|
)
|
||||||
|
@ -113,8 +114,9 @@ def make_spancat(
|
||||||
suggester: Suggester,
|
suggester: Suggester,
|
||||||
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
||||||
spans_key: str,
|
spans_key: str,
|
||||||
threshold: float = 0.5,
|
scorer: Optional[Callable],
|
||||||
max_positive: Optional[int] = None,
|
threshold: float,
|
||||||
|
max_positive: Optional[int],
|
||||||
) -> "SpanCategorizer":
|
) -> "SpanCategorizer":
|
||||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||||
parts: a suggester function that proposes candidate spans, and a labeller
|
parts: a suggester function that proposes candidate spans, and a labeller
|
||||||
|
@ -144,9 +146,28 @@ def make_spancat(
|
||||||
threshold=threshold,
|
threshold=threshold,
|
||||||
max_positive=max_positive,
|
max_positive=max_positive,
|
||||||
name=name,
|
name=name,
|
||||||
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
|
kwargs = dict(kwargs)
|
||||||
|
attr_prefix = "spans_"
|
||||||
|
key = kwargs["spans_key"]
|
||||||
|
kwargs.setdefault("attr", f"{attr_prefix}{key}")
|
||||||
|
kwargs.setdefault("allow_overlap", True)
|
||||||
|
kwargs.setdefault(
|
||||||
|
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
|
||||||
|
)
|
||||||
|
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
|
||||||
|
return Scorer.score_spans(examples, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.spancat_scorer.v1")
|
||||||
|
def make_spancat_scorer():
|
||||||
|
return spancat_score
|
||||||
|
|
||||||
|
|
||||||
class SpanCategorizer(TrainablePipe):
|
class SpanCategorizer(TrainablePipe):
|
||||||
"""Pipeline component to label spans of text.
|
"""Pipeline component to label spans of text.
|
||||||
|
|
||||||
|
@ -163,8 +184,25 @@ class SpanCategorizer(TrainablePipe):
|
||||||
spans_key: str = "spans",
|
spans_key: str = "spans",
|
||||||
threshold: float = 0.5,
|
threshold: float = 0.5,
|
||||||
max_positive: Optional[int] = None,
|
max_positive: Optional[int] = None,
|
||||||
|
scorer: Optional[Callable] = spancat_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the span categorizer.
|
"""Initialize the span categorizer.
|
||||||
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||||
|
During initialization and training, the component will look for
|
||||||
|
spans on the reference document under the same key. Defaults to
|
||||||
|
`"spans"`.
|
||||||
|
threshold (float): Minimum probability to consider a prediction
|
||||||
|
positive. Spans with a positive prediction will be saved on the Doc.
|
||||||
|
Defaults to 0.5.
|
||||||
|
max_positive (Optional[int]): Maximum number of labels to consider
|
||||||
|
positive per span. Defaults to None, indicating no limit.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||||
|
spans allowed.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spancategorizer#init
|
DOCS: https://spacy.io/api/spancategorizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -178,6 +216,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
self.suggester = suggester
|
self.suggester = suggester
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def key(self) -> str:
|
def key(self) -> str:
|
||||||
|
@ -379,26 +418,6 @@ class SpanCategorizer(TrainablePipe):
|
||||||
else:
|
else:
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spancategorizer#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "SpanCategorizer.score")
|
|
||||||
self._validate_categories(examples)
|
|
||||||
kwargs = dict(kwargs)
|
|
||||||
attr_prefix = "spans_"
|
|
||||||
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
|
|
||||||
kwargs.setdefault("allow_overlap", True)
|
|
||||||
kwargs.setdefault(
|
|
||||||
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
|
|
||||||
)
|
|
||||||
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
|
|
||||||
return Scorer.score_spans(examples, **kwargs)
|
|
||||||
|
|
||||||
def _validate_categories(self, examples: Iterable[Example]):
|
def _validate_categories(self, examples: Iterable[Example]):
|
||||||
# TODO
|
# TODO
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
|
from typing import Callable, Optional
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
||||||
|
@ -18,8 +19,11 @@ from ..parts_of_speech import X
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
# See #9050
|
||||||
|
BACKWARD_OVERWRITE = False
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
|
@ -41,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"tagger",
|
"tagger",
|
||||||
assigns=["token.tag"],
|
assigns=["token.tag"],
|
||||||
default_config={"model": DEFAULT_TAGGER_MODEL},
|
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
|
||||||
default_score_weights={"tag_acc": 1.0},
|
default_score_weights={"tag_acc": 1.0},
|
||||||
)
|
)
|
||||||
def make_tagger(nlp: Language, name: str, model: Model):
|
def make_tagger(
|
||||||
|
nlp: Language,
|
||||||
|
name: str,
|
||||||
|
model: Model,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
|
):
|
||||||
"""Construct a part-of-speech tagger component.
|
"""Construct a part-of-speech tagger component.
|
||||||
|
|
||||||
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
||||||
|
@ -52,7 +62,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
|
||||||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||||
with the rows summing to 1).
|
with the rows summing to 1).
|
||||||
"""
|
"""
|
||||||
return Tagger(nlp.vocab, model, name)
|
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
|
||||||
|
|
||||||
|
|
||||||
|
def tagger_score(examples, **kwargs):
|
||||||
|
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.tagger_scorer.v1")
|
||||||
|
def make_tagger_scorer():
|
||||||
|
return tagger_score
|
||||||
|
|
||||||
|
|
||||||
class Tagger(TrainablePipe):
|
class Tagger(TrainablePipe):
|
||||||
|
@ -60,13 +79,23 @@ class Tagger(TrainablePipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger
|
DOCS: https://spacy.io/api/tagger
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="tagger"):
|
def __init__(
|
||||||
|
self,
|
||||||
|
vocab,
|
||||||
|
model,
|
||||||
|
name="tagger",
|
||||||
|
*,
|
||||||
|
overwrite=BACKWARD_OVERWRITE,
|
||||||
|
scorer=tagger_score,
|
||||||
|
):
|
||||||
"""Initialize a part-of-speech tagger.
|
"""Initialize a part-of-speech tagger.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_token_attr for the attribute "tag".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#init
|
DOCS: https://spacy.io/api/tagger#init
|
||||||
"""
|
"""
|
||||||
|
@ -74,8 +103,9 @@ class Tagger(TrainablePipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
cfg = {"labels": []}
|
cfg = {"labels": [], "overwrite": overwrite}
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -135,13 +165,13 @@ class Tagger(TrainablePipe):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
# Don't clobber preset POS tags
|
if doc.c[j].tag == 0 or overwrite:
|
||||||
if doc.c[j].tag == 0:
|
|
||||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||||
|
|
||||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||||
|
@ -289,15 +319,3 @@ class Tagger(TrainablePipe):
|
||||||
self.cfg["labels"].append(label)
|
self.cfg["labels"].append(label)
|
||||||
self.vocab.strings.add(label)
|
self.vocab.strings.add(label)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by
|
|
||||||
Scorer.score_token_attr for the attributes "tag".
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "Tagger.score")
|
|
||||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
|
||||||
|
|
|
@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
from ..util import registry
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
|
@ -70,7 +71,11 @@ subword_features = true
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"textcat",
|
"textcat",
|
||||||
assigns=["doc.cats"],
|
assigns=["doc.cats"],
|
||||||
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
|
default_config={
|
||||||
|
"threshold": 0.5,
|
||||||
|
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"cats_score": 1.0,
|
"cats_score": 1.0,
|
||||||
"cats_score_desc": None,
|
"cats_score_desc": None,
|
||||||
|
@ -86,7 +91,11 @@ subword_features = true
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_textcat(
|
def make_textcat(
|
||||||
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
|
nlp: Language,
|
||||||
|
name: str,
|
||||||
|
model: Model[List[Doc], List[Floats2d]],
|
||||||
|
threshold: float,
|
||||||
|
scorer: Optional[Callable],
|
||||||
) -> "TextCategorizer":
|
) -> "TextCategorizer":
|
||||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||||
over a whole document. It can learn one or more labels, and the labels are considered
|
over a whole document. It can learn one or more labels, and the labels are considered
|
||||||
|
@ -95,8 +104,23 @@ def make_textcat(
|
||||||
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
||||||
scores for each category.
|
scores for each category.
|
||||||
threshold (float): Cutoff to consider a prediction "positive".
|
threshold (float): Cutoff to consider a prediction "positive".
|
||||||
|
scorer (Optional[Callable]): The scoring method.
|
||||||
"""
|
"""
|
||||||
return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
|
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
|
||||||
|
|
||||||
|
|
||||||
|
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
|
return Scorer.score_cats(
|
||||||
|
examples,
|
||||||
|
"cats",
|
||||||
|
multi_label=False,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.textcat_scorer.v1")
|
||||||
|
def make_textcat_scorer():
|
||||||
|
return textcat_score
|
||||||
|
|
||||||
|
|
||||||
class TextCategorizer(TrainablePipe):
|
class TextCategorizer(TrainablePipe):
|
||||||
|
@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
|
self,
|
||||||
|
vocab: Vocab,
|
||||||
|
model: Model,
|
||||||
|
name: str = "textcat",
|
||||||
|
*,
|
||||||
|
threshold: float,
|
||||||
|
scorer: Optional[Callable] = textcat_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize a text categorizer for single-label classification.
|
"""Initialize a text categorizer for single-label classification.
|
||||||
|
|
||||||
|
@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe):
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
threshold (float): Cutoff to consider a prediction "positive".
|
threshold (float): Cutoff to consider a prediction "positive".
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_cats for the attribute "cats".
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#init
|
DOCS: https://spacy.io/api/textcategorizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe):
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
|
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self) -> Tuple[str]:
|
def labels(self) -> Tuple[str]:
|
||||||
|
@ -353,26 +386,6 @@ class TextCategorizer(TrainablePipe):
|
||||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "TextCategorizer.score")
|
|
||||||
self._validate_categories(examples)
|
|
||||||
kwargs.setdefault("threshold", self.cfg["threshold"])
|
|
||||||
kwargs.setdefault("positive_label", self.cfg["positive_label"])
|
|
||||||
return Scorer.score_cats(
|
|
||||||
examples,
|
|
||||||
"cats",
|
|
||||||
labels=self.labels,
|
|
||||||
multi_label=False,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _validate_categories(self, examples: Iterable[Example]):
|
def _validate_categories(self, examples: Iterable[Example]):
|
||||||
"""Check whether the provided examples all have single-label cats annotations."""
|
"""Check whether the provided examples all have single-label cats annotations."""
|
||||||
for ex in examples:
|
for ex in examples:
|
||||||
|
|
|
@ -5,10 +5,11 @@ from thinc.api import Model, Config
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..training import Example, validate_examples, validate_get_examples
|
from ..training import Example, validate_get_examples
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
from ..util import registry
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from .textcat import TextCategorizer
|
from .textcat import TextCategorizer
|
||||||
|
|
||||||
|
@ -70,7 +71,11 @@ subword_features = true
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"textcat_multilabel",
|
"textcat_multilabel",
|
||||||
assigns=["doc.cats"],
|
assigns=["doc.cats"],
|
||||||
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
|
default_config={
|
||||||
|
"threshold": 0.5,
|
||||||
|
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"cats_score": 1.0,
|
"cats_score": 1.0,
|
||||||
"cats_score_desc": None,
|
"cats_score_desc": None,
|
||||||
|
@ -86,7 +91,11 @@ subword_features = true
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_multilabel_textcat(
|
def make_multilabel_textcat(
|
||||||
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
|
nlp: Language,
|
||||||
|
name: str,
|
||||||
|
model: Model[List[Doc], List[Floats2d]],
|
||||||
|
threshold: float,
|
||||||
|
scorer: Optional[Callable],
|
||||||
) -> "TextCategorizer":
|
) -> "TextCategorizer":
|
||||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||||
over a whole document. It can learn one or more labels, and the labels are considered
|
over a whole document. It can learn one or more labels, and the labels are considered
|
||||||
|
@ -97,7 +106,23 @@ def make_multilabel_textcat(
|
||||||
scores for each category.
|
scores for each category.
|
||||||
threshold (float): Cutoff to consider a prediction "positive".
|
threshold (float): Cutoff to consider a prediction "positive".
|
||||||
"""
|
"""
|
||||||
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
|
return MultiLabel_TextCategorizer(
|
||||||
|
nlp.vocab, model, name, threshold=threshold, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
|
return Scorer.score_cats(
|
||||||
|
examples,
|
||||||
|
"cats",
|
||||||
|
multi_label=True,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
|
||||||
|
def make_textcat_multilabel_scorer():
|
||||||
|
return textcat_multilabel_score
|
||||||
|
|
||||||
|
|
||||||
class MultiLabel_TextCategorizer(TextCategorizer):
|
class MultiLabel_TextCategorizer(TextCategorizer):
|
||||||
|
@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
||||||
name: str = "textcat_multilabel",
|
name: str = "textcat_multilabel",
|
||||||
*,
|
*,
|
||||||
threshold: float,
|
threshold: float,
|
||||||
|
scorer: Optional[Callable] = textcat_multilabel_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize a text categorizer for multi-label classification.
|
"""Initialize a text categorizer for multi-label classification.
|
||||||
|
|
||||||
|
@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
cfg = {"labels": [], "threshold": threshold}
|
cfg = {"labels": [], "threshold": threshold}
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
def initialize( # type: ignore[override]
|
def initialize( # type: ignore[override]
|
||||||
self,
|
self,
|
||||||
|
@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
||||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|
||||||
"""Score a batch of examples.
|
|
||||||
|
|
||||||
examples (Iterable[Example]): The examples to score.
|
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#score
|
|
||||||
"""
|
|
||||||
validate_examples(examples, "MultiLabel_TextCategorizer.score")
|
|
||||||
kwargs.setdefault("threshold", self.cfg["threshold"])
|
|
||||||
return Scorer.score_cats(
|
|
||||||
examples,
|
|
||||||
"cats",
|
|
||||||
labels=self.labels,
|
|
||||||
multi_label=True,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _validate_categories(self, examples: Iterable[Example]):
|
def _validate_categories(self, examples: Iterable[Example]):
|
||||||
"""This component allows any type of single- or multi-label annotations.
|
"""This component allows any type of single- or multi-label annotations.
|
||||||
This method overwrites the more strict one from 'textcat'."""
|
This method overwrites the more strict one from 'textcat'."""
|
||||||
|
|
|
@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
|
||||||
cdef public Vocab vocab
|
cdef public Vocab vocab
|
||||||
cdef public object model
|
cdef public object model
|
||||||
cdef public object cfg
|
cdef public object cfg
|
||||||
|
cdef public object scorer
|
||||||
|
|
|
@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
|
||||||
beam_density=0.0,
|
beam_density=0.0,
|
||||||
beam_update_prob=0.0,
|
beam_update_prob=0.0,
|
||||||
multitasks=tuple(),
|
multitasks=tuple(),
|
||||||
incorrect_spans_key=None
|
incorrect_spans_key=None,
|
||||||
|
scorer=None,
|
||||||
):
|
):
|
||||||
"""Create a Parser.
|
"""Create a Parser.
|
||||||
|
|
||||||
|
@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe):
|
||||||
incorrect_spans_key (Optional[str]): Identifies spans that are known
|
incorrect_spans_key (Optional[str]): Identifies spans that are known
|
||||||
to be incorrect entity annotations. The incorrect entity annotations
|
to be incorrect entity annotations. The incorrect entity annotations
|
||||||
can be stored in the span group, under this key.
|
can be stored in the span group, under this key.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to None.
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe):
|
||||||
self.add_multitask_objective(multitask)
|
self.add_multitask_objective(multitask)
|
||||||
|
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
|
self.scorer = scorer
|
||||||
|
|
||||||
def __getnewargs_ex__(self):
|
def __getnewargs_ex__(self):
|
||||||
"""This allows pickling the Parser and its keyword-only init arguments"""
|
"""This allows pickling the Parser and its keyword-only init arguments"""
|
||||||
|
|
|
@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
|
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
|
||||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||||
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
|
n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch")
|
||||||
|
n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch")
|
||||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||||
corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
|
|
|
@ -247,18 +247,21 @@ class Scorer:
|
||||||
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
|
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
|
||||||
**cfg,
|
**cfg,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Return PRF scores per feat for a token attribute in UFEATS format.
|
"""Return micro PRF and PRF scores per feat for a token attribute in
|
||||||
|
UFEATS format.
|
||||||
|
|
||||||
examples (Iterable[Example]): Examples to score
|
examples (Iterable[Example]): Examples to score
|
||||||
attr (str): The attribute to score.
|
attr (str): The attribute to score.
|
||||||
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
|
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
|
||||||
getter(token, attr) should return the value of the attribute for an
|
getter(token, attr) should return the value of the attribute for an
|
||||||
individual token.
|
individual token.
|
||||||
missing_values (Set[Any]): Attribute values to treat as missing annotation
|
missing_values (Set[Any]): Attribute values to treat as missing
|
||||||
in the reference annotation.
|
annotation in the reference annotation.
|
||||||
RETURNS (dict): A dictionary containing the per-feat PRF scores under
|
RETURNS (dict): A dictionary containing the micro PRF scores under the
|
||||||
the key attr_per_feat.
|
key attr_micro_p/r/f and the per-feat PRF scores under
|
||||||
|
attr_per_feat.
|
||||||
"""
|
"""
|
||||||
|
micro_score = PRFScore()
|
||||||
per_feat = {}
|
per_feat = {}
|
||||||
for example in examples:
|
for example in examples:
|
||||||
pred_doc = example.predicted
|
pred_doc = example.predicted
|
||||||
|
@ -300,15 +303,22 @@ class Scorer:
|
||||||
pred_per_feat[field] = set()
|
pred_per_feat[field] = set()
|
||||||
pred_per_feat[field].add((gold_i, feat))
|
pred_per_feat[field].add((gold_i, feat))
|
||||||
for field in per_feat:
|
for field in per_feat:
|
||||||
|
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
|
||||||
per_feat[field].score_set(
|
per_feat[field].score_set(
|
||||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
|
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
|
||||||
)
|
)
|
||||||
score_key = f"{attr}_per_feat"
|
result: Dict[str, Any] = {}
|
||||||
if any([len(v) for v in per_feat.values()]):
|
if len(micro_score) > 0:
|
||||||
result = {k: v.to_dict() for k, v in per_feat.items()}
|
result[f"{attr}_micro_p"] = micro_score.precision
|
||||||
return {score_key: result}
|
result[f"{attr}_micro_r"] = micro_score.recall
|
||||||
|
result[f"{attr}_micro_f"] = micro_score.fscore
|
||||||
|
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
|
||||||
else:
|
else:
|
||||||
return {score_key: None}
|
result[f"{attr}_micro_p"] = None
|
||||||
|
result[f"{attr}_micro_r"] = None
|
||||||
|
result[f"{attr}_micro_f"] = None
|
||||||
|
result[f"{attr}_per_feat"] = None
|
||||||
|
return result
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def score_spans(
|
def score_spans(
|
||||||
|
@ -545,7 +555,7 @@ class Scorer:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def score_links(
|
def score_links(
|
||||||
examples: Iterable[Example], *, negative_labels: Iterable[str]
|
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Returns PRF for predicted links on the entity level.
|
"""Returns PRF for predicted links on the entity level.
|
||||||
To disentangle the performance of the NEL from the NER,
|
To disentangle the performance of the NEL from the NER,
|
||||||
|
@ -721,7 +731,7 @@ class Scorer:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
|
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
|
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
|
||||||
score_per_type = defaultdict(PRFScore)
|
score_per_type = defaultdict(PRFScore)
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
|
|
@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
|
||||||
from .typedefs cimport attr_t, hash_t
|
from .typedefs cimport attr_t, hash_t
|
||||||
|
|
||||||
|
|
||||||
cpdef hash_t hash_string(unicode string) except 0
|
cpdef hash_t hash_string(str string) except 0
|
||||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
||||||
|
|
||||||
cdef unicode decode_Utf8Str(const Utf8Str* string)
|
cdef str decode_Utf8Str(const Utf8Str* string)
|
||||||
|
|
||||||
|
|
||||||
ctypedef union Utf8Str:
|
ctypedef union Utf8Str:
|
||||||
|
@ -25,5 +25,5 @@ cdef class StringStore:
|
||||||
cdef vector[hash_t] keys
|
cdef vector[hash_t] keys
|
||||||
cdef public PreshMap _map
|
cdef public PreshMap _map
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||||
|
|
|
@ -33,7 +33,7 @@ def get_string_id(key):
|
||||||
return hash_utf8(chars, len(chars))
|
return hash_utf8(chars, len(chars))
|
||||||
|
|
||||||
|
|
||||||
cpdef hash_t hash_string(unicode string) except 0:
|
cpdef hash_t hash_string(str string) except 0:
|
||||||
chars = string.encode("utf8")
|
chars = string.encode("utf8")
|
||||||
return hash_utf8(chars, len(chars))
|
return hash_utf8(chars, len(chars))
|
||||||
|
|
||||||
|
@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
||||||
return hash32(utf8_string, length, 1)
|
return hash32(utf8_string, length, 1)
|
||||||
|
|
||||||
|
|
||||||
cdef unicode decode_Utf8Str(const Utf8Str* string):
|
cdef str decode_Utf8Str(const Utf8Str* string):
|
||||||
cdef int i, length
|
cdef int i, length
|
||||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||||
return string.s[1:string.s[0]+1].decode("utf8")
|
return string.s[1:string.s[0]+1].decode("utf8")
|
||||||
|
@ -107,17 +107,17 @@ cdef class StringStore:
|
||||||
def __getitem__(self, object string_or_id):
|
def __getitem__(self, object string_or_id):
|
||||||
"""Retrieve a string from a given hash, or vice versa.
|
"""Retrieve a string from a given hash, or vice versa.
|
||||||
|
|
||||||
string_or_id (bytes, unicode or uint64): The value to encode.
|
string_or_id (bytes, str or uint64): The value to encode.
|
||||||
Returns (str / uint64): The value to be retrieved.
|
Returns (str / uint64): The value to be retrieved.
|
||||||
"""
|
"""
|
||||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
if isinstance(string_or_id, str) and len(string_or_id) == 0:
|
||||||
return 0
|
return 0
|
||||||
elif string_or_id == 0:
|
elif string_or_id == 0:
|
||||||
return ""
|
return ""
|
||||||
elif string_or_id in SYMBOLS_BY_STR:
|
elif string_or_id in SYMBOLS_BY_STR:
|
||||||
return SYMBOLS_BY_STR[string_or_id]
|
return SYMBOLS_BY_STR[string_or_id]
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
if isinstance(string_or_id, unicode):
|
if isinstance(string_or_id, str):
|
||||||
key = hash_string(string_or_id)
|
key = hash_string(string_or_id)
|
||||||
return key
|
return key
|
||||||
elif isinstance(string_or_id, bytes):
|
elif isinstance(string_or_id, bytes):
|
||||||
|
@ -135,14 +135,14 @@ cdef class StringStore:
|
||||||
|
|
||||||
def as_int(self, key):
|
def as_int(self, key):
|
||||||
"""If key is an int, return it; otherwise, get the int value."""
|
"""If key is an int, return it; otherwise, get the int value."""
|
||||||
if not isinstance(key, basestring):
|
if not isinstance(key, str):
|
||||||
return key
|
return key
|
||||||
else:
|
else:
|
||||||
return self[key]
|
return self[key]
|
||||||
|
|
||||||
def as_string(self, key):
|
def as_string(self, key):
|
||||||
"""If key is a string, return it; otherwise, get the string value."""
|
"""If key is a string, return it; otherwise, get the string value."""
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, str):
|
||||||
return key
|
return key
|
||||||
else:
|
else:
|
||||||
return self[key]
|
return self[key]
|
||||||
|
@ -153,7 +153,7 @@ cdef class StringStore:
|
||||||
string (str): The string to add.
|
string (str): The string to add.
|
||||||
RETURNS (uint64): The string's hash value.
|
RETURNS (uint64): The string's hash value.
|
||||||
"""
|
"""
|
||||||
if isinstance(string, unicode):
|
if isinstance(string, str):
|
||||||
if string in SYMBOLS_BY_STR:
|
if string in SYMBOLS_BY_STR:
|
||||||
return SYMBOLS_BY_STR[string]
|
return SYMBOLS_BY_STR[string]
|
||||||
key = hash_string(string)
|
key = hash_string(string)
|
||||||
|
@ -189,7 +189,7 @@ cdef class StringStore:
|
||||||
return True
|
return True
|
||||||
elif string in SYMBOLS_BY_STR:
|
elif string in SYMBOLS_BY_STR:
|
||||||
return True
|
return True
|
||||||
elif isinstance(string, unicode):
|
elif isinstance(string, str):
|
||||||
key = hash_string(string)
|
key = hash_string(string)
|
||||||
else:
|
else:
|
||||||
string = string.encode("utf8")
|
string = string.encode("utf8")
|
||||||
|
@ -269,7 +269,7 @@ cdef class StringStore:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.add(string)
|
self.add(string)
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
cdef const Utf8Str* intern_unicode(self, str py_string):
|
||||||
# 0 means missing, but we don't bother offsetting the index.
|
# 0 means missing, but we don't bother offsetting the index.
|
||||||
cdef bytes byte_string = py_string.encode("utf8")
|
cdef bytes byte_string = py_string.encode("utf8")
|
||||||
return self._intern_utf8(byte_string, len(byte_string))
|
return self._intern_utf8(byte_string, len(byte_string))
|
||||||
|
|
|
@ -5,9 +5,11 @@ from spacy.compat import pickle
|
||||||
def test_pickle_single_doc():
|
def test_pickle_single_doc():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
doc = nlp("pickle roundtrip")
|
doc = nlp("pickle roundtrip")
|
||||||
|
doc._context = 3
|
||||||
data = pickle.dumps(doc, 1)
|
data = pickle.dumps(doc, 1)
|
||||||
doc2 = pickle.loads(data)
|
doc2 = pickle.loads(data)
|
||||||
assert doc2.text == "pickle roundtrip"
|
assert doc2.text == "pickle roundtrip"
|
||||||
|
assert doc2._context == 3
|
||||||
|
|
||||||
|
|
||||||
def test_list_of_docs_pickles_efficiently():
|
def test_list_of_docs_pickles_efficiently():
|
||||||
|
|
|
@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
|
||||||
|
|
||||||
|
|
||||||
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
|
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
|
||||||
text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
|
text = "La Dra. Puig viu a la pl. dels Til·lers."
|
||||||
tokens = ca_tokenizer(text)
|
doc = ca_tokenizer(text)
|
||||||
assert len(tokens) == 15
|
assert [t.text for t in doc] == [
|
||||||
assert tokens[7].text == "aprox."
|
"La",
|
||||||
|
"Dra.",
|
||||||
|
"Puig",
|
||||||
|
"viu",
|
||||||
|
"a",
|
||||||
|
"la",
|
||||||
|
"pl.",
|
||||||
|
"d",
|
||||||
|
"els",
|
||||||
|
"Til·lers",
|
||||||
|
".",
|
||||||
|
]
|
||||||
|
|
|
@ -2,7 +2,14 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
|
"text,expected_tokens",
|
||||||
|
[
|
||||||
|
("d'un", ["d'", "un"]),
|
||||||
|
("s'ha", ["s'", "ha"]),
|
||||||
|
("del", ["d", "el"]),
|
||||||
|
("cantar-te", ["cantar", "-te"]),
|
||||||
|
("-hola", ["-", "hola"]),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
def test_contractions(ca_tokenizer, text, expected_tokens):
|
def test_contractions(ca_tokenizer, text, expected_tokens):
|
||||||
"""Test that the contractions are split into two tokens"""
|
"""Test that the contractions are split into two tokens"""
|
||||||
|
|
|
@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
|
||||||
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
|
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
|
||||||
|
|
||||||
tokens = ca_tokenizer(text)
|
tokens = ca_tokenizer(text)
|
||||||
assert len(tokens) == 140
|
assert len(tokens) == 146
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length",
|
"text,length",
|
||||||
[
|
[
|
||||||
("Perquè va anar-hi?", 4),
|
("Perquè va anar-hi?", 5),
|
||||||
|
("El cotxe dels veins.", 6),
|
||||||
("“Ah no?”", 5),
|
("“Ah no?”", 5),
|
||||||
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
||||||
("Van córrer aprox. 10km", 5),
|
("Van córrer aprox. 10km", 5),
|
||||||
("Llavors perqué...", 3),
|
("Llavors perqué...", 3),
|
||||||
|
("Vull parlar-te'n demà al matí", 8),
|
||||||
|
("Vull explicar-t'ho demà al matí", 8),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
|
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
|
||||||
|
|
|
@ -8,3 +8,17 @@ import pytest
|
||||||
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
|
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
|
||||||
test_lemma = ja_tokenizer(word)[0].lemma_
|
test_lemma = ja_tokenizer(word)[0].lemma_
|
||||||
assert test_lemma == lemma
|
assert test_lemma == lemma
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"word,norm",
|
||||||
|
[
|
||||||
|
("SUMMER", "サマー"),
|
||||||
|
("食べ物", "食べ物"),
|
||||||
|
("綜合", "総合"),
|
||||||
|
("コンピュータ", "コンピューター"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
|
||||||
|
test_norm = ja_tokenizer(word)[0].norm_
|
||||||
|
assert test_norm == norm
|
||||||
|
|
9
spacy/tests/lang/ja/test_morphologizer_factory.py
Normal file
9
spacy/tests/lang/ja/test_morphologizer_factory.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.ja import Japanese
|
||||||
|
|
||||||
|
|
||||||
|
def test_ja_morphologizer_factory():
|
||||||
|
pytest.importorskip("sudachipy")
|
||||||
|
nlp = Japanese()
|
||||||
|
morphologizer = nlp.add_pipe("morphologizer")
|
||||||
|
assert morphologizer.cfg["extend"] is True
|
|
@ -1,3 +1,5 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
from spacy.lang.ja import Japanese
|
from spacy.lang.ja import Japanese
|
||||||
from ...util import make_tempdir
|
from ...util import make_tempdir
|
||||||
|
|
||||||
|
@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
|
||||||
nlp_r.from_disk(d)
|
nlp_r.from_disk(d)
|
||||||
assert nlp_bytes == nlp_r.to_bytes()
|
assert nlp_bytes == nlp_r.to_bytes()
|
||||||
assert nlp_r.tokenizer.split_mode == "B"
|
assert nlp_r.tokenizer.split_mode == "B"
|
||||||
|
|
||||||
|
|
||||||
|
def test_ja_tokenizer_pickle(ja_tokenizer):
|
||||||
|
b = pickle.dumps(ja_tokenizer)
|
||||||
|
ja_tokenizer_re = pickle.loads(b)
|
||||||
|
assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()
|
||||||
|
|
|
@ -34,22 +34,22 @@ SENTENCE_TESTS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
tokens1 = [
|
tokens1 = [
|
||||||
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
|
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
|
||||||
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
|
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
|
||||||
]
|
]
|
||||||
tokens2 = [
|
tokens2 = [
|
||||||
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
|
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
|
||||||
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
|
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
|
||||||
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
|
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
|
||||||
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
|
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
|
||||||
]
|
]
|
||||||
tokens3 = [
|
tokens3 = [
|
||||||
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
|
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
|
||||||
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
|
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
|
||||||
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None),
|
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
|
||||||
]
|
]
|
||||||
SUB_TOKEN_TESTS = [
|
SUB_TOKEN_TESTS = [
|
||||||
("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]])
|
("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
|
||||||
]
|
]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
||||||
assert len(nlp_c(text)) == len_c
|
assert len(nlp_c(text)) == len_c
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
|
||||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
|
||||||
)
|
|
||||||
def test_ja_tokenizer_sub_tokens(
|
def test_ja_tokenizer_sub_tokens(
|
||||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
|
||||||
):
|
):
|
||||||
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
|
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
|
||||||
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
|
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
|
||||||
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
|
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
|
||||||
|
|
||||||
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
|
assert ja_tokenizer(text).user_data.get("sub_tokens") is None
|
||||||
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
|
assert nlp_a(text).user_data.get("sub_tokens") is None
|
||||||
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
|
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
|
||||||
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
|
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
|
||||||
|
|
||||||
|
@ -132,16 +130,24 @@ def test_ja_tokenizer_sub_tokens(
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"取ってつけた",
|
"取ってつけた",
|
||||||
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
|
(["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
|
||||||
("トッ", "テ", "ツケ", "タ"),
|
(["トッ"], ["テ"], ["ツケ"], ["タ"]),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"2=3",
|
||||||
|
([], [], []),
|
||||||
|
(["ニ"], ["_"], ["サン"])
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_ja_tokenizer_inflections_reading_forms(
|
def test_ja_tokenizer_inflections_reading_forms(
|
||||||
ja_tokenizer, text, inflections, reading_forms
|
ja_tokenizer, text, inflections, reading_forms
|
||||||
):
|
):
|
||||||
assert ja_tokenizer(text).user_data["inflections"] == inflections
|
tokens = ja_tokenizer(text)
|
||||||
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
|
test_inflections = [tt.morph.get("Inflection") for tt in tokens]
|
||||||
|
assert test_inflections == list(inflections)
|
||||||
|
test_readings = [tt.morph.get("Reading") for tt in tokens]
|
||||||
|
assert test_readings == list(reading_forms)
|
||||||
|
|
||||||
|
|
||||||
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
|
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
|
||||||
|
|
24
spacy/tests/lang/ko/test_serialize.py
Normal file
24
spacy/tests/lang/ko/test_serialize.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from spacy.lang.ko import Korean
|
||||||
|
from ...util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_ko_tokenizer_serialize(ko_tokenizer):
|
||||||
|
tokenizer_bytes = ko_tokenizer.to_bytes()
|
||||||
|
nlp = Korean()
|
||||||
|
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||||||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "tokenizer"
|
||||||
|
ko_tokenizer.to_disk(file_path)
|
||||||
|
nlp = Korean()
|
||||||
|
nlp.tokenizer.from_disk(file_path)
|
||||||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ko_tokenizer_pickle(ko_tokenizer):
|
||||||
|
b = pickle.dumps(ko_tokenizer)
|
||||||
|
ko_tokenizer_re = pickle.loads(b)
|
||||||
|
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
|
||||||
("www.google.com", True),
|
("www.google.com", True),
|
||||||
("google.com", True),
|
("google.com", True),
|
||||||
("sydney.com", True),
|
("sydney.com", True),
|
||||||
("2girls1cup.org", True),
|
("1abc2def.org", True),
|
||||||
("http://stupid", True),
|
("http://stupid", True),
|
||||||
("www.hi", True),
|
("www.hi", True),
|
||||||
|
("example.com/example", True),
|
||||||
("dog", False),
|
("dog", False),
|
||||||
("1.2", False),
|
("1.2", False),
|
||||||
("1.a", False),
|
("1.a", False),
|
||||||
|
|
24
spacy/tests/lang/th/test_serialize.py
Normal file
24
spacy/tests/lang/th/test_serialize.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from spacy.lang.th import Thai
|
||||||
|
from ...util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_th_tokenizer_serialize(th_tokenizer):
|
||||||
|
tokenizer_bytes = th_tokenizer.to_bytes()
|
||||||
|
nlp = Thai()
|
||||||
|
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||||||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "tokenizer"
|
||||||
|
th_tokenizer.to_disk(file_path)
|
||||||
|
nlp = Thai()
|
||||||
|
nlp.tokenizer.from_disk(file_path)
|
||||||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_th_tokenizer_pickle(th_tokenizer):
|
||||||
|
b = pickle.dumps(th_tokenizer)
|
||||||
|
th_tokenizer_re = pickle.loads(b)
|
||||||
|
assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()
|
|
@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length):
|
||||||
("10.000", True),
|
("10.000", True),
|
||||||
("1000", True),
|
("1000", True),
|
||||||
("999,0", True),
|
("999,0", True),
|
||||||
("ሐደ", True),
|
("ሓደ", True),
|
||||||
("ክልተ", True),
|
("ክልተ", True),
|
||||||
("ትሪልዮን", True),
|
("ትሪልዮን", True),
|
||||||
("ከልቢ", False),
|
("ከልቢ", False),
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
from spacy.lang.vi import Vietnamese
|
from spacy.lang.vi import Vietnamese
|
||||||
from ...util import make_tempdir
|
from ...util import make_tempdir
|
||||||
|
|
||||||
|
@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
|
||||||
nlp_r.from_disk(d)
|
nlp_r.from_disk(d)
|
||||||
assert nlp_bytes == nlp_r.to_bytes()
|
assert nlp_bytes == nlp_r.to_bytes()
|
||||||
assert nlp_r.tokenizer.use_pyvi is False
|
assert nlp_r.tokenizer.use_pyvi is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_vi_tokenizer_pickle(vi_tokenizer):
|
||||||
|
b = pickle.dumps(vi_tokenizer)
|
||||||
|
vi_tokenizer_re = pickle.loads(b)
|
||||||
|
assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()
|
||||||
|
|
|
@ -32,24 +32,6 @@ def pattern_dicts():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("attribute_ruler_patterns")
|
|
||||||
def attribute_ruler_patterns():
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
|
|
||||||
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
|
||||||
},
|
|
||||||
# one pattern sets the lemma
|
|
||||||
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
|
|
||||||
# another pattern sets the morphology
|
|
||||||
{
|
|
||||||
"patterns": [[{"ORTH": "test"}]],
|
|
||||||
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
|
|
||||||
"index": 0,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tag_map():
|
def tag_map():
|
||||||
return {
|
return {
|
||||||
|
@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
nlp.remove_pipe("attribute_ruler")
|
nlp.remove_pipe("attribute_ruler")
|
||||||
|
|
||||||
# initialize with patterns from misc registry
|
# initialize with patterns from misc registry
|
||||||
|
@registry.misc("attribute_ruler_patterns")
|
||||||
|
def attribute_ruler_patterns():
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
|
||||||
|
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
||||||
|
},
|
||||||
|
# one pattern sets the lemma
|
||||||
|
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
|
||||||
|
# another pattern sets the morphology
|
||||||
|
{
|
||||||
|
"patterns": [[{"ORTH": "test"}]],
|
||||||
|
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
|
||||||
|
"index": 0,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
||||||
"patterns": {"@misc": "attribute_ruler_patterns"}
|
"patterns": {"@misc": "attribute_ruler_patterns"}
|
||||||
}
|
}
|
||||||
|
@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
|
||||||
assert scores["lemma_acc"] == pytest.approx(0.2)
|
assert scores["lemma_acc"] == pytest.approx(0.2)
|
||||||
# no morphs are set
|
# no morphs are set
|
||||||
assert scores["morph_acc"] is None
|
assert scores["morph_acc"] is None
|
||||||
|
nlp.remove_pipe("attribute_ruler")
|
||||||
|
|
||||||
|
# test with custom scorer
|
||||||
|
@registry.misc("weird_scorer.v1")
|
||||||
|
def make_weird_scorer():
|
||||||
|
def weird_scorer(examples, weird_score, **kwargs):
|
||||||
|
return {"weird_score": weird_score}
|
||||||
|
|
||||||
|
return weird_scorer
|
||||||
|
|
||||||
|
ruler = nlp.add_pipe(
|
||||||
|
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
|
||||||
|
)
|
||||||
|
ruler.initialize(lambda: [], patterns=pattern_dicts)
|
||||||
|
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
|
||||||
|
assert scores["weird_score"] == 0.12345
|
||||||
|
assert "token_acc" in scores
|
||||||
|
assert "lemma_acc" not in scores
|
||||||
|
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
|
||||||
|
assert scores["weird_score"] == 0.23456
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_rule_order(nlp):
|
def test_attributeruler_rule_order(nlp):
|
||||||
|
|
|
@ -8,6 +8,7 @@ from spacy.language import Language
|
||||||
from spacy.tests.util import make_tempdir
|
from spacy.tests.util import make_tempdir
|
||||||
from spacy.morphology import Morphology
|
from spacy.morphology import Morphology
|
||||||
from spacy.attrs import MORPH
|
from spacy.attrs import MORPH
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
def test_label_types():
|
def test_label_types():
|
||||||
|
@ -137,6 +138,41 @@ def test_overfitting_IO():
|
||||||
assert [str(t.morph) for t in doc] == gold_morphs
|
assert [str(t.morph) for t in doc] == gold_morphs
|
||||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||||
|
|
||||||
|
# Test overwrite+extend settings
|
||||||
|
# (note that "" is unset, "_" is set and empty)
|
||||||
|
morphs = ["Feat=V", "Feat=N", "_"]
|
||||||
|
doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
|
||||||
|
orig_morphs = [str(t.morph) for t in doc]
|
||||||
|
orig_pos_tags = [t.pos_ for t in doc]
|
||||||
|
morphologizer = nlp.get_pipe("morphologizer")
|
||||||
|
|
||||||
|
# don't overwrite or extend
|
||||||
|
morphologizer.cfg["overwrite"] = False
|
||||||
|
doc = morphologizer(doc)
|
||||||
|
assert [str(t.morph) for t in doc] == orig_morphs
|
||||||
|
assert [t.pos_ for t in doc] == orig_pos_tags
|
||||||
|
|
||||||
|
# overwrite and extend
|
||||||
|
morphologizer.cfg["overwrite"] = True
|
||||||
|
morphologizer.cfg["extend"] = True
|
||||||
|
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
|
||||||
|
doc = morphologizer(doc)
|
||||||
|
assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]
|
||||||
|
|
||||||
|
# extend without overwriting
|
||||||
|
morphologizer.cfg["overwrite"] = False
|
||||||
|
morphologizer.cfg["extend"] = True
|
||||||
|
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
|
||||||
|
doc = morphologizer(doc)
|
||||||
|
assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]
|
||||||
|
|
||||||
|
# overwrite without extending
|
||||||
|
morphologizer.cfg["overwrite"] = True
|
||||||
|
morphologizer.cfg["extend"] = False
|
||||||
|
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
|
||||||
|
doc = morphologizer(doc)
|
||||||
|
assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]
|
||||||
|
|
||||||
# Test with unset morph and partial POS
|
# Test with unset morph and partial POS
|
||||||
nlp.remove_pipe("morphologizer")
|
nlp.remove_pipe("morphologizer")
|
||||||
nlp.add_pipe("morphologizer")
|
nlp.add_pipe("morphologizer")
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
import pytest
|
import pytest
|
||||||
import pickle
|
import pickle
|
||||||
|
from thinc.api import get_current_ops
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.strings import StringStore
|
from spacy.strings import StringStore
|
||||||
|
from spacy.vectors import Vectors
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
|
||||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||||
def test_pickle_vocab(strings, lex_attr):
|
def test_pickle_vocab(strings, lex_attr):
|
||||||
vocab = Vocab(strings=strings)
|
vocab = Vocab(strings=strings)
|
||||||
|
ops = get_current_ops()
|
||||||
|
vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
|
||||||
|
vocab.vectors = vectors
|
||||||
vocab[strings[0]].norm_ = lex_attr
|
vocab[strings[0]].norm_ = lex_attr
|
||||||
vocab_pickled = pickle.dumps(vocab)
|
vocab_pickled = pickle.dumps(vocab)
|
||||||
vocab_unpickled = pickle.loads(vocab_pickled)
|
vocab_unpickled = pickle.loads(vocab_pickled)
|
||||||
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
|
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
|
||||||
|
assert vocab_unpickled.vectors.mode == "floret"
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from click import NoSuchOption
|
from click import NoSuchOption
|
||||||
|
from packaging.specifiers import SpecifierSet
|
||||||
from spacy.training import docs_to_json, offsets_to_biluo_tags
|
from spacy.training import docs_to_json, offsets_to_biluo_tags
|
||||||
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
||||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||||
|
@ -491,19 +492,27 @@ def test_string_to_list_intify(value):
|
||||||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
|
||||||
def test_download_compatibility():
|
def test_download_compatibility():
|
||||||
model_name = "en_core_web_sm"
|
spec = SpecifierSet("==" + about.__version__)
|
||||||
compatibility = get_compatibility()
|
spec.prereleases = False
|
||||||
version = get_version(model_name, compatibility)
|
if about.__version__ in spec:
|
||||||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
model_name = "en_core_web_sm"
|
||||||
|
compatibility = get_compatibility()
|
||||||
|
version = get_version(model_name, compatibility)
|
||||||
|
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
|
||||||
def test_validate_compatibility_table():
|
def test_validate_compatibility_table():
|
||||||
model_pkgs, compat = get_model_pkgs()
|
spec = SpecifierSet("==" + about.__version__)
|
||||||
spacy_version = get_minor_version(about.__version__)
|
spec.prereleases = False
|
||||||
current_compat = compat.get(spacy_version, {})
|
if about.__version__ in spec:
|
||||||
assert len(current_compat) > 0
|
model_pkgs, compat = get_model_pkgs()
|
||||||
assert "en_core_web_sm" in current_compat
|
spacy_version = get_minor_version(about.__version__)
|
||||||
|
current_compat = compat.get(spacy_version, {})
|
||||||
|
assert len(current_compat) > 0
|
||||||
|
assert "en_core_web_sm" in current_compat
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])
|
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy.vocab import Vocab
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.util import registry, ignore_error, raise_error
|
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
||||||
import spacy
|
import spacy
|
||||||
from thinc.api import CupyOps, NumpyOps, get_current_ops
|
from thinc.api import CupyOps, NumpyOps, get_current_ops
|
||||||
|
|
||||||
|
@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
|
||||||
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
|
||||||
|
"""Test the error handling of nlp.pipe with input as tuples"""
|
||||||
|
Language.component("my_evil_component", func=evil_component)
|
||||||
|
ops = get_current_ops()
|
||||||
|
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe("my_evil_component")
|
||||||
|
texts = [
|
||||||
|
("TEXT 111", 111),
|
||||||
|
("TEXT 222", 222),
|
||||||
|
("TEXT 333", 333),
|
||||||
|
("TEXT 342", 342),
|
||||||
|
("TEXT 666", 666),
|
||||||
|
]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(nlp.pipe(texts, as_tuples=True))
|
||||||
|
nlp.set_error_handler(warn_error)
|
||||||
|
logger = logging.getLogger("spacy")
|
||||||
|
with mock.patch.object(logger, "warning") as mock_warning:
|
||||||
|
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
|
||||||
|
# HACK/TODO? the warnings in child processes don't seem to be
|
||||||
|
# detected by the mock logger
|
||||||
|
if n_process == 1:
|
||||||
|
mock_warning.assert_called()
|
||||||
|
assert mock_warning.call_count == 2
|
||||||
|
assert len(tuples) + mock_warning.call_count == len(texts)
|
||||||
|
assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
|
||||||
|
assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
|
||||||
|
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("n_process", [1, 2])
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
||||||
"""Test the error handling of a component's pipe method"""
|
"""Test the error handling of a component's pipe method"""
|
||||||
|
@ -512,6 +544,55 @@ def test_spacy_blank():
|
||||||
assert nlp.meta["name"] == "my_custom_model"
|
assert nlp.meta["name"] == "my_custom_model"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"lang,target",
|
||||||
|
[
|
||||||
|
("en", "en"),
|
||||||
|
("fra", "fr"),
|
||||||
|
("fre", "fr"),
|
||||||
|
("iw", "he"),
|
||||||
|
("mo", "ro"),
|
||||||
|
("mul", "xx"),
|
||||||
|
("no", "nb"),
|
||||||
|
("pt-BR", "pt"),
|
||||||
|
("xx", "xx"),
|
||||||
|
("zh-Hans", "zh"),
|
||||||
|
("zh-Hant", None),
|
||||||
|
("zxx", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_language_matching(lang, target):
|
||||||
|
"""
|
||||||
|
Test that we can look up languages by equivalent or nearly-equivalent
|
||||||
|
language codes.
|
||||||
|
"""
|
||||||
|
assert find_matching_language(lang) == target
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"lang,target",
|
||||||
|
[
|
||||||
|
("en", "en"),
|
||||||
|
("fra", "fr"),
|
||||||
|
("fre", "fr"),
|
||||||
|
("iw", "he"),
|
||||||
|
("mo", "ro"),
|
||||||
|
("mul", "xx"),
|
||||||
|
("no", "nb"),
|
||||||
|
("pt-BR", "pt"),
|
||||||
|
("xx", "xx"),
|
||||||
|
("zh-Hans", "zh"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_blank_languages(lang, target):
|
||||||
|
"""
|
||||||
|
Test that we can get spacy.blank in various languages, including codes
|
||||||
|
that are defined to be equivalent or that match by CLDR language matching.
|
||||||
|
"""
|
||||||
|
nlp = spacy.blank(lang)
|
||||||
|
assert nlp.lang == target
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
||||||
def test_language_init_invalid_vocab(value):
|
def test_language_init_invalid_vocab(value):
|
||||||
err_fragment = "invalid value"
|
err_fragment = "invalid value"
|
||||||
|
@ -540,6 +621,32 @@ def test_language_source_and_vectors(nlp2):
|
||||||
assert nlp.vocab.vectors.to_bytes() == vectors_bytes
|
assert nlp.vocab.vectors.to_bytes() == vectors_bytes
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_pass_doc_to_pipeline(nlp, n_process):
|
||||||
|
texts = ["cats", "dogs", "guinea pigs"]
|
||||||
|
docs = [nlp.make_doc(text) for text in texts]
|
||||||
|
assert not any(len(doc.cats) for doc in docs)
|
||||||
|
doc = nlp(docs[0])
|
||||||
|
assert doc.text == texts[0]
|
||||||
|
assert len(doc.cats) > 0
|
||||||
|
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
|
||||||
|
docs = nlp.pipe(docs, n_process=n_process)
|
||||||
|
assert [doc.text for doc in docs] == texts
|
||||||
|
assert all(len(doc.cats) for doc in docs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_arg_to_pipeline(nlp):
|
||||||
|
str_list = ["This is a text.", "This is another."]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp(str_list) # type: ignore
|
||||||
|
assert len(list(nlp.pipe(str_list))) == 2
|
||||||
|
int_list = [1, 2, 3]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(nlp.pipe(int_list)) # type: ignore
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp(int_list) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
|
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
|
||||||
)
|
)
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user