mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge remote-tracking branch 'upstream/develop' into chore/switch-to-master-v3.2.0
This commit is contained in:
commit
07dea324f6
10
.github/azure-steps.yml
vendored
10
.github/azure-steps.yml
vendored
|
@ -65,8 +65,11 @@ steps:
|
|||
condition: eq(${{ parameters.gpu }}, true)
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
#python -m spacy download ca_core_news_sm
|
||||
#python -m spacy download ca_core_news_md
|
||||
# temporarily install the v3.1.0 models
|
||||
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl
|
||||
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
@ -95,7 +98,8 @@ steps:
|
|||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
# temporarily ignore W095
|
||||
PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
|
|
106
.github/contributors/avi197.md
vendored
Normal file
106
.github/contributors/avi197.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Son Pham |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 09/10/2021 |
|
||||
| GitHub username | Avi197 |
|
||||
| Website (optional) | |
|
106
.github/contributors/fgaim.md
vendored
Normal file
106
.github/contributors/fgaim.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Fitsum Gaim |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2021-08-07 |
|
||||
| GitHub username | fgaim |
|
||||
| Website (optional) | |
|
106
.github/contributors/syrull.md
vendored
Normal file
106
.github/contributors/syrull.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made) will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Dimitar Ganev |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2021/8/2 |
|
||||
| GitHub username | syrull |
|
||||
| Website (optional) | |
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,6 +9,7 @@ keys/
|
|||
spacy/tests/package/setup.cfg
|
||||
spacy/tests/package/pyproject.toml
|
||||
spacy/tests/package/requirements.txt
|
||||
spacy/tests/universe/universe.json
|
||||
|
||||
# Website
|
||||
website/.cache/
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Our libraries
|
||||
spacy-legacy>=3.0.8,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.12,<8.1.0
|
||||
|
@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
|
|||
tqdm>=4.38.0,<5.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||
jinja2
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
|
|
|
@ -42,6 +42,7 @@ setup_requires =
|
|||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.8,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
|
@ -62,6 +63,7 @@ install_requires =
|
|||
setuptools
|
||||
packaging>=20.0
|
||||
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
|
@ -69,9 +71,9 @@ console_scripts =
|
|||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
spacy_lookups_data>=1.0.2,<1.1.0
|
||||
spacy_lookups_data>=1.0.3,<1.1.0
|
||||
transformers =
|
||||
spacy_transformers>=1.0.1,<1.2.0
|
||||
spacy_transformers>=1.1.2,<1.2.0
|
||||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
|
|
1
setup.py
1
setup.py
|
@ -81,6 +81,7 @@ COPY_FILES = {
|
|||
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.1.4"
|
||||
__version__ = "3.2.0"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
for name, value in stringy_attrs.items():
|
||||
int_key = intify_attr(name)
|
||||
if int_key is not None:
|
||||
if strings_map is not None and isinstance(value, basestring):
|
||||
if strings_map is not None and isinstance(value, str):
|
||||
if hasattr(strings_map, 'add'):
|
||||
value = strings_map.add(value)
|
||||
else:
|
||||
|
|
|
@ -20,6 +20,7 @@ def init_vectors_cli(
|
|||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||
|
@ -34,7 +35,14 @@ def init_vectors_cli(
|
|||
nlp = util.get_lang_class(lang)()
|
||||
if jsonl_loc is not None:
|
||||
update_lexemes(nlp, jsonl_loc)
|
||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
||||
convert_vectors(
|
||||
nlp,
|
||||
vectors_loc,
|
||||
truncate=truncate,
|
||||
prune=prune,
|
||||
name=name,
|
||||
mode=mode,
|
||||
)
|
||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||
nlp.to_disk(output_dir)
|
||||
msg.good(
|
||||
|
|
|
@ -5,6 +5,7 @@ raw_text = null
|
|||
max_epochs = 1000
|
||||
dropout = 0.2
|
||||
n_save_every = null
|
||||
n_save_epoch = null
|
||||
component = "tok2vec"
|
||||
layer = ""
|
||||
corpus = "corpora.pretrain"
|
||||
|
|
|
@ -22,6 +22,9 @@ def setup_default_warnings():
|
|||
# warn once about lemmatizer without required POS
|
||||
filter_warning("once", error_msg=Warnings.W108)
|
||||
|
||||
# floret vector table cannot be modified
|
||||
filter_warning("once", error_msg="[W114]")
|
||||
|
||||
|
||||
def filter_warning(action: str, error_msg: str):
|
||||
"""Customize how spaCy should handle a certain warning.
|
||||
|
@ -186,6 +189,8 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"vectors are not identical to current pipeline vectors.")
|
||||
W114 = ("Using multiprocessing with GPU models is not recommended and may "
|
||||
"lead to errors.")
|
||||
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
|
||||
"Vectors are calculated from character ngrams.")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -277,7 +282,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"you forget to call the `set_extension` method?")
|
||||
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
||||
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
|
||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||
"package or a valid path to a data directory.")
|
||||
E052 = ("Can't find model directory: {path}")
|
||||
|
@ -511,13 +516,24 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||
|
||||
# New errors added in v3.x
|
||||
E866 = ("A SpanGroup is not functional after the corresponding Doc has "
|
||||
E858 = ("The {mode} vector table does not support this operation. "
|
||||
"{alternative}")
|
||||
E859 = ("The floret vector table cannot be modified.")
|
||||
E860 = ("Can't truncate fasttext-bloom vectors.")
|
||||
E861 = ("No 'keys' should be provided when initializing floret vectors "
|
||||
"with 'minn' and 'maxn'.")
|
||||
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
|
||||
E863 = ("'maxn' must be greater than or equal to 'minn'.")
|
||||
E864 = ("The complete vector table 'data' is required to initialize floret "
|
||||
"vectors.")
|
||||
E865 = ("A SpanGroup is not functional after the corresponding Doc has "
|
||||
"been garbage collected. To keep using the spans, make sure that "
|
||||
"the corresponding Doc object is still available in the scope of "
|
||||
"your function.")
|
||||
E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
|
||||
E867 = ("The 'textcat' component requires at least two labels because it "
|
||||
"uses mutually exclusive classes where exactly one label is True "
|
||||
"for each doc. For binary classification tasks, you can use two "
|
||||
|
|
20
spacy/kb.pyx
20
spacy/kb.pyx
|
@ -124,7 +124,7 @@ cdef class KnowledgeBase:
|
|||
def get_alias_strings(self):
|
||||
return [self.vocab.strings[x] for x in self._alias_index]
|
||||
|
||||
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
|
||||
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||
"""
|
||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
||||
Return the hash of the entity ID/name at the end.
|
||||
|
@ -185,15 +185,15 @@ cdef class KnowledgeBase:
|
|||
|
||||
i += 1
|
||||
|
||||
def contains_entity(self, unicode entity):
|
||||
def contains_entity(self, str entity):
|
||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||
return entity_hash in self._entry_index
|
||||
|
||||
def contains_alias(self, unicode alias):
|
||||
def contains_alias(self, str alias):
|
||||
cdef hash_t alias_hash = self.vocab.strings.add(alias)
|
||||
return alias_hash in self._alias_index
|
||||
|
||||
def add_alias(self, unicode alias, entities, probabilities):
|
||||
def add_alias(self, str alias, entities, probabilities):
|
||||
"""
|
||||
For a given alias, add its potential entities and prior probabilies to the KB.
|
||||
Return the alias_hash at the end
|
||||
|
@ -239,7 +239,7 @@ cdef class KnowledgeBase:
|
|||
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||
return alias_hash
|
||||
|
||||
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
|
||||
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
||||
"""
|
||||
For an alias already existing in the KB, extend its potential entities with one more.
|
||||
Throw a warning if either the alias or the entity is unknown,
|
||||
|
@ -286,7 +286,7 @@ cdef class KnowledgeBase:
|
|||
alias_entry.probs = probs
|
||||
self._aliases_table[alias_index] = alias_entry
|
||||
|
||||
def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
|
||||
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
|
@ -307,7 +307,7 @@ cdef class KnowledgeBase:
|
|||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
if entry_index != 0]
|
||||
|
||||
def get_vector(self, unicode entity):
|
||||
def get_vector(self, str entity):
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
|
||||
# Return an empty list if this entity is unknown in this KB
|
||||
|
@ -317,7 +317,7 @@ cdef class KnowledgeBase:
|
|||
|
||||
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||
|
||||
def get_prior_prob(self, unicode entity, unicode alias):
|
||||
def get_prior_prob(self, str entity, str alias):
|
||||
""" Return the prior probability of a given alias being linked to a given entity,
|
||||
or return 0.0 when this combination is not known in the knowledge base"""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
|
@ -587,7 +587,7 @@ cdef class Writer:
|
|||
def __init__(self, path):
|
||||
assert isinstance(path, Path)
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||
if not self._fp:
|
||||
raise IOError(Errors.E146.format(path=path))
|
||||
|
@ -629,7 +629,7 @@ cdef class Writer:
|
|||
cdef class Reader:
|
||||
def __init__(self, path):
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||
if not self._fp:
|
||||
PyErr_SetFromErrno(IOError)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
|
||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
|
||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||
|
||||
_suffixes = (
|
||||
_list_punct
|
||||
|
|
|
@ -1,265 +1,79 @@
|
|||
# Source: https://github.com/Alir3z4/stop-words
|
||||
|
||||
"""
|
||||
References:
|
||||
https://github.com/Alir3z4/stop-words - Original list, serves as a base.
|
||||
https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
|
||||
"""
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
а
|
||||
автентичен
|
||||
аз
|
||||
ако
|
||||
ала
|
||||
бе
|
||||
без
|
||||
беше
|
||||
би
|
||||
бивш
|
||||
бивша
|
||||
бившо
|
||||
бил
|
||||
била
|
||||
били
|
||||
било
|
||||
благодаря
|
||||
близо
|
||||
бъдат
|
||||
бъде
|
||||
бяха
|
||||
в
|
||||
вас
|
||||
ваш
|
||||
ваша
|
||||
вероятно
|
||||
вече
|
||||
взема
|
||||
ви
|
||||
вие
|
||||
винаги
|
||||
внимава
|
||||
време
|
||||
все
|
||||
всеки
|
||||
всички
|
||||
всичко
|
||||
всяка
|
||||
във
|
||||
въпреки
|
||||
върху
|
||||
г
|
||||
ги
|
||||
главен
|
||||
главна
|
||||
главно
|
||||
глас
|
||||
го
|
||||
година
|
||||
години
|
||||
годишен
|
||||
д
|
||||
да
|
||||
дали
|
||||
два
|
||||
двама
|
||||
двамата
|
||||
две
|
||||
двете
|
||||
ден
|
||||
днес
|
||||
дни
|
||||
до
|
||||
добра
|
||||
добре
|
||||
добро
|
||||
добър
|
||||
докато
|
||||
докога
|
||||
дори
|
||||
досега
|
||||
доста
|
||||
друг
|
||||
друга
|
||||
други
|
||||
е
|
||||
евтин
|
||||
едва
|
||||
един
|
||||
една
|
||||
еднаква
|
||||
еднакви
|
||||
еднакъв
|
||||
едно
|
||||
екип
|
||||
ето
|
||||
живот
|
||||
за
|
||||
забавям
|
||||
зад
|
||||
заедно
|
||||
заради
|
||||
засега
|
||||
заспал
|
||||
затова
|
||||
защо
|
||||
защото
|
||||
и
|
||||
из
|
||||
или
|
||||
им
|
||||
има
|
||||
имат
|
||||
иска
|
||||
й
|
||||
каза
|
||||
как
|
||||
каква
|
||||
какво
|
||||
както
|
||||
какъв
|
||||
като
|
||||
кога
|
||||
когато
|
||||
което
|
||||
които
|
||||
кой
|
||||
който
|
||||
колко
|
||||
която
|
||||
къде
|
||||
където
|
||||
към
|
||||
лесен
|
||||
лесно
|
||||
ли
|
||||
лош
|
||||
м
|
||||
май
|
||||
малко
|
||||
ме
|
||||
между
|
||||
мек
|
||||
мен
|
||||
месец
|
||||
ми
|
||||
много
|
||||
мнозина
|
||||
мога
|
||||
могат
|
||||
може
|
||||
мокър
|
||||
моля
|
||||
момента
|
||||
му
|
||||
н
|
||||
на
|
||||
над
|
||||
назад
|
||||
най
|
||||
направи
|
||||
напред
|
||||
например
|
||||
нас
|
||||
не
|
||||
него
|
||||
нещо
|
||||
нея
|
||||
ни
|
||||
ние
|
||||
никой
|
||||
нито
|
||||
нищо
|
||||
но
|
||||
нов
|
||||
нова
|
||||
нови
|
||||
новина
|
||||
някои
|
||||
някой
|
||||
няколко
|
||||
няма
|
||||
обаче
|
||||
около
|
||||
освен
|
||||
особено
|
||||
от
|
||||
отгоре
|
||||
отново
|
||||
още
|
||||
пак
|
||||
по
|
||||
повече
|
||||
повечето
|
||||
под
|
||||
поне
|
||||
поради
|
||||
после
|
||||
почти
|
||||
прави
|
||||
пред
|
||||
преди
|
||||
през
|
||||
при
|
||||
пък
|
||||
първата
|
||||
първи
|
||||
първо
|
||||
пъти
|
||||
равен
|
||||
равна
|
||||
с
|
||||
са
|
||||
сам
|
||||
само
|
||||
се
|
||||
сега
|
||||
си
|
||||
син
|
||||
скоро
|
||||
след
|
||||
следващ
|
||||
сме
|
||||
смях
|
||||
според
|
||||
сред
|
||||
срещу
|
||||
сте
|
||||
съм
|
||||
със
|
||||
също
|
||||
т
|
||||
тази
|
||||
така
|
||||
такива
|
||||
такъв
|
||||
там
|
||||
твой
|
||||
те
|
||||
тези
|
||||
ти
|
||||
т.н.
|
||||
то
|
||||
това
|
||||
тогава
|
||||
този
|
||||
той
|
||||
толкова
|
||||
точно
|
||||
три
|
||||
трябва
|
||||
тук
|
||||
тъй
|
||||
тя
|
||||
тях
|
||||
у
|
||||
утре
|
||||
харесва
|
||||
хиляди
|
||||
ч
|
||||
часа
|
||||
че
|
||||
често
|
||||
чрез
|
||||
ще
|
||||
щом
|
||||
а автентичен аз ако ала
|
||||
|
||||
бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
|
||||
бъде бъда бяха
|
||||
|
||||
в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
|
||||
всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
|
||||
вътре веднъж
|
||||
|
||||
г ги главен главна главно глас го годно година години годишен
|
||||
|
||||
д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
|
||||
добро добър достатъчно докато докога дори досега доста друг друга другаде други
|
||||
|
||||
е евтин едва един една еднаква еднакви еднакъв едно екип ето
|
||||
|
||||
живот жив
|
||||
|
||||
за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
|
||||
затова запазва започвам защо защото завинаги
|
||||
|
||||
и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
|
||||
извън имайки
|
||||
|
||||
й йо
|
||||
|
||||
каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
|
||||
когато когото което които кой който колко която къде където към край кратък
|
||||
кръгъл
|
||||
|
||||
лесен лесно ли летя летиш летим лош
|
||||
|
||||
м май малко макар малцина междувременно минус ме между мек мен месец ми мис
|
||||
мисля много мнозина мога могат може мой можем мокър моля момента му
|
||||
|
||||
н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
|
||||
например наопаки наполовина напоследък нека независимо нас насам наскоро
|
||||
настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
|
||||
нови новина някои някой някога някъде няколко няма
|
||||
|
||||
о обаче около описан опитах опитва опитвайки опитвам определен определено освен
|
||||
обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
|
||||
отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
|
||||
относно още
|
||||
|
||||
п пак по повече повечето под поне просто пряко поради после последен последно
|
||||
посочен почти прави прав прави правя пред преди през при пък първата първи първо
|
||||
път пъти плюс
|
||||
|
||||
равен равна различен различни разумен разумно
|
||||
|
||||
с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
|
||||
следващ следващия следва следното следователно случва сме смях собствен
|
||||
сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
|
||||
съответен съответно сте съм със също
|
||||
|
||||
т така техен техни такива такъв твърде там трета твой те тези ти то това
|
||||
тогава този той търси толкова точно три трябва тук тъй тя тях
|
||||
|
||||
у утре ужасно употреба успоредно уточнен уточняване
|
||||
|
||||
харесва харесали хиляди
|
||||
|
||||
ч часа ценя цяло цялостен че често чрез чудя
|
||||
|
||||
ще щеше щом щяха
|
||||
|
||||
юмрук
|
||||
я
|
||||
як
|
||||
|
||||
я як
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -1,10 +1,16 @@
|
|||
"""
|
||||
References:
|
||||
https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
|
||||
(countries, occupations, fields of studies and more).
|
||||
"""
|
||||
|
||||
from ...symbols import ORTH, NORM
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
||||
_abbr_exc = [
|
||||
# measurements
|
||||
for abbr in [
|
||||
{ORTH: "м", NORM: "метър"},
|
||||
{ORTH: "мм", NORM: "милиметър"},
|
||||
{ORTH: "см", NORM: "сантиметър"},
|
||||
|
@ -17,51 +23,191 @@ _abbr_exc = [
|
|||
{ORTH: "хл", NORM: "хектолиър"},
|
||||
{ORTH: "дкл", NORM: "декалитър"},
|
||||
{ORTH: "л", NORM: "литър"},
|
||||
]
|
||||
for abbr in _abbr_exc:
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
_abbr_line_exc = [
|
||||
# line abbreviations
|
||||
for abbr in [
|
||||
{ORTH: "г-жа", NORM: "госпожа"},
|
||||
{ORTH: "г-н", NORM: "господин"},
|
||||
{ORTH: "г-ца", NORM: "госпожица"},
|
||||
{ORTH: "д-р", NORM: "доктор"},
|
||||
{ORTH: "о-в", NORM: "остров"},
|
||||
{ORTH: "п-в", NORM: "полуостров"},
|
||||
]
|
||||
|
||||
for abbr in _abbr_line_exc:
|
||||
{ORTH: "с-у", NORM: "срещу"},
|
||||
{ORTH: "в-у", NORM: "върху"},
|
||||
{ORTH: "м-у", NORM: "между"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
_abbr_dot_exc = [
|
||||
# foreign language related abbreviations
|
||||
for abbr in [
|
||||
{ORTH: "англ.", NORM: "английски"},
|
||||
{ORTH: "ан.", NORM: "английски термин"},
|
||||
{ORTH: "араб.", NORM: "арабски"},
|
||||
{ORTH: "афр.", NORM: "африкански"},
|
||||
{ORTH: "гр.", NORM: "гръцки"},
|
||||
{ORTH: "лат.", NORM: "латински"},
|
||||
{ORTH: "рим.", NORM: "римски"},
|
||||
{ORTH: "старогр.", NORM: "старогръцки"},
|
||||
{ORTH: "староевр.", NORM: "староеврейски"},
|
||||
{ORTH: "фр.", NORM: "френски"},
|
||||
{ORTH: "хол.", NORM: "холандски"},
|
||||
{ORTH: "швед.", NORM: "шведски"},
|
||||
{ORTH: "шотл.", NORM: "шотландски"},
|
||||
{ORTH: "яп.", NORM: "японски"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
# profession and academic titles abbreviations
|
||||
for abbr in [
|
||||
{ORTH: "акад.", NORM: "академик"},
|
||||
{ORTH: "ал.", NORM: "алинея"},
|
||||
{ORTH: "арх.", NORM: "архитект"},
|
||||
{ORTH: "инж.", NORM: "инженер"},
|
||||
{ORTH: "канц.", NORM: "канцлер"},
|
||||
{ORTH: "проф.", NORM: "професор"},
|
||||
{ORTH: "св.", NORM: "свети"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
# fields of studies
|
||||
for abbr in [
|
||||
{ORTH: "агр.", NORM: "агрономия"},
|
||||
{ORTH: "ав.", NORM: "авиация"},
|
||||
{ORTH: "агр.", NORM: "агрономия"},
|
||||
{ORTH: "археол.", NORM: "археология"},
|
||||
{ORTH: "астр.", NORM: "астрономия"},
|
||||
{ORTH: "геод.", NORM: "геодезия"},
|
||||
{ORTH: "геол.", NORM: "геология"},
|
||||
{ORTH: "геом.", NORM: "геометрия"},
|
||||
{ORTH: "гимн.", NORM: "гимнастика"},
|
||||
{ORTH: "грам.", NORM: "граматика"},
|
||||
{ORTH: "жур.", NORM: "журналистика"},
|
||||
{ORTH: "журн.", NORM: "журналистика"},
|
||||
{ORTH: "зем.", NORM: "земеделие"},
|
||||
{ORTH: "икон.", NORM: "икономика"},
|
||||
{ORTH: "лит.", NORM: "литература"},
|
||||
{ORTH: "мат.", NORM: "математика"},
|
||||
{ORTH: "мед.", NORM: "медицина"},
|
||||
{ORTH: "муз.", NORM: "музика"},
|
||||
{ORTH: "печ.", NORM: "печатарство"},
|
||||
{ORTH: "пол.", NORM: "политика"},
|
||||
{ORTH: "псих.", NORM: "психология"},
|
||||
{ORTH: "соц.", NORM: "социология"},
|
||||
{ORTH: "стат.", NORM: "статистика"},
|
||||
{ORTH: "стил.", NORM: "стилистика"},
|
||||
{ORTH: "топогр.", NORM: "топография"},
|
||||
{ORTH: "търг.", NORM: "търговия"},
|
||||
{ORTH: "фарм.", NORM: "фармацевтика"},
|
||||
{ORTH: "фехт.", NORM: "фехтовка"},
|
||||
{ORTH: "физиол.", NORM: "физиология"},
|
||||
{ORTH: "физ.", NORM: "физика"},
|
||||
{ORTH: "фил.", NORM: "философия"},
|
||||
{ORTH: "фин.", NORM: "финанси"},
|
||||
{ORTH: "фолкл.", NORM: "фолклор"},
|
||||
{ORTH: "фон.", NORM: "фонетика"},
|
||||
{ORTH: "фот.", NORM: "фотография"},
|
||||
{ORTH: "футб.", NORM: "футбол"},
|
||||
{ORTH: "хим.", NORM: "химия"},
|
||||
{ORTH: "хир.", NORM: "хирургия"},
|
||||
{ORTH: "ел.", NORM: "електротехника"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
for abbr in [
|
||||
{ORTH: "ал.", NORM: "алинея"},
|
||||
{ORTH: "авт.", NORM: "автоматично"},
|
||||
{ORTH: "адм.", NORM: "администрация"},
|
||||
{ORTH: "арт.", NORM: "артилерия"},
|
||||
{ORTH: "бл.", NORM: "блок"},
|
||||
{ORTH: "бр.", NORM: "брой"},
|
||||
{ORTH: "бул.", NORM: "булевард"},
|
||||
{ORTH: "букв.", NORM: "буквално"},
|
||||
{ORTH: "в.", NORM: "век"},
|
||||
{ORTH: "вр.", NORM: "време"},
|
||||
{ORTH: "вм.", NORM: "вместо"},
|
||||
{ORTH: "воен.", NORM: "военен термин"},
|
||||
{ORTH: "г.", NORM: "година"},
|
||||
{ORTH: "гр.", NORM: "град"},
|
||||
{ORTH: "гл.", NORM: "глагол"},
|
||||
{ORTH: "др.", NORM: "други"},
|
||||
{ORTH: "ез.", NORM: "езеро"},
|
||||
{ORTH: "ж.р.", NORM: "женски род"},
|
||||
{ORTH: "инж.", NORM: "инженер"},
|
||||
{ORTH: "жп.", NORM: "железопът"},
|
||||
{ORTH: "застр.", NORM: "застрахователно дело"},
|
||||
{ORTH: "знач.", NORM: "значение"},
|
||||
{ORTH: "и др.", NORM: "и други"},
|
||||
{ORTH: "и под.", NORM: "и подобни"},
|
||||
{ORTH: "и пр.", NORM: "и прочие"},
|
||||
{ORTH: "изр.", NORM: "изречение"},
|
||||
{ORTH: "изт.", NORM: "източен"},
|
||||
{ORTH: "конкр.", NORM: "конкретно"},
|
||||
{ORTH: "лв.", NORM: "лев"},
|
||||
{ORTH: "л.", NORM: "лице"},
|
||||
{ORTH: "м.р.", NORM: "мъжки род"},
|
||||
{ORTH: "мат.", NORM: "математика"},
|
||||
{ORTH: "мед.", NORM: "медицина"},
|
||||
{ORTH: "мин.вр.", NORM: "минало време"},
|
||||
{ORTH: "мн.ч.", NORM: "множествено число"},
|
||||
{ORTH: "напр.", NORM: "например"},
|
||||
{ORTH: "нар.", NORM: "наречие"},
|
||||
{ORTH: "науч.", NORM: "научен термин"},
|
||||
{ORTH: "непр.", NORM: "неправилно"},
|
||||
{ORTH: "обик.", NORM: "обикновено"},
|
||||
{ORTH: "опред.", NORM: "определение"},
|
||||
{ORTH: "особ.", NORM: "особено"},
|
||||
{ORTH: "ост.", NORM: "остаряло"},
|
||||
{ORTH: "относ.", NORM: "относително"},
|
||||
{ORTH: "отр.", NORM: "отрицателно"},
|
||||
{ORTH: "пл.", NORM: "площад"},
|
||||
{ORTH: "проф.", NORM: "професор"},
|
||||
{ORTH: "пад.", NORM: "падеж"},
|
||||
{ORTH: "парл.", NORM: "парламентарен"},
|
||||
{ORTH: "погов.", NORM: "поговорка"},
|
||||
{ORTH: "пон.", NORM: "понякога"},
|
||||
{ORTH: "правосл.", NORM: "православен"},
|
||||
{ORTH: "прибл.", NORM: "приблизително"},
|
||||
{ORTH: "прил.", NORM: "прилагателно име"},
|
||||
{ORTH: "пр.", NORM: "прочие"},
|
||||
{ORTH: "с.", NORM: "село"},
|
||||
{ORTH: "с.р.", NORM: "среден род"},
|
||||
{ORTH: "св.", NORM: "свети"},
|
||||
{ORTH: "сп.", NORM: "списание"},
|
||||
{ORTH: "стр.", NORM: "страница"},
|
||||
{ORTH: "сз.", NORM: "съюз"},
|
||||
{ORTH: "сег.", NORM: "сегашно"},
|
||||
{ORTH: "сп.", NORM: "спорт"},
|
||||
{ORTH: "срв.", NORM: "сравни"},
|
||||
{ORTH: "с.ст.", NORM: "селскостопанска техника"},
|
||||
{ORTH: "счет.", NORM: "счетоводство"},
|
||||
{ORTH: "съкр.", NORM: "съкратено"},
|
||||
{ORTH: "съобщ.", NORM: "съобщение"},
|
||||
{ORTH: "същ.", NORM: "съществително"},
|
||||
{ORTH: "текст.", NORM: "текстилен"},
|
||||
{ORTH: "телев.", NORM: "телевизия"},
|
||||
{ORTH: "тел.", NORM: "телефон"},
|
||||
{ORTH: "т.е.", NORM: "тоест"},
|
||||
{ORTH: "т.н.", NORM: "така нататък"},
|
||||
{ORTH: "т.нар.", NORM: "така наречен"},
|
||||
{ORTH: "търж.", NORM: "тържествено"},
|
||||
{ORTH: "ул.", NORM: "улица"},
|
||||
{ORTH: "уч.", NORM: "училище"},
|
||||
{ORTH: "унив.", NORM: "университет"},
|
||||
{ORTH: "харт.", NORM: "хартия"},
|
||||
{ORTH: "хидр.", NORM: "хидравлика"},
|
||||
{ORTH: "хран.", NORM: "хранителна"},
|
||||
{ORTH: "църк.", NORM: "църковен термин"},
|
||||
{ORTH: "числ.", NORM: "числително"},
|
||||
{ORTH: "чл.", NORM: "член"},
|
||||
]
|
||||
|
||||
for abbr in _abbr_dot_exc:
|
||||
{ORTH: "ч.", NORM: "число"},
|
||||
{ORTH: "числ.", NORM: "числително"},
|
||||
{ORTH: "шахм.", NORM: "шахмат"},
|
||||
{ORTH: "шах.", NORM: "шахмат"},
|
||||
{ORTH: "юр.", NORM: "юридически"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
# slash abbreviations
|
||||
for abbr in [
|
||||
{ORTH: "м/у", NORM: "между"},
|
||||
{ORTH: "с/у", NORM: "срещу"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
|
@ -23,13 +23,25 @@ class Bengali(Language):
|
|||
@Bengali.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
23
spacy/lang/ca/__init__.py
Normal file → Executable file
23
spacy/lang/ca/__init__.py
Normal file → Executable file
|
@ -1,9 +1,9 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
@ -15,6 +15,7 @@ class CatalanDefaults(BaseDefaults):
|
|||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
@ -28,13 +29,25 @@ class Catalan(Language):
|
|||
@Catalan.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return CatalanLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Catalan"]
|
||||
|
|
11
spacy/lang/ca/punctuation.py
Normal file → Executable file
11
spacy/lang/ca/punctuation.py
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import LIST_CURRENCY
|
||||
from ..char_classes import CURRENCY
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||
from ..char_classes import merge_chars, _units
|
||||
|
@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
|
|||
|
||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||
|
||||
_prefixes = (
|
||||
["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
|
||||
+ LIST_PUNCT
|
||||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
+ LIST_CURRENCY
|
||||
+ LIST_ICONS
|
||||
)
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
|
@ -18,6 +27,7 @@ _infixes = (
|
|||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
|
||||
r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
|
||||
]
|
||||
)
|
||||
|
||||
|
@ -44,3 +54,4 @@ _suffixes = (
|
|||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
|
|
21
spacy/lang/ca/tokenizer_exceptions.py
Normal file → Executable file
21
spacy/lang/ca/tokenizer_exceptions.py
Normal file → Executable file
|
@ -18,12 +18,21 @@ for exc_data in [
|
|||
{ORTH: "nov.", NORM: "novembre"},
|
||||
{ORTH: "dec.", NORM: "desembre"},
|
||||
{ORTH: "Dr.", NORM: "doctor"},
|
||||
{ORTH: "Dra.", NORM: "doctora"},
|
||||
{ORTH: "Sr.", NORM: "senyor"},
|
||||
{ORTH: "Sra.", NORM: "senyora"},
|
||||
{ORTH: "Srta.", NORM: "senyoreta"},
|
||||
{ORTH: "núm", NORM: "número"},
|
||||
{ORTH: "St.", NORM: "sant"},
|
||||
{ORTH: "Sta.", NORM: "santa"},
|
||||
{ORTH: "pl.", NORM: "plaça"},
|
||||
{ORTH: "à."},
|
||||
{ORTH: "è."},
|
||||
{ORTH: "é."},
|
||||
{ORTH: "í."},
|
||||
{ORTH: "ò."},
|
||||
{ORTH: "ó."},
|
||||
{ORTH: "ú."},
|
||||
{ORTH: "'l"},
|
||||
{ORTH: "'ls"},
|
||||
{ORTH: "'m"},
|
||||
|
@ -34,6 +43,18 @@ for exc_data in [
|
|||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
|
||||
_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
|
||||
|
||||
_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
|
||||
_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
|
||||
|
||||
_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
|
||||
_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
|
||||
|
||||
_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
|
||||
|
||||
|
||||
# Times
|
||||
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
@ -28,13 +28,25 @@ class Greek(Language):
|
|||
@Greek.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return GreekLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Greek"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
@ -26,13 +26,25 @@ class English(Language):
|
|||
@English.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return EnglishLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["English"]
|
||||
|
|
|
@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
|
|||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
|
||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
"""
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -26,13 +26,25 @@ class Spanish(Language):
|
|||
@Spanish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return SpanishLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Spanish"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
@ -26,13 +26,25 @@ class Persian(Language):
|
|||
@Persian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
|
@ -31,13 +31,25 @@ class French(Language):
|
|||
@French.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return FrenchLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["French"]
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
from .lemmatizer import IrishLemmatizer
|
||||
|
||||
|
||||
class IrishDefaults(BaseDefaults):
|
||||
|
@ -13,4 +18,16 @@ class Irish(Language):
|
|||
Defaults = IrishDefaults
|
||||
|
||||
|
||||
@Irish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
):
|
||||
return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
|
||||
|
||||
__all__ = ["Irish"]
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
# fmt: off
|
||||
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
|
||||
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
|
||||
slender_vowels = ["e", "é", "i", "í"]
|
||||
vowels = broad_vowels + slender_vowels
|
||||
# fmt: on
|
||||
|
||||
|
||||
def ends_dentals(word):
|
||||
if word != "" and word[-1] in ["d", "n", "t", "s"]:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def devoice(word):
|
||||
if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
|
||||
return word[:-1] + "t"
|
||||
else:
|
||||
return word
|
||||
|
||||
|
||||
def ends_with_vowel(word):
|
||||
return word != "" and word[-1] in vowels
|
||||
|
||||
|
||||
def starts_with_vowel(word):
|
||||
return word != "" and word[0] in vowels
|
||||
|
||||
|
||||
def deduplicate(word):
|
||||
if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
|
||||
return word[:-1]
|
||||
else:
|
||||
return word
|
162
spacy/lang/ga/lemmatizer.py
Normal file
162
spacy/lang/ga/lemmatizer.py
Normal file
|
@ -0,0 +1,162 @@
|
|||
from typing import List, Dict, Tuple
|
||||
|
||||
from ...pipeline import Lemmatizer
|
||||
from ...tokens import Token
|
||||
|
||||
|
||||
class IrishLemmatizer(Lemmatizer):
    """Lookup-based lemmatizer for Irish.

    The lookup data is extracted from BuNaMo
    (https://github.com/michmech/BuNaMo).
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the lookup table names used by the given mode.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): For "pos_lookup", the four
            per-POS lemma tables plus an empty second list; for any other
            mode, whatever the parent class reports.
        """
        if mode != "pos_lookup":
            return super().get_lookups_config(mode)
        # fmt: off
        required_tables = [
            "lemma_lookup_adj", "lemma_lookup_adp",
            "lemma_lookup_noun", "lemma_lookup_verb"
        ]
        # fmt: on
        return (required_tables, [])

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token via the lookup table for its coarse POS tag.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): The candidate lemmas; falls back to the
            lower-cased (unponc'ed) token text when nothing is found.
        """
        pos = token.pos_
        text = unponc(token.text)
        lowered = text.lower()
        # Only these tags have (or map onto) a lookup table.
        if pos not in ("PROPN", "ADP", "ADJ", "NOUN", "VERB"):
            return [lowered]

        stripped = demutate(text)
        # A leading "h" before a vowel may be h-prothesis, so the form
        # without it is kept as a second lookup key.
        h_stripped = ""
        if text[0:1].lower() == "h" and text[1:2].lower() in "aáeéiíoóuú":
            h_stripped = text[1:]

        # Proper nouns share the noun table.
        table_pos = "noun" if pos == "PROPN" else pos.lower()

        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if pos == "NOUN":
                if "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph:
                    has_h_prefix = "Form=HPref" in token.morph
                    return [demutate(text, has_h_prefix).lower()]
            elif pos == "ADJ" and "VerbForm=Part" in token.morph:
                return [demutate(text).lower()]

        table = self.lookups.get_table("lemma_lookup_" + table_pos, {})

        def as_list(entry):
            # Normalise a table hit (missing, scalar or list) to a list.
            if entry is None:
                return []
            if isinstance(entry, list):
                return entry
            return [entry]

        if pos == "ADP":
            return as_list(table.get(text, lowered))

        # Proper nouns are looked up with their casing intact; everything
        # else is looked up in lower case.
        if pos == "PROPN":
            keys = (stripped, h_stripped)
        else:
            keys = (stripped.lower(), h_stripped.lower())

        lemmas = []
        for key in keys:
            lemmas.extend(as_list(table.get(key)))
        if not lemmas:
            lemmas = [lowered]
        return lemmas
|
||||
|
||||
|
||||
def demutate(word: str, is_hpref: bool = False) -> str:
    """Strip an initial mutation from the start of a word.

    At most one eclipsis, t-prothesis or h-prothesis prefix is removed.
    Lenition (an "h" directly after an initial b, c, d, f, g, m, p, s or t)
    is then removed from whatever is left, so text that combines eclipsis
    and lenition is handled as well. The case of the remaining characters is
    preserved.

    word (str): The surface form to demutate.
    is_hpref (bool): Whether a leading "h" is known to be a prothetic
        prefix, in which case it is removed regardless of what follows it.
    RETURNS (str): The word without its initial mutation.
    """
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    # (lower-cased prefix, number of leading characters to drop). No entry is
    # a prefix of another one, so at most one of them can match.
    # fmt: off
    PREFIXES = (
        # eclipsis
        ("bhf", 2), ("mb", 1), ("gc", 1), ("nd", 1),
        ("ng", 1), ("bp", 1), ("dt", 1),
        # non-standard (hyphenated) eclipsis
        ("bh-f", 3), ("m-b", 2), ("g-c", 2), ("n-d", 2),
        ("n-g", 2), ("b-p", 2), ("d-t", 2),
        # t-prothesis
        ("ts", 1), ("t-s", 2),
    )
    # fmt: on
    lc = word.lower()
    first, second, third = word[0:1], word[1:2], word[2:3]
    for prefix, drop in PREFIXES:
        if lc.startswith(prefix):
            word = word[drop:]
            break
    else:
        # `"" in VOWELS` is True, so the vowel checks must also require that
        # a character is actually present; otherwise a bare "n", "n-" or "h"
        # would be reduced to the empty string.
        if first == "n" and second and second in UVOWELS:
            # eclipsis of a capitalised vowel-initial word
            word = word[1:]
        elif lc.startswith("n-") and third and third in LVOWELS:
            # eclipsis of a lower-case vowel-initial word
            word = word[2:]
        elif is_hpref and first == "h":
            # h-prothesis, if known to be present
            word = word[1:]
        elif first == "h" and second and second in UVOWELS:
            # h-prothesis, simple case
            # words can also begin with 'h', but unlike eclipsis,
            # a hyphen is not used, so that needs to be handled
            # elsewhere
            word = word[1:]

    # lenition
    # this is deliberately not part of the chain above, to handle
    # super-non-standard text where both eclipsis and lenition were used.
    # The check has to look at the word as it is *now*: testing the original
    # text would treat the "bh" of an eclipsed "bhf..." as lenition and eat
    # a letter of the stem.
    lc = word.lower()
    if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
        word = word[0:1] + word[2:]

    return word
|
||||
|
||||
|
||||
def unponc(word: str) -> str:
    """Replace dotted consonants with their consonant + "h" spelling.

    Every character that has an entry in the table below is replaced by its
    two-letter equivalent (lower case for lower case, upper case for upper
    case); all other characters are copied through unchanged.

    word (str): The text to convert.
    RETURNS (str): The converted text.
    """
    # fmt: off
    PONC = {
        "ḃ": "bh",
        "ċ": "ch",
        "ḋ": "dh",
        "ḟ": "fh",
        "ġ": "gh",
        "ṁ": "mh",
        "ṗ": "ph",
        "ṡ": "sh",
        "ṫ": "th",
        "Ḃ": "BH",
        "Ċ": "CH",
        "Ḋ": "DH",
        "Ḟ": "FH",
        "Ġ": "GH",
        "Ṁ": "MH",
        "Ṗ": "PH",
        "Ṡ": "SH",
        "Ṫ": "TH"
    }
    # fmt: on
    # Characters without a table entry map to themselves.
    return "".join(PONC.get(char, char) for char in word)
|
|
@ -9,6 +9,8 @@ _exc = {
|
|||
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
|
||||
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
|
||||
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
|
||||
"théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
|
||||
"tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
|
||||
}
|
||||
|
||||
for exc_data in [
|
||||
|
|
|
@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
|
|||
)
|
||||
|
||||
|
||||
for u in "cfkCFK":
|
||||
_exc[f"°{u}"] = [{ORTH: f"°{u}"}]
|
||||
_exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -23,13 +23,25 @@ class Italian(Language):
|
|||
@Italian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pos_lookup",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return ItalianLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Italian"]
|
||||
|
|
|
@ -1,21 +1,25 @@
|
|||
from typing import Optional, Union, Dict, Any
|
||||
from typing import Optional, Union, Dict, Any, Callable
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
from collections import namedtuple
|
||||
from thinc.api import Model
|
||||
import re
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tag_map import TAG_MAP
|
||||
from .tag_orth_map import TAG_ORTH_MAP
|
||||
from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||
from ...compat import copy_reg
|
||||
from ...errors import Errors
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...pipeline import Morphologizer
|
||||
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
||||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...tokens import Doc
|
||||
from ...tokens import Doc, MorphAnalysis
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
from ... import util
|
||||
|
||||
|
||||
|
@ -31,16 +35,21 @@ split_mode = null
|
|||
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
|
||||
def create_tokenizer(split_mode: Optional[str] = None):
|
||||
def japanese_tokenizer_factory(nlp):
|
||||
return JapaneseTokenizer(nlp, split_mode=split_mode)
|
||||
return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
|
||||
|
||||
return japanese_tokenizer_factory
|
||||
|
||||
|
||||
class JapaneseTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
|
||||
self.vocab = nlp.vocab
|
||||
def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
|
||||
self.vocab = vocab
|
||||
self.split_mode = split_mode
|
||||
self.tokenizer = try_sudachi_import(self.split_mode)
|
||||
# if we're using split mode A we don't need subtokens
|
||||
self.need_subtokens = not (split_mode is None or split_mode == "A")
|
||||
|
||||
def __reduce__(self):
|
||||
return JapaneseTokenizer, (self.vocab, self.split_mode)
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
|
||||
|
@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
|
||||
|
||||
# create Doc with tag bi-gram based part-of-speech identification rules
|
||||
words, tags, inflections, lemmas, readings, sub_tokens_list = (
|
||||
zip(*dtokens) if dtokens else [[]] * 6
|
||||
words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
|
||||
zip(*dtokens) if dtokens else [[]] * 7
|
||||
)
|
||||
sub_tokens_list = list(sub_tokens_list)
|
||||
doc = Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
)
|
||||
# if there's no lemma info (it's an unk) just use the surface
|
||||
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
|
||||
doc.user_data["inflections"] = inflections
|
||||
doc.user_data["reading_forms"] = readings
|
||||
doc.user_data["sub_tokens"] = sub_tokens_list
|
||||
morph = {}
|
||||
if dtoken.inf:
|
||||
# it's normal for this to be empty for non-inflecting types
|
||||
morph["Inflection"] = dtoken.inf
|
||||
token.norm_ = dtoken.norm
|
||||
if dtoken.reading:
|
||||
# punctuation is its own reading, but we don't want values like
|
||||
# "=" here
|
||||
morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
|
||||
token.morph = MorphAnalysis(self.vocab, morph)
|
||||
if self.need_subtokens:
|
||||
doc.user_data["sub_tokens"] = sub_tokens_list
|
||||
return doc
|
||||
|
||||
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
|
||||
|
@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
DetailedToken(
|
||||
token.surface(), # orth
|
||||
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
|
||||
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
|
||||
";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
|
||||
token.dictionary_form(), # lemma
|
||||
token.reading_form(), # user_data['reading_forms']
|
||||
token.normalized_form(),
|
||||
token.reading_form(),
|
||||
sub_tokens_list[idx]
|
||||
if sub_tokens_list
|
||||
else None, # user_data['sub_tokens']
|
||||
|
@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
]
|
||||
|
||||
def _get_sub_tokens(self, sudachipy_tokens):
|
||||
if (
|
||||
self.split_mode is None or self.split_mode == "A"
|
||||
): # do nothing for default split mode
|
||||
# do nothing for default split mode
|
||||
if not self.need_subtokens:
|
||||
return None
|
||||
|
||||
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
|
||||
|
@ -176,9 +194,33 @@ class Japanese(Language):
|
|||
Defaults = JapaneseDefaults
|
||||
|
||||
|
||||
@Japanese.factory(
    "morphologizer",
    assigns=["token.morph", "token.pos"],
    default_config={
        "model": DEFAULT_MORPH_MODEL,
        "overwrite": True,
        "extend": True,
        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
    },
    default_score_weights={
        "pos_acc": 0.5,
        "morph_micro_f": 0.5,
        "morph_per_feat": None,
    },
)
def make_morphologizer(
    nlp: Language,
    model: Model,
    name: str,
    overwrite: bool,
    extend: bool,
    scorer: Optional[Callable],
):
    """Build the "morphologizer" component registered on Japanese.

    nlp (Language): The pipeline; only its vocab is passed on.
    model (Model): The model handed to the Morphologizer.
    name (str): The component name.
    overwrite (bool): Forwarded to the Morphologizer unchanged.
    extend (bool): Forwarded to the Morphologizer unchanged.
    scorer (Optional[Callable]): Forwarded to the Morphologizer unchanged.
    RETURNS (Morphologizer): The constructed component.
    """
    # The registered defaults above set both overwrite and extend to True.
    settings = {"overwrite": overwrite, "extend": extend, "scorer": scorer}
    return Morphologizer(nlp.vocab, model, name, **settings)
|
||||
|
||||
|
||||
# Hold the attributes we need with convenient names
|
||||
DetailedToken = namedtuple(
|
||||
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
|
||||
"DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
|
||||
)
|
||||
|
||||
|
||||
|
@ -254,7 +296,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
|||
return text_dtokens, text_spaces
|
||||
elif len([word for word in words if not word.isspace()]) == 0:
|
||||
assert text.isspace()
|
||||
text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)]
|
||||
text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
|
||||
text_spaces = [False]
|
||||
return text_dtokens, text_spaces
|
||||
|
||||
|
@ -271,7 +313,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
|||
# space token
|
||||
if word_start > 0:
|
||||
w = text[text_pos : text_pos + word_start]
|
||||
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
|
||||
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
|
||||
text_spaces.append(False)
|
||||
text_pos += word_start
|
||||
|
||||
|
@ -287,16 +329,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
|||
# trailing space token
|
||||
if text_pos < len(text):
|
||||
w = text[text_pos:]
|
||||
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
|
||||
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
|
||||
text_spaces.append(False)
|
||||
|
||||
return text_dtokens, text_spaces
|
||||
|
||||
|
||||
def pickle_japanese(instance):
|
||||
return Japanese, tuple()
|
||||
|
||||
|
||||
copy_reg.pickle(Japanese, pickle_japanese)
|
||||
|
||||
__all__ = ["Japanese"]
|
||||
|
|
|
@ -5,11 +5,11 @@ from .tag_map import TAG_MAP
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...tokens import Doc
|
||||
from ...compat import copy_reg
|
||||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
|
|||
@registry.tokenizers("spacy.ko.KoreanTokenizer")
|
||||
def create_tokenizer():
|
||||
def korean_tokenizer_factory(nlp):
|
||||
return KoreanTokenizer(nlp)
|
||||
return KoreanTokenizer(nlp.vocab)
|
||||
|
||||
return korean_tokenizer_factory
|
||||
|
||||
|
||||
class KoreanTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language):
|
||||
self.vocab = nlp.vocab
|
||||
def __init__(self, vocab: Vocab):
|
||||
self.vocab = vocab
|
||||
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||
|
||||
def __reduce__(self):
|
||||
return KoreanTokenizer, (self.vocab,)
|
||||
|
||||
def __del__(self):
|
||||
self.mecab_tokenizer.__del__()
|
||||
|
||||
|
@ -106,10 +109,4 @@ def check_spaces(text, tokens):
|
|||
yield False
|
||||
|
||||
|
||||
def pickle_korean(instance):
|
||||
return Korean, tuple()
|
||||
|
||||
|
||||
copy_reg.pickle(Korean, pickle_korean)
|
||||
|
||||
__all__ = ["Korean"]
|
||||
|
|
|
@ -3,6 +3,7 @@ import unicodedata
|
|||
import re
|
||||
|
||||
from .. import attrs
|
||||
from .tokenizer_exceptions import URL_MATCH
|
||||
|
||||
|
||||
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
||||
|
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
|
|||
return True
|
||||
if tld.isalpha() and tld in _tlds:
|
||||
return True
|
||||
if URL_MATCH(text):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
from .lemmatizer import MacedonianLemmatizer
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -38,13 +38,25 @@ class Macedonian(Language):
|
|||
@Macedonian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return MacedonianLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Macedonian"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
|
@ -26,13 +26,25 @@ class Norwegian(Language):
|
|||
@Norwegian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
|
@ -30,13 +30,25 @@ class Dutch(Language):
|
|||
@Dutch.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return DutchLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Dutch"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
|
@ -33,13 +33,25 @@ class Polish(Language):
|
|||
@Polish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pos_lookup",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return PolishLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Polish"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -22,7 +22,12 @@ class Russian(Language):
|
|||
@Russian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pymorphy2",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
@ -31,8 +36,11 @@ def make_lemmatizer(
|
|||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return RussianLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Russian"]
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
from typing import Optional, List, Dict, Tuple
|
||||
from typing import Optional, List, Dict, Tuple, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from ...pipeline import Lemmatizer
|
||||
from ...pipeline.lemmatizer import lemmatizer_score
|
||||
from ...symbols import POS
|
||||
from ...tokens import Token
|
||||
from ...vocab import Vocab
|
||||
|
@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
|
|||
*,
|
||||
mode: str = "pymorphy2",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
if mode == "pymorphy2":
|
||||
try:
|
||||
|
@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
|
|||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer()
|
||||
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
|
||||
|
||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||
string = token.text
|
||||
|
|
|
@ -1,47 +1,195 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
අතර
|
||||
එච්චර
|
||||
එපමණ
|
||||
එලෙස
|
||||
එවිට
|
||||
ඒ
|
||||
කට
|
||||
කදී
|
||||
කින්
|
||||
ක්
|
||||
ට
|
||||
තුර
|
||||
ත්
|
||||
ද
|
||||
නමුත්
|
||||
නොහොත්
|
||||
පමණ
|
||||
පමණි
|
||||
ම
|
||||
මෙච්චර
|
||||
මෙපමණ
|
||||
මෙලෙස
|
||||
මෙවිට
|
||||
මේ
|
||||
ය
|
||||
යි
|
||||
ලදී
|
||||
සහ
|
||||
සමග
|
||||
සමඟ
|
||||
අහා
|
||||
ආහ්
|
||||
ආ
|
||||
ඕහෝ
|
||||
අනේ
|
||||
අඳෝ
|
||||
අපොයි
|
||||
අපෝ
|
||||
අයියෝ
|
||||
ආයි
|
||||
ඌයි
|
||||
චී
|
||||
චිහ්
|
||||
චික්
|
||||
හෝ
|
||||
දෝ
|
||||
දෝහෝ
|
||||
මෙන්
|
||||
සේ
|
||||
වැනි
|
||||
බඳු
|
||||
වන්
|
||||
අයුරු
|
||||
අයුරින්
|
||||
ලෙස
|
||||
වගේ
|
||||
වැඩි
|
||||
ශ්රී
|
||||
හා
|
||||
ය
|
||||
නිසා
|
||||
නිසාවෙන්
|
||||
බවට
|
||||
බව
|
||||
බවෙන්
|
||||
නම්
|
||||
වැඩි
|
||||
සිට
|
||||
දී
|
||||
මහා
|
||||
මහ
|
||||
පමණ
|
||||
පමණින්
|
||||
පමන
|
||||
වන
|
||||
විට
|
||||
විටෙක
|
||||
විතර
|
||||
විය
|
||||
වුව
|
||||
වුවත්
|
||||
වුවද
|
||||
වූ
|
||||
සමඟ
|
||||
විටින්
|
||||
මේ
|
||||
මෙලෙස
|
||||
මෙයින්
|
||||
ඇති
|
||||
ලෙස
|
||||
සිදු
|
||||
වශයෙන්
|
||||
යන
|
||||
සඳහා
|
||||
මගින්
|
||||
හෝ
|
||||
ඉතා
|
||||
ඒ
|
||||
එම
|
||||
ද
|
||||
අතර
|
||||
විසින්
|
||||
සමග
|
||||
පිළිබඳව
|
||||
පිළිබඳ
|
||||
තුළ
|
||||
බව
|
||||
වැනි
|
||||
මහ
|
||||
මෙම
|
||||
මෙහි
|
||||
මේ
|
||||
වෙත
|
||||
වෙතින්
|
||||
වෙතට
|
||||
වෙනුවෙන්
|
||||
වෙනුවට
|
||||
වෙන
|
||||
ගැන
|
||||
නෑ
|
||||
අනුව
|
||||
නව
|
||||
පිළිබඳ
|
||||
විශේෂ
|
||||
දැනට
|
||||
එහෙන්
|
||||
මෙහෙන්
|
||||
එහේ
|
||||
මෙහේ
|
||||
ම
|
||||
තවත්
|
||||
තව
|
||||
සහ
|
||||
හා
|
||||
දක්වා
|
||||
ට
|
||||
ගේ
|
||||
එ
|
||||
ක
|
||||
ක්
|
||||
බවත්
|
||||
බවද
|
||||
මත
|
||||
ඇතුලු
|
||||
ඇතුළු
|
||||
මෙසේ
|
||||
වඩා
|
||||
වඩාත්ම
|
||||
නිති
|
||||
නිතිත්
|
||||
නිතොර
|
||||
නිතර
|
||||
ඉක්බිති
|
||||
දැන්
|
||||
යලි
|
||||
පුන
|
||||
ඉතින්
|
||||
සිට
|
||||
සිටන්
|
||||
පටන්
|
||||
තෙක්
|
||||
දක්වා
|
||||
සා
|
||||
තාක්
|
||||
තුවක්
|
||||
පවා
|
||||
ද
|
||||
හෝ
|
||||
වත්
|
||||
විනා
|
||||
හැර
|
||||
මිස
|
||||
මුත්
|
||||
කිම
|
||||
කිම්
|
||||
ඇයි
|
||||
මන්ද
|
||||
හෙවත්
|
||||
හෝ
|
||||
නොහොත්
|
||||
පතා
|
||||
පාසා
|
||||
ගානෙ
|
||||
තව
|
||||
ඉතා
|
||||
බොහෝ
|
||||
වහා
|
||||
සෙද
|
||||
සැනින්
|
||||
හනික
|
||||
එම්බා
|
||||
එම්බල
|
||||
බොල
|
||||
නම්
|
||||
වනාහි
|
||||
කලී
|
||||
ඉඳුරා
|
||||
අන්න
|
||||
ඔන්න
|
||||
මෙන්න
|
||||
උදෙසා
|
||||
පිණිස
|
||||
සඳහා
|
||||
අරබයා
|
||||
නිසා
|
||||
එනිසා
|
||||
එබැවින්
|
||||
බැවින්
|
||||
හෙයින්
|
||||
සේක්
|
||||
සේක
|
||||
ගැන
|
||||
අනුව
|
||||
පරිදි
|
||||
විට
|
||||
තෙක්
|
||||
මෙතෙක්
|
||||
මේතාක්
|
||||
තුරු
|
||||
තුරා
|
||||
තුරාවට
|
||||
තුලින්
|
||||
නමුත්
|
||||
එනමුත්
|
||||
වස්
|
||||
මෙන්
|
||||
ලෙස
|
||||
පරිදි
|
||||
එහෙත්
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -29,13 +29,25 @@ class Swedish(Language):
|
|||
@Swedish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "rule",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
|
|||
from ...language import Language, BaseDefaults
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
|
|||
@registry.tokenizers("spacy.th.ThaiTokenizer")
|
||||
def create_thai_tokenizer():
|
||||
def thai_tokenizer_factory(nlp):
|
||||
return ThaiTokenizer(nlp)
|
||||
return ThaiTokenizer(nlp.vocab)
|
||||
|
||||
return thai_tokenizer_factory
|
||||
|
||||
|
||||
class ThaiTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language) -> None:
|
||||
def __init__(self, vocab: Vocab) -> None:
|
||||
try:
|
||||
from pythainlp.tokenize import word_tokenize
|
||||
except ImportError:
|
||||
|
@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
|
|||
"https://github.com/PyThaiNLP/pythainlp"
|
||||
) from None
|
||||
self.word_tokenize = word_tokenize
|
||||
self.vocab = nlp.vocab
|
||||
self.vocab = vocab
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
words = list(self.word_tokenize(text))
|
||||
|
|
|
@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM
|
|||
|
||||
_num_words = [
|
||||
"ዜሮ",
|
||||
"ሐደ",
|
||||
"ሓደ",
|
||||
"ክልተ",
|
||||
"ሰለስተ",
|
||||
"ኣርባዕተ",
|
||||
|
@ -11,66 +11,37 @@ _num_words = [
|
|||
"ሸውዓተ",
|
||||
"ሽሞንተ",
|
||||
"ትሽዓተ",
|
||||
"ኣሰርተ",
|
||||
"ኣሰርተ ሐደ",
|
||||
"ኣሰርተ ክልተ",
|
||||
"ኣሰርተ ሰለስተ",
|
||||
"ኣሰርተ ኣርባዕተ",
|
||||
"ኣሰርተ ሓሙሽተ",
|
||||
"ኣሰርተ ሽድሽተ",
|
||||
"ኣሰርተ ሸውዓተ",
|
||||
"ኣሰርተ ሽሞንተ",
|
||||
"ኣሰርተ ትሽዓተ",
|
||||
"ዓሰርተ",
|
||||
"ዕስራ",
|
||||
"ሰላሳ",
|
||||
"ኣርብዓ",
|
||||
"ሃምሳ",
|
||||
"ስልሳ",
|
||||
"ሓምሳ",
|
||||
"ሱሳ",
|
||||
"ሰብዓ",
|
||||
"ሰማንያ",
|
||||
"ተስዓ",
|
||||
"ቴስዓ",
|
||||
"ሚእቲ",
|
||||
"ሺሕ",
|
||||
"ሚልዮን",
|
||||
"ቢልዮን",
|
||||
"ትሪልዮን",
|
||||
"ኳድሪልዮን",
|
||||
"ገጅልዮን",
|
||||
"ባዝልዮን",
|
||||
"ጋዚልዮን",
|
||||
"ባዚልዮን"
|
||||
]
|
||||
|
||||
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
|
||||
_ordinal_words = [
|
||||
"ቀዳማይ",
|
||||
"ካልኣይ",
|
||||
"ሳልሳይ",
|
||||
"ራብኣይ",
|
||||
"ራብዓይ",
|
||||
"ሓምሻይ",
|
||||
"ሻድሻይ",
|
||||
"ሻውዓይ",
|
||||
"ሻምናይ",
|
||||
"ዘጠነኛ",
|
||||
"አስረኛ",
|
||||
"ኣሰርተ አንደኛ",
|
||||
"ኣሰርተ ሁለተኛ",
|
||||
"ኣሰርተ ሶስተኛ",
|
||||
"ኣሰርተ አራተኛ",
|
||||
"ኣሰርተ አምስተኛ",
|
||||
"ኣሰርተ ስድስተኛ",
|
||||
"ኣሰርተ ሰባተኛ",
|
||||
"ኣሰርተ ስምንተኛ",
|
||||
"ኣሰርተ ዘጠነኛ",
|
||||
"ሃያኛ",
|
||||
"ሰላሳኛ" "አርባኛ",
|
||||
"አምሳኛ",
|
||||
"ስድሳኛ",
|
||||
"ሰባኛ",
|
||||
"ሰማንያኛ",
|
||||
"ዘጠናኛ",
|
||||
"መቶኛ",
|
||||
"ሺኛ",
|
||||
"ሚሊዮንኛ",
|
||||
"ቢሊዮንኛ",
|
||||
"ትሪሊዮንኛ",
|
||||
"ታሽዓይ",
|
||||
"ዓስራይ"
|
||||
]
|
||||
|
||||
|
||||
|
@ -92,7 +63,7 @@ def like_num(text):
|
|||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
if text_lower.endswith("ኛ"):
|
||||
if text_lower.endswith("ይ"):
|
||||
if text_lower[:-2].isdigit():
|
||||
return True
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
|
||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
|
||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||
|
||||
_suffixes = (
|
||||
_list_punct
|
||||
|
|
|
@ -1,6 +1,27 @@
|
|||
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
|
||||
|
||||
# Stop words
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም
|
||||
'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን
|
||||
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
|
||||
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
|
||||
ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ
|
||||
ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም
|
||||
ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን
|
||||
ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ
|
||||
ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ
|
||||
ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን
|
||||
እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ
|
||||
ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም
|
||||
እየ እየን እዩ እያ እዮም
|
||||
ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ
|
||||
ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል
|
||||
ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ
|
||||
ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ
|
||||
ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ
|
||||
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
|
||||
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
|
||||
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -250,3 +250,9 @@ o.0
|
|||
|
||||
for orth in emoticons:
|
||||
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
# Moved from a suffix setting due to #9155 removing prefixes from consideration
|
||||
# for lookbehinds
|
||||
for u in "cfkCFK":
|
||||
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
|
@ -23,13 +23,25 @@ class Ukrainian(Language):
|
|||
@Ukrainian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pymorphy2",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return UkrainianLemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Ukrainian"]
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, Callable
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from ..ru.lemmatizer import RussianLemmatizer
|
||||
from ...pipeline.lemmatizer import lemmatizer_score
|
||||
from ...vocab import Vocab
|
||||
|
||||
|
||||
|
@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
|||
*,
|
||||
mode: str = "pymorphy2",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
if mode == "pymorphy2":
|
||||
try:
|
||||
|
@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
|||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer(lang="uk")
|
||||
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
|
||||
|
|
|
@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
|
|||
from ...language import Language, BaseDefaults
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
from ... import util
|
||||
|
||||
|
||||
|
@ -24,14 +25,14 @@ use_pyvi = true
|
|||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||
def vietnamese_tokenizer_factory(nlp):
|
||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
||||
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
|
||||
|
||||
return vietnamese_tokenizer_factory
|
||||
|
||||
|
||||
class VietnameseTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language, use_pyvi: bool = False):
|
||||
self.vocab = nlp.vocab
|
||||
def __init__(self, vocab: Vocab, use_pyvi: bool = False):
|
||||
self.vocab = vocab
|
||||
self.use_pyvi = use_pyvi
|
||||
if self.use_pyvi:
|
||||
try:
|
||||
|
@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
|
|||
)
|
||||
raise ImportError(msg) from None
|
||||
|
||||
def __reduce__(self):
|
||||
return VietnameseTokenizer, (self.vocab, self.use_pyvi)
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
if self.use_pyvi:
|
||||
words = self.pyvi_tokenize(text)
|
||||
|
|
18
spacy/lang/vi/examples.py
Normal file
18
spacy/lang/vi/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
>>> from spacy.lang.vi.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Đây là đâu, tôi là ai?",
|
||||
"Căn phòng có nhiều cửa sổ nên nó khá sáng",
|
||||
"Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.",
|
||||
"Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.",
|
||||
"Ông bạn đang ở đâu thế?",
|
||||
"Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?",
|
||||
"Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?",
|
||||
"Làm việc nhiều chán quá, đi chơi đâu đi?",
|
||||
]
|
|
@ -9,11 +9,14 @@ _num_words = [
|
|||
"bốn",
|
||||
"năm",
|
||||
"sáu",
|
||||
"bảy",
|
||||
"bẩy",
|
||||
"tám",
|
||||
"chín",
|
||||
"mười",
|
||||
"chục",
|
||||
"trăm",
|
||||
"nghìn",
|
||||
"tỷ",
|
||||
]
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ from ...scorer import Scorer
|
|||
from ...tokens import Doc
|
||||
from ...training import validate_examples, Example
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ... import util
|
||||
|
@ -48,14 +49,14 @@ class Segmenter(str, Enum):
|
|||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
|
||||
def chinese_tokenizer_factory(nlp):
|
||||
return ChineseTokenizer(nlp, segmenter=segmenter)
|
||||
return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
|
||||
|
||||
return chinese_tokenizer_factory
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
|
||||
self.vocab = nlp.vocab
|
||||
def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
|
||||
self.vocab = vocab
|
||||
self.segmenter = (
|
||||
segmenter.value if isinstance(segmenter, Segmenter) else segmenter
|
||||
)
|
||||
|
|
|
@ -115,7 +115,7 @@ class Language:
|
|||
|
||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||
object and processing pipeline.
|
||||
lang (str): Two-letter language ID, i.e. ISO code.
|
||||
lang (str): IETF language code, such as 'en'.
|
||||
|
||||
DOCS: https://spacy.io/api/language
|
||||
"""
|
||||
|
@ -228,6 +228,7 @@ class Language:
|
|||
"vectors": len(self.vocab.vectors),
|
||||
"keys": self.vocab.vectors.n_keys,
|
||||
"name": self.vocab.vectors.name,
|
||||
"mode": self.vocab.vectors.mode,
|
||||
}
|
||||
self._meta["labels"] = dict(self.pipe_labels)
|
||||
# TODO: Adding this back to prevent breaking people's code etc., but
|
||||
|
@ -978,7 +979,7 @@ class Language:
|
|||
|
||||
def __call__(
|
||||
self,
|
||||
text: str,
|
||||
text: Union[str, Doc],
|
||||
*,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
|
@ -987,7 +988,9 @@ class Language:
|
|||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
text (str): The text to be processed.
|
||||
text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
|
||||
the doc will be passed directly to the pipeline, skipping
|
||||
`Language.make_doc`.
|
||||
disable (List[str]): Names of the pipeline components to disable.
|
||||
component_cfg (Dict[str, dict]): An optional dictionary with extra
|
||||
keyword arguments for specific components.
|
||||
|
@ -995,7 +998,7 @@ class Language:
|
|||
|
||||
DOCS: https://spacy.io/api/language#call
|
||||
"""
|
||||
doc = self.make_doc(text)
|
||||
doc = self._ensure_doc(text)
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
for name, proc in self.pipeline:
|
||||
|
@ -1080,6 +1083,20 @@ class Language:
|
|||
)
|
||||
return self.tokenizer(text)
|
||||
|
||||
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
|
||||
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
|
||||
if isinstance(doc_like, Doc):
|
||||
return doc_like
|
||||
if isinstance(doc_like, str):
|
||||
return self.make_doc(doc_like)
|
||||
raise ValueError(Errors.E866.format(type=type(doc_like)))
|
||||
|
||||
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
|
||||
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
|
||||
doc = self._ensure_doc(doc_like)
|
||||
doc._context = context
|
||||
return doc
|
||||
|
||||
def update(
|
||||
self,
|
||||
examples: Iterable[Example],
|
||||
|
@ -1450,7 +1467,7 @@ class Language:
|
|||
@overload
|
||||
def pipe(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
texts: Iterable[Union[str, Doc]],
|
||||
*,
|
||||
as_tuples: Literal[False] = ...,
|
||||
batch_size: Optional[int] = ...,
|
||||
|
@ -1463,7 +1480,7 @@ class Language:
|
|||
@overload
|
||||
def pipe( # noqa: F811
|
||||
self,
|
||||
texts: Iterable[Tuple[str, _AnyContext]],
|
||||
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
|
||||
*,
|
||||
as_tuples: Literal[True] = ...,
|
||||
batch_size: Optional[int] = ...,
|
||||
|
@ -1475,7 +1492,9 @@ class Language:
|
|||
|
||||
def pipe( # noqa: F811
|
||||
self,
|
||||
texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]],
|
||||
texts: Union[
|
||||
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
|
||||
],
|
||||
*,
|
||||
as_tuples: bool = False,
|
||||
batch_size: Optional[int] = None,
|
||||
|
@ -1485,7 +1504,8 @@ class Language:
|
|||
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
|
||||
"""Process texts as a stream, and yield `Doc` objects in order.
|
||||
|
||||
texts (Iterable[str]): A sequence of texts to process.
|
||||
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
|
||||
process.
|
||||
as_tuples (bool): If set to True, inputs should be a sequence of
|
||||
(text, context) tuples. Output will then be a sequence of
|
||||
(doc, context) tuples. Defaults to False.
|
||||
|
@ -1500,23 +1520,24 @@ class Language:
|
|||
"""
|
||||
# Handle texts with context as tuples
|
||||
if as_tuples:
|
||||
texts = cast(Iterable[Tuple[str, _AnyContext]], texts)
|
||||
text_context1, text_context2 = itertools.tee(texts)
|
||||
texts = (tc[0] for tc in text_context1)
|
||||
contexts = (tc[1] for tc in text_context2)
|
||||
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
|
||||
docs_with_contexts = (
|
||||
self._ensure_doc_with_context(text, context) for text, context in texts
|
||||
)
|
||||
docs = self.pipe(
|
||||
texts,
|
||||
docs_with_contexts,
|
||||
batch_size=batch_size,
|
||||
disable=disable,
|
||||
n_process=n_process,
|
||||
component_cfg=component_cfg,
|
||||
)
|
||||
for doc, context in zip(docs, contexts):
|
||||
for doc in docs:
|
||||
context = doc._context
|
||||
doc._context = None
|
||||
yield (doc, context)
|
||||
return
|
||||
|
||||
# At this point, we know that we're dealing with an iterable of plain texts
|
||||
texts = cast(Iterable[str], texts)
|
||||
texts = cast(Iterable[Union[str, Doc]], texts)
|
||||
|
||||
# Set argument defaults
|
||||
if n_process == -1:
|
||||
|
@ -1551,7 +1572,7 @@ class Language:
|
|||
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
|
||||
else:
|
||||
# if n_process == 1, no processes are forked.
|
||||
docs = (self.make_doc(text) for text in texts)
|
||||
docs = (self._ensure_doc(text) for text in texts)
|
||||
for pipe in pipes:
|
||||
docs = pipe(docs)
|
||||
for doc in docs:
|
||||
|
@ -1570,7 +1591,7 @@ class Language:
|
|||
|
||||
def _multiprocessing_pipe(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
texts: Iterable[Union[str, Doc]],
|
||||
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
||||
n_process: int,
|
||||
batch_size: int,
|
||||
|
@ -1596,7 +1617,7 @@ class Language:
|
|||
procs = [
|
||||
mp.Process(
|
||||
target=_apply_pipes,
|
||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
|
||||
args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
|
||||
)
|
||||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||||
]
|
||||
|
@ -1609,11 +1630,12 @@ class Language:
|
|||
recv.recv() for recv in cycle(bytedocs_recv_ch)
|
||||
)
|
||||
try:
|
||||
for i, (_, (byte_doc, byte_error)) in enumerate(
|
||||
for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
|
||||
zip(raw_texts, byte_tuples), 1
|
||||
):
|
||||
if byte_doc is not None:
|
||||
doc = Doc(self.vocab).from_bytes(byte_doc)
|
||||
doc._context = byte_context
|
||||
yield doc
|
||||
elif byte_error is not None:
|
||||
error = srsly.msgpack_loads(byte_error)
|
||||
|
@ -2138,7 +2160,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]:
|
|||
|
||||
|
||||
def _apply_pipes(
|
||||
make_doc: Callable[[str], Doc],
|
||||
ensure_doc: Callable[[Union[str, Doc]], Doc],
|
||||
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
||||
receiver,
|
||||
sender,
|
||||
|
@ -2146,7 +2168,8 @@ def _apply_pipes(
|
|||
) -> None:
|
||||
"""Worker for Language.pipe
|
||||
|
||||
make_doc (Callable[[str,] Doc]): Function to create Doc from text.
|
||||
ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
|
||||
or raise an error if the input is neither a Doc nor a string.
|
||||
pipes (Iterable[Pipe]): The components to apply.
|
||||
receiver (multiprocessing.Connection): Pipe to receive text. Usually
|
||||
created by `multiprocessing.Pipe()`
|
||||
|
@ -2159,16 +2182,16 @@ def _apply_pipes(
|
|||
while True:
|
||||
try:
|
||||
texts = receiver.get()
|
||||
docs = (make_doc(text) for text in texts)
|
||||
docs = (ensure_doc(text) for text in texts)
|
||||
for pipe in pipes:
|
||||
docs = pipe(docs) # type: ignore[arg-type, assignment]
|
||||
# Connection does not accept unpicklable objects, so send list.
|
||||
byte_docs = [(doc.to_bytes(), None) for doc in docs]
|
||||
padding = [(None, None)] * (len(texts) - len(byte_docs))
|
||||
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
|
||||
padding = [(None, None, None)] * (len(texts) - len(byte_docs))
|
||||
sender.send(byte_docs + padding) # type: ignore[operator]
|
||||
except Exception:
|
||||
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||||
padding = [(None, None)] * (len(texts) - 1)
|
||||
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||||
padding = [(None, None, None)] * (len(texts) - 1)
|
||||
sender.send(error_msg + padding)
|
||||
|
||||
|
||||
|
|
|
@ -284,7 +284,7 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lower]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
def __set__(self, str x):
|
||||
self.c.lower = self.vocab.strings.add(x)
|
||||
|
||||
property norm_:
|
||||
|
@ -294,7 +294,7 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.norm]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
def __set__(self, str x):
|
||||
self.norm = self.vocab.strings.add(x)
|
||||
|
||||
property shape_:
|
||||
|
@ -304,7 +304,7 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.shape]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
def __set__(self, str x):
|
||||
self.c.shape = self.vocab.strings.add(x)
|
||||
|
||||
property prefix_:
|
||||
|
@ -314,7 +314,7 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.prefix]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
def __set__(self, str x):
|
||||
self.c.prefix = self.vocab.strings.add(x)
|
||||
|
||||
property suffix_:
|
||||
|
@ -324,7 +324,7 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.suffix]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
def __set__(self, str x):
|
||||
self.c.suffix = self.vocab.strings.add(x)
|
||||
|
||||
property lang_:
|
||||
|
@ -332,7 +332,7 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lang]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
def __set__(self, str x):
|
||||
self.c.lang = self.vocab.strings.add(x)
|
||||
|
||||
property flags:
|
||||
|
|
|
@ -148,9 +148,9 @@ cdef class DependencyMatcher:
|
|||
Creates a token key to be used by the matcher
|
||||
"""
|
||||
return self._normalize_key(
|
||||
unicode(key) + DELIMITER +
|
||||
unicode(pattern_idx) + DELIMITER +
|
||||
unicode(token_idx)
|
||||
str(key) + DELIMITER +
|
||||
str(pattern_idx) + DELIMITER +
|
||||
str(token_idx)
|
||||
)
|
||||
|
||||
def add(self, key, patterns, *, on_match=None):
|
||||
|
@ -424,7 +424,7 @@ cdef class DependencyMatcher:
|
|||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _normalize_key(self, key):
|
||||
if isinstance(key, basestring):
|
||||
if isinstance(key, str):
|
||||
return self.vocab.strings.add(key)
|
||||
else:
|
||||
return key
|
||||
|
|
|
@ -312,7 +312,7 @@ cdef class Matcher:
|
|||
return final_results
|
||||
|
||||
def _normalize_key(self, key):
|
||||
if isinstance(key, basestring):
|
||||
if isinstance(key, str):
|
||||
return self.vocab.strings.add(key)
|
||||
else:
|
||||
return key
|
||||
|
@ -360,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
for i, token in enumerate(doclike):
|
||||
for name, index in extensions.items():
|
||||
value = token._.get(name)
|
||||
if isinstance(value, basestring):
|
||||
if isinstance(value, str):
|
||||
value = token.vocab.strings[value]
|
||||
extra_attr_values[i * nr_extra_attr + index] = value
|
||||
# Main loop
|
||||
|
@ -786,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
|||
def _get_attr_values(spec, string_store):
|
||||
attr_values = []
|
||||
for attr, value in spec.items():
|
||||
if isinstance(attr, basestring):
|
||||
if isinstance(attr, str):
|
||||
attr = attr.upper()
|
||||
if attr == '_':
|
||||
continue
|
||||
|
@ -797,7 +797,7 @@ def _get_attr_values(spec, string_store):
|
|||
if attr == "IS_SENT_START":
|
||||
attr = "SENT_START"
|
||||
attr = IDS.get(attr)
|
||||
if isinstance(value, basestring):
|
||||
if isinstance(value, str):
|
||||
value = string_store.add(value)
|
||||
elif isinstance(value, bool):
|
||||
value = int(value)
|
||||
|
@ -938,7 +938,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
|||
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
|
||||
output = []
|
||||
for attr, value in spec.items():
|
||||
if isinstance(attr, basestring):
|
||||
if isinstance(attr, str):
|
||||
if attr == "_":
|
||||
output.extend(
|
||||
_get_extension_extra_predicates(
|
||||
|
@ -995,7 +995,7 @@ def _get_operators(spec):
|
|||
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
|
||||
# Fix casing
|
||||
spec = {key.upper(): values for key, values in spec.items()
|
||||
if isinstance(key, basestring)}
|
||||
if isinstance(key, str)}
|
||||
if "OP" not in spec:
|
||||
return (ONE,)
|
||||
elif spec["OP"] in lookup:
|
||||
|
@ -1013,7 +1013,7 @@ def _get_extensions(spec, string_store, name2index):
|
|||
if isinstance(value, dict):
|
||||
# Handle predicates (e.g. "IN") in the extra_predicates, not here.
|
||||
continue
|
||||
if isinstance(value, basestring):
|
||||
if isinstance(value, str):
|
||||
value = string_store.add(value)
|
||||
if name not in name2index:
|
||||
name2index[name] = len(name2index)
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
from typing import List, Tuple, Callable, Optional, cast
|
||||
from typing import List, Tuple, Callable, Optional, Sequence, cast
|
||||
from thinc.initializers import glorot_uniform_init
|
||||
from thinc.util import partial
|
||||
from thinc.types import Ragged, Floats2d, Floats1d
|
||||
from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
|
||||
from thinc.api import Model, Ops, registry
|
||||
|
||||
from ..tokens import Doc
|
||||
from ..errors import Errors
|
||||
from ..vectors import Mode
|
||||
from ..vocab import Vocab
|
||||
|
||||
|
||||
@registry.layers("spacy.StaticVectors.v2")
|
||||
|
@ -34,20 +36,32 @@ def StaticVectors(
|
|||
def forward(
|
||||
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
|
||||
) -> Tuple[Ragged, Callable]:
|
||||
if not sum(len(doc) for doc in docs):
|
||||
token_count = sum(len(doc) for doc in docs)
|
||||
if not token_count:
|
||||
return _handle_empty(model.ops, model.get_dim("nO"))
|
||||
key_attr = model.attrs["key_attr"]
|
||||
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
||||
V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data))
|
||||
rows = model.ops.flatten(
|
||||
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
|
||||
key_attr: int = model.attrs["key_attr"]
|
||||
keys: Ints1d = model.ops.flatten(
|
||||
cast(Sequence, [doc.to_array(key_attr) for doc in docs])
|
||||
)
|
||||
vocab: Vocab = docs[0].vocab
|
||||
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
||||
if vocab.vectors.mode == Mode.default:
|
||||
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
|
||||
rows = vocab.vectors.find(keys=keys)
|
||||
V = model.ops.as_contig(V[rows])
|
||||
elif vocab.vectors.mode == Mode.floret:
|
||||
V = cast(Floats2d, vocab.vectors.get_batch(keys))
|
||||
V = model.ops.as_contig(V)
|
||||
else:
|
||||
raise RuntimeError(Errors.E896)
|
||||
try:
|
||||
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
|
||||
vectors_data = model.ops.gemm(V, W, trans2=True)
|
||||
except ValueError:
|
||||
raise RuntimeError(Errors.E896)
|
||||
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
|
||||
vectors_data[rows < 0] = 0
|
||||
if vocab.vectors.mode == Mode.default:
|
||||
# Convert negative indices to 0-vectors
|
||||
# TODO: more options for UNK tokens
|
||||
vectors_data[rows < 0] = 0
|
||||
output = Ragged(
|
||||
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
|
||||
)
|
||||
|
@ -63,7 +77,7 @@ def forward(
|
|||
model.inc_grad(
|
||||
"W",
|
||||
model.ops.gemm(
|
||||
cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True
|
||||
cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
|
||||
),
|
||||
)
|
||||
return []
|
||||
|
|
|
@ -17,7 +17,7 @@ from ...errors import Errors
|
|||
from thinc.extra.search cimport Beam
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
|
||||
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
|
||||
|
||||
DEF NON_MONOTONIC = True
|
||||
|
||||
|
|
|
@ -5,15 +5,15 @@ from pathlib import Path
|
|||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..training import validate_examples, Example
|
||||
from ..training import Example
|
||||
from ..language import Language
|
||||
from ..matcher import Matcher
|
||||
from ..scorer import Scorer
|
||||
from ..symbols import IDS, TAG, POS, MORPH, LEMMA
|
||||
from ..symbols import IDS
|
||||
from ..tokens import Doc, Span
|
||||
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
||||
from ..vocab import Vocab
|
||||
from ..util import SimpleFrozenList
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
|
|||
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||
|
||||
|
||||
@Language.factory("attribute_ruler", default_config={"validate": False})
|
||||
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
|
||||
return AttributeRuler(nlp.vocab, name, validate=validate)
|
||||
@Language.factory(
|
||||
"attribute_ruler",
|
||||
default_config={
|
||||
"validate": False,
|
||||
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
|
||||
},
|
||||
)
|
||||
def make_attribute_ruler(
|
||||
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
|
||||
):
|
||||
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
|
||||
|
||||
|
||||
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
results.update(
|
||||
Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
|
||||
)
|
||||
results.update(
|
||||
Scorer.score_token_attr_per_feat(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return results
|
||||
|
||||
|
||||
@registry.scorers("spacy.attribute_ruler_scorer.v1")
|
||||
def make_attribute_ruler_scorer():
|
||||
return attribute_ruler_score
|
||||
|
||||
|
||||
class AttributeRuler(Pipe):
|
||||
|
@ -36,7 +68,12 @@ class AttributeRuler(Pipe):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
|
||||
self,
|
||||
vocab: Vocab,
|
||||
name: str = "attribute_ruler",
|
||||
*,
|
||||
validate: bool = False,
|
||||
scorer: Optional[Callable] = attribute_ruler_score,
|
||||
) -> None:
|
||||
"""Create the AttributeRuler. After creation, you can add patterns
|
||||
with the `.initialize()` or `.add_patterns()` methods, or load patterns
|
||||
|
@ -45,6 +82,10 @@ class AttributeRuler(Pipe):
|
|||
|
||||
vocab (Vocab): The vocab.
|
||||
name (str): The pipe name. Defaults to "attribute_ruler".
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
|
||||
"lemma" and Scorer.score_token_attr_per_feat for the attribute
|
||||
"morph".
|
||||
|
||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||
|
||||
|
@ -57,6 +98,7 @@ class AttributeRuler(Pipe):
|
|||
self.attrs: List[Dict] = []
|
||||
self._attrs_unnormed: List[Dict] = [] # store for reference
|
||||
self.indices: List[int] = []
|
||||
self.scorer = scorer
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Reset all patterns."""
|
||||
|
@ -228,45 +270,6 @@ class AttributeRuler(Pipe):
|
|||
all_patterns.append(p)
|
||||
return all_patterns # type: ignore[return-value]
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by
|
||||
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
||||
and "lemma" for the target token attributes.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#score
|
||||
"""
|
||||
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
validate_examples(examples, "AttributeRuler.score")
|
||||
results = {}
|
||||
attrs = set() # type: ignore
|
||||
for token_attrs in self.attrs:
|
||||
attrs.update(token_attrs)
|
||||
for attr in attrs:
|
||||
if attr == TAG:
|
||||
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
||||
elif attr == POS:
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
elif attr == MORPH:
|
||||
results.update(
|
||||
Scorer.score_token_attr(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
results.update(
|
||||
Scorer.score_token_attr_per_feat(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
elif attr == LEMMA:
|
||||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return results
|
||||
|
||||
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the AttributeRuler to a bytestring.
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Iterable
|
||||
from typing import Optional, Iterable, Callable
|
||||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
|
@ -12,7 +12,7 @@ from ..language import Language
|
|||
from ._parser_internals import nonproj
|
||||
from ._parser_internals.nonproj import DELIMITER
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples
|
||||
from ..util import registry
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"model": DEFAULT_PARSER_MODEL,
|
||||
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"dep_uas": 0.5,
|
||||
|
@ -63,7 +64,8 @@ def make_parser(
|
|||
moves: Optional[TransitionSystem],
|
||||
update_with_oracle_cut_size: int,
|
||||
learn_tokens: bool,
|
||||
min_action_freq: int
|
||||
min_action_freq: int,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based DependencyParser component. The dependency parser
|
||||
jointly learns sentence segmentation and labelled dependency parsing, and can
|
||||
|
@ -100,6 +102,7 @@ def make_parser(
|
|||
primarily affects the label accuracy, it can also affect the attachment
|
||||
structure, as the labels are used to represent the pseudo-projectivity
|
||||
transformation.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return DependencyParser(
|
||||
nlp.vocab,
|
||||
|
@ -115,7 +118,8 @@ def make_parser(
|
|||
beam_update_prob=0.0,
|
||||
# At some point in the future we can try to implement support for
|
||||
# partial annotations, perhaps only in the beam objective.
|
||||
incorrect_spans_key=None
|
||||
incorrect_spans_key=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
@Language.factory(
|
||||
|
@ -130,6 +134,7 @@ def make_parser(
|
|||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"model": DEFAULT_PARSER_MODEL,
|
||||
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"dep_uas": 0.5,
|
||||
|
@ -151,6 +156,7 @@ def make_beam_parser(
|
|||
beam_width: int,
|
||||
beam_density: float,
|
||||
beam_update_prob: float,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based DependencyParser component that uses beam-search.
|
||||
The dependency parser jointly learns sentence segmentation and labelled
|
||||
|
@ -207,10 +213,41 @@ def make_beam_parser(
|
|||
min_action_freq=min_action_freq,
|
||||
# At some point in the future we can try to implement support for
|
||||
# partial annotations, perhaps only in the beam objective.
|
||||
incorrect_spans_key=None
|
||||
incorrect_spans_key=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def parser_score(examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||
and Scorer.score_deps.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
def dep_getter(token, attr):
|
||||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
|
||||
@registry.scorers("spacy.parser_scorer.v1")
|
||||
def make_parser_scorer():
|
||||
return parser_score
|
||||
|
||||
|
||||
cdef class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
|
@ -233,6 +270,7 @@ cdef class DependencyParser(Parser):
|
|||
beam_update_prob=0.0,
|
||||
multitasks=tuple(),
|
||||
incorrect_spans_key=None,
|
||||
scorer=parser_score,
|
||||
):
|
||||
"""Create a DependencyParser.
|
||||
"""
|
||||
|
@ -249,6 +287,7 @@ cdef class DependencyParser(Parser):
|
|||
beam_update_prob=beam_update_prob,
|
||||
multitasks=multitasks,
|
||||
incorrect_spans_key=incorrect_spans_key,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
@property
|
||||
|
@ -281,31 +320,6 @@ cdef class DependencyParser(Parser):
|
|||
labels.add(label)
|
||||
return tuple(sorted(labels))
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||
and Scorer.score_deps.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "DependencyParser.score")
|
||||
def dep_getter(token, attr):
|
||||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
def scored_parses(self, beams):
|
||||
"""Return two dictionaries with scores for each beam/doc that was processed:
|
||||
one containing (i, head) keys, and another containing (i, label) keys.
|
||||
|
|
|
@ -17,10 +17,12 @@ from ..language import Language
|
|||
from ..vocab import Vocab
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -51,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"incl_context": True,
|
||||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@ -69,6 +73,8 @@ def make_entity_linker(
|
|||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@ -82,6 +88,7 @@ def make_entity_linker(
|
|||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return EntityLinker(
|
||||
nlp.vocab,
|
||||
|
@ -93,9 +100,20 @@ def make_entity_linker(
|
|||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def entity_linker_score(examples, **kwargs):
    """Score the entity links of a batch of examples.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to Scorer.score_links.
    RETURNS (Dict[str, Any]): The scores produced by Scorer.score_links, with
        EntityLinker.NIL passed as the only negative label.
    """
    # the NIL identifier is the only label treated as a negative
    negative_labels = [EntityLinker.NIL]
    return Scorer.score_links(examples, negative_labels=negative_labels, **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer():
    """Return the scoring function registered for the entity linker.

    The function is registered in the scorers registry under the name
    "spacy.entity_linker_scorer.v1", so a config can refer to it through
    the "@scorers" key.

    RETURNS (Callable): The entity_linker_score function.
    """
    return entity_linker_score
|
||||
|
||||
|
||||
class EntityLinker(TrainablePipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
|
@ -116,6 +134,8 @@ class EntityLinker(TrainablePipe):
|
|||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@ -130,6 +150,8 @@ class EntityLinker(TrainablePipe):
|
|||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_links.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
|
@ -141,11 +163,12 @@ class EntityLinker(TrainablePipe):
|
|||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
self.get_candidates = get_candidates
|
||||
self.cfg: Dict[str, Any] = {}
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
|
@ -384,23 +407,14 @@ class EntityLinker(TrainablePipe):
|
|||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
for token in ent:
|
||||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def score(self, examples, **kwargs):
    """Validate a batch of examples and score their entity links.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Accepted for signature compatibility; not forwarded to
        Scorer.score_links.
    RETURNS (Dict[str, Any]): The scores.

    DOCS TODO: https://spacy.io/api/entity_linker#score
    """
    # validate the input before scoring
    validate_examples(examples, "EntityLinker.score")
    # this component's NIL identifier is the only negative label
    negative_labels = [self.NIL]
    return Scorer.score_links(examples, negative_labels=negative_labels)
|
||||
if token.ent_kb_id == 0 or overwrite:
|
||||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
|
|
@ -9,11 +9,10 @@ from .pipe import Pipe
|
|||
from ..training import Example
|
||||
from ..language import Language
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
|
||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
||||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..scorer import get_ner_prf
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
DEFAULT_ENT_ID_SEP = "||"
|
||||
|
@ -28,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
|||
"validate": False,
|
||||
"overwrite_ents": False,
|
||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
|
@ -43,6 +43,7 @@ def make_entity_ruler(
|
|||
validate: bool,
|
||||
overwrite_ents: bool,
|
||||
ent_id_sep: str,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return EntityRuler(
|
||||
nlp,
|
||||
|
@ -51,9 +52,19 @@ def make_entity_ruler(
|
|||
validate=validate,
|
||||
overwrite_ents=overwrite_ents,
|
||||
ent_id_sep=ent_id_sep,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def entity_ruler_score(examples, **kwargs):
    """Compute NER scores for a batch of examples via get_ner_prf.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Accepted so the function can be called like the other scorer
        functions; the values are not passed on to get_ner_prf.
    RETURNS: Whatever get_ner_prf returns for the given examples.
    """
    ner_scores = get_ner_prf(examples)
    return ner_scores
|
||||
|
||||
|
||||
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
    """Return the scoring function registered for the entity ruler.

    The function is registered in the scorers registry under the name
    "spacy.entity_ruler_scorer.v1", so a config can refer to it through
    the "@scorers" key.

    RETURNS (Callable): The entity_ruler_score function.
    """
    return entity_ruler_score
|
||||
|
||||
|
||||
class EntityRuler(Pipe):
|
||||
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
|
||||
rules or exact phrase matches. It can be combined with the statistical
|
||||
|
@ -75,6 +86,7 @@ class EntityRuler(Pipe):
|
|||
overwrite_ents: bool = False,
|
||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||
patterns: Optional[List[PatternType]] = None,
|
||||
scorer: Optional[Callable] = entity_ruler_score,
|
||||
) -> None:
|
||||
"""Initialize the entity ruler. If patterns are supplied here, they
|
||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||
|
@ -95,6 +107,8 @@ class EntityRuler(Pipe):
|
|||
overwrite_ents (bool): If existing entities are present, e.g. entities
|
||||
added by the model, overwrite them by matches if necessary.
|
||||
ent_id_sep (str): Separator used internally for entity IDs.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
spacy.scorer.get_ner_prf.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#init
|
||||
"""
|
||||
|
@ -113,6 +127,7 @@ class EntityRuler(Pipe):
|
|||
self._ent_ids = defaultdict(tuple) # type: ignore
|
||||
if patterns is not None:
|
||||
self.add_patterns(patterns)
|
||||
self.scorer = scorer
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""The number of all patterns added to the entity ruler."""
|
||||
|
@ -363,10 +378,6 @@ class EntityRuler(Pipe):
|
|||
label = f"{label}{self.ent_id_sep}{ent_id}"
|
||||
return label
|
||||
|
||||
def score(self, examples, **kwargs):
    """Validate a batch of examples and compute NER scores for them.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Accepted for signature compatibility; not forwarded to
        get_ner_prf.
    RETURNS: Whatever get_ner_prf returns for the given examples.
    """
    # validate the input before scoring
    validate_examples(examples, "EntityRuler.score")
    ner_scores = get_ner_prf(examples)
    return ner_scores
|
||||
|
||||
def from_bytes(
|
||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityRuler":
|
||||
|
|
|
@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
|
|||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Token
|
||||
from ..vocab import Vocab
|
||||
from ..training import validate_examples
|
||||
from ..util import logger, SimpleFrozenList
|
||||
from ..util import logger, SimpleFrozenList, registry
|
||||
from .. import util
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "lookup", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "lookup",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    """Score the lemmas of a batch of examples.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to
        Scorer.score_token_attr.
    RETURNS (Dict[str, Any]): The scores produced by Scorer.score_token_attr
        for the attribute "lemma".
    """
    scored_attr = "lemma"
    return Scorer.score_token_attr(examples, scored_attr, **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.lemmatizer_scorer.v1")
def make_lemmatizer_scorer():
    """Return the scoring function registered for the lemmatizer.

    The function is registered in the scorers registry under the name
    "spacy.lemmatizer_scorer.v1", so a config can refer to it through
    the "@scorers" key.

    RETURNS (Callable): The lemmatizer_score function.
    """
    return lemmatizer_score
|
||||
|
||||
|
||||
class Lemmatizer(Pipe):
|
||||
|
@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
|
|||
*,
|
||||
mode: str = "lookup",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
||||
|
@ -69,6 +90,8 @@ class Lemmatizer(Pipe):
|
|||
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
|
||||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||
`False`.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "lemma".
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#init
|
||||
"""
|
||||
|
@ -89,6 +112,7 @@ class Lemmatizer(Pipe):
|
|||
raise ValueError(Errors.E1003.format(mode=mode))
|
||||
self.lemmatize = getattr(self, mode_attr)
|
||||
self.cache = {} # type: ignore[var-annotated]
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def mode(self):
|
||||
|
@ -247,17 +271,6 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
return False
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    """Validate a batch of examples and score their lemmas.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to
        Scorer.score_token_attr.
    RETURNS (Dict[str, Any]): The scores.

    DOCS: https://spacy.io/api/lemmatizer#score
    """
    # validate the input before scoring
    validate_examples(examples, "Lemmatizer.score")
    scored_attr = "lemma"
    return Scorer.score_token_attr(examples, scored_attr, **kwargs)
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional, Union, Dict
|
||||
from typing import Optional, Union, Dict, Callable
|
||||
import srsly
|
||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||
from itertools import islice
|
||||
|
@ -17,7 +17,11 @@ from .tagger import Tagger
|
|||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
BACKWARD_EXTEND = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL},
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
nlp: Language,
|
||||
model: Model,
|
||||
name: str,
|
||||
overwrite: bool,
|
||||
extend: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Morphologizer(nlp.vocab, model, name)
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
|
||||
|
||||
|
||||
def morphologizer_score(examples, **kwargs):
    """Score the "pos" and "morph" attributes of a batch of examples.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to every Scorer call.
    RETURNS (Dict[str, Any]): The merged results of Scorer.score_token_attr
        for the attributes "pos" and "morph" and of
        Scorer.score_token_attr_per_feat for the attribute "morph".
    """
    def morph_key_getter(token, attr):
        # score the .key of the attribute value instead of the value itself
        value = getattr(token, attr)
        return value.key

    pos_scores = Scorer.score_token_attr(examples, "pos", **kwargs)
    morph_scores = Scorer.score_token_attr(
        examples, "morph", getter=morph_key_getter, **kwargs
    )
    feat_scores = Scorer.score_token_attr_per_feat(
        examples, "morph", getter=morph_key_getter, **kwargs
    )
    # merge in call order, so later results win on duplicate keys
    merged = {}
    for partial in (pos_scores, morph_scores, feat_scores):
        merged.update(partial)
    return merged
|
||||
|
||||
|
||||
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer():
    """Return the scoring function registered for the morphologizer.

    The function is registered in the scorers registry under the name
    "spacy.morphologizer_scorer.v1", so a config can refer to it through
    the "@scorers" key.

    RETURNS (Callable): The morphologizer_score function.
    """
    return morphologizer_score
|
||||
|
||||
|
||||
class Morphologizer(Tagger):
|
||||
|
@ -67,6 +91,10 @@ class Morphologizer(Tagger):
|
|||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "morphologizer",
|
||||
*,
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
extend: bool = BACKWARD_EXTEND,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
):
|
||||
"""Initialize a morphologizer.
|
||||
|
||||
|
@ -74,6 +102,9 @@ class Morphologizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#init
|
||||
"""
|
||||
|
@ -85,8 +116,14 @@ class Morphologizer(Tagger):
|
|||
# store mappings from morph+POS labels to token-level annotations:
|
||||
# 1) labels_morph stores a mapping from morph+POS->morph
|
||||
# 2) labels_pos stores a mapping from morph+POS->POS
|
||||
cfg = {"labels_morph": {}, "labels_pos": {}}
|
||||
cfg = {
|
||||
"labels_morph": {},
|
||||
"labels_pos": {},
|
||||
"overwrite": overwrite,
|
||||
"extend": extend,
|
||||
}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -192,14 +229,34 @@ class Morphologizer(Tagger):
|
|||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef Vocab vocab = self.vocab
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
cdef bint extend = self.cfg["extend"]
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
morph = self.labels[tag_id]
|
||||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
|
||||
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
|
||||
# set morph
|
||||
if doc.c[j].morph == 0 or overwrite or extend:
|
||||
if overwrite and extend:
|
||||
# morphologizer morph overwrites any existing features
|
||||
# while extending
|
||||
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
|
||||
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
|
||||
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
|
||||
elif extend:
|
||||
# existing features are preserved and any new features
|
||||
# are added
|
||||
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
|
||||
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
|
||||
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
|
||||
else:
|
||||
# clobber
|
||||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
|
||||
# set POS
|
||||
if doc.c[j].pos == 0 or overwrite:
|
||||
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
|
@ -246,24 +303,3 @@ class Morphologizer(Tagger):
|
|||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
|
||||
|
||||
def score(self, examples, **kwargs):
    """Validate a batch of examples and score "pos" and "morph".

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to every Scorer call.
    RETURNS (Dict[str, Any]): The merged results of Scorer.score_token_attr
        for the attributes "pos" and "morph" and of
        Scorer.score_token_attr_per_feat for the attribute "morph".

    DOCS: https://spacy.io/api/morphologizer#score
    """
    def morph_key_getter(token, attr):
        # score the .key of the attribute value instead of the value itself
        value = getattr(token, attr)
        return value.key

    # validate the input before any scoring happens
    validate_examples(examples, "Morphologizer.score")
    pos_scores = Scorer.score_token_attr(examples, "pos", **kwargs)
    morph_scores = Scorer.score_token_attr(
        examples, "morph", getter=morph_key_getter, **kwargs
    )
    feat_scores = Scorer.score_token_attr_per_feat(
        examples, "morph", getter=morph_key_getter, **kwargs
    )
    # merge in call order, so later results win on duplicate keys
    merged = {}
    for partial in (pos_scores, morph_scores, feat_scores):
        merged.update(partial)
    return merged
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Iterable
|
||||
from typing import Optional, Iterable, Callable
|
||||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
|
@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
|||
|
||||
from ..language import Language
|
||||
from ..scorer import get_ner_prf, PRFScore
|
||||
from ..training import validate_examples
|
||||
from ..util import registry
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"moves": None,
|
||||
"update_with_oracle_cut_size": 100,
|
||||
"model": DEFAULT_NER_MODEL,
|
||||
"incorrect_spans_key": None
|
||||
"incorrect_spans_key": None,
|
||||
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
|
||||
|
@ -52,7 +53,8 @@ def make_ner(
|
|||
model: Model,
|
||||
moves: Optional[TransitionSystem],
|
||||
update_with_oracle_cut_size: int,
|
||||
incorrect_spans_key: Optional[str]=None
|
||||
incorrect_spans_key: Optional[str],
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based EntityRecognizer component. The entity recognizer
|
||||
identifies non-overlapping labelled spans of tokens.
|
||||
|
@ -80,6 +82,7 @@ def make_ner(
|
|||
incorrect_spans_key (Optional[str]): Identifies spans that are known
|
||||
to be incorrect entity annotations. The incorrect entity annotations
|
||||
can be stored in the span group, under this key.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return EntityRecognizer(
|
||||
nlp.vocab,
|
||||
|
@ -92,6 +95,7 @@ def make_ner(
|
|||
beam_width=1,
|
||||
beam_density=0.0,
|
||||
beam_update_prob=0.0,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
@Language.factory(
|
||||
|
@ -104,7 +108,8 @@ def make_ner(
|
|||
"beam_density": 0.01,
|
||||
"beam_update_prob": 0.5,
|
||||
"beam_width": 32,
|
||||
"incorrect_spans_key": None
|
||||
"incorrect_spans_key": None,
|
||||
"scorer": None,
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
)
|
||||
|
@ -117,7 +122,8 @@ def make_beam_ner(
|
|||
beam_width: int,
|
||||
beam_density: float,
|
||||
beam_update_prob: float,
|
||||
incorrect_spans_key: Optional[str]=None
|
||||
incorrect_spans_key: Optional[str],
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based EntityRecognizer component that uses beam-search.
|
||||
The entity recognizer identifies non-overlapping labelled spans of tokens.
|
||||
|
@ -153,6 +159,7 @@ def make_beam_ner(
|
|||
and are faster to compute.
|
||||
incorrect_spans_key (Optional[str]): Optional key into span groups of
|
||||
entities known to be non-entities.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return EntityRecognizer(
|
||||
nlp.vocab,
|
||||
|
@ -164,10 +171,20 @@ def make_beam_ner(
|
|||
beam_width=beam_width,
|
||||
beam_density=beam_density,
|
||||
beam_update_prob=beam_update_prob,
|
||||
incorrect_spans_key=incorrect_spans_key
|
||||
incorrect_spans_key=incorrect_spans_key,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def ner_score(examples, **kwargs):
    """Compute NER scores for a batch of examples via get_ner_prf.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to get_ner_prf.
    RETURNS: Whatever get_ner_prf returns for the given examples.
    """
    ner_scores = get_ner_prf(examples, **kwargs)
    return ner_scores
|
||||
|
||||
|
||||
@registry.scorers("spacy.ner_scorer.v1")
def make_ner_scorer():
    """Return the scoring function registered for the entity recognizer.

    The function is registered in the scorers registry under the name
    "spacy.ner_scorer.v1", so a config can refer to it through the
    "@scorers" key.

    RETURNS (Callable): The ner_score function.
    """
    return ner_score
|
||||
|
||||
|
||||
cdef class EntityRecognizer(Parser):
|
||||
"""Pipeline component for named entity recognition.
|
||||
|
||||
|
@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser):
|
|||
beam_update_prob=0.0,
|
||||
multitasks=tuple(),
|
||||
incorrect_spans_key=None,
|
||||
scorer=ner_score,
|
||||
):
|
||||
"""Create an EntityRecognizer.
|
||||
"""
|
||||
|
@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser):
|
|||
beam_update_prob=beam_update_prob,
|
||||
multitasks=multitasks,
|
||||
incorrect_spans_key=incorrect_spans_key,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
def add_multitask_objective(self, mt_component):
|
||||
|
@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser):
|
|||
if move[0] in ("B", "I", "L", "U"))
|
||||
return tuple(sorted(labels))
|
||||
|
||||
def score(self, examples, **kwargs):
    """Validate a batch of examples and compute NER scores for them.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Accepted for signature compatibility; not forwarded to
        get_ner_prf.
    RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.

    DOCS: https://spacy.io/api/entityrecognizer#score
    """
    # validate the input before scoring
    validate_examples(examples, "EntityRecognizer.score")
    ner_scores = get_ner_prf(examples)
    return ner_scores
|
||||
|
||||
def scored_ents(self, beams):
|
||||
"""Return a dictionary of (start, end, label) tuples with corresponding scores
|
||||
for each beam/doc that was processed.
|
||||
|
|
|
@ -81,6 +81,17 @@ cdef class Pipe:
|
|||
|
||||
DOCS: https://spacy.io/api/pipe#score
|
||||
"""
|
||||
if hasattr(self, "scorer") and self.scorer is not None:
|
||||
scorer_kwargs = {}
|
||||
# use default settings from cfg (e.g., threshold)
|
||||
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
|
||||
scorer_kwargs.update(self.cfg)
|
||||
# override self.cfg["labels"] with self.labels
|
||||
if hasattr(self, "labels"):
|
||||
scorer_kwargs["labels"] = self.labels
|
||||
# override with kwargs settings
|
||||
scorer_kwargs.update(kwargs)
|
||||
return self.scorer(examples, **scorer_kwargs)
|
||||
return {}
|
||||
|
||||
@property
|
||||
|
|
|
@ -1,26 +1,32 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, Callable
|
||||
import srsly
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from .pipe import Pipe
|
||||
from .senter import senter_score
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples
|
||||
from .. import util
|
||||
|
||||
# see #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
@Language.factory(
|
||||
"sentencizer",
|
||||
assigns=["token.is_sent_start", "doc.sents"],
|
||||
default_config={"punct_chars": None},
|
||||
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_sentencizer(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
punct_chars: Optional[List[str]]
|
||||
punct_chars: Optional[List[str]],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Sentencizer(name, punct_chars=punct_chars)
|
||||
return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
|
||||
|
||||
|
||||
class Sentencizer(Pipe):
|
||||
|
@ -41,12 +47,20 @@ class Sentencizer(Pipe):
|
|||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||
'。', '。']
|
||||
|
||||
def __init__(self, name="sentencizer", *, punct_chars=None):
|
||||
def __init__(
|
||||
self,
|
||||
name="sentencizer",
|
||||
*,
|
||||
punct_chars=None,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=senter_score,
|
||||
):
|
||||
"""Initialize the sentencizer.
|
||||
|
||||
punct_chars (list): Punctuation characters to split on. Will be
|
||||
serialized with the nlp object.
|
||||
RETURNS (Sentencizer): The sentencizer component.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#init
|
||||
"""
|
||||
|
@ -55,6 +69,8 @@ class Sentencizer(Pipe):
|
|||
self.punct_chars = set(punct_chars)
|
||||
else:
|
||||
self.punct_chars = set(self.default_punct_chars)
|
||||
self.overwrite = overwrite
|
||||
self.scorer = scorer
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||
|
@ -115,29 +131,12 @@ class Sentencizer(Pipe):
|
|||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
# Don't clobber existing sentence boundaries
|
||||
if doc.c[j].sent_start == 0:
|
||||
if doc.c[j].sent_start == 0 or self.overwrite:
|
||||
if tag_id:
|
||||
doc.c[j].sent_start = 1
|
||||
else:
|
||||
doc.c[j].sent_start = -1
|
||||
|
||||
def score(self, examples, **kwargs):
    """Validate a batch of examples and score their sentence spans.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to Scorer.score_spans.
    RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans,
        without the "sents_per_type" entry.

    DOCS: https://spacy.io/api/sentencizer#score
    """
    def has_sents(doc):
        # passed to Scorer.score_spans as its has_annotation callback
        return doc.has_annotation("SENT_START")

    # validate the input before scoring
    validate_examples(examples, "Sentencizer.score")
    sent_scores = Scorer.score_spans(
        examples, "sents", has_annotation=has_sents, **kwargs
    )
    # the per-type entry is removed before returning
    del sent_scores["sents_per_type"]
    return sent_scores
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the sentencizer to a bytestring.
|
||||
|
||||
|
@ -145,7 +144,7 @@ class Sentencizer(Pipe):
|
|||
|
||||
DOCS: https://spacy.io/api/sentencizer#to_bytes
|
||||
"""
|
||||
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
|
||||
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
"""Load the sentencizer from a bytestring.
|
||||
|
@ -157,6 +156,7 @@ class Sentencizer(Pipe):
|
|||
"""
|
||||
cfg = srsly.msgpack_loads(bytes_data)
|
||||
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
||||
self.overwrite = cfg.get("overwrite", self.overwrite)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
|
||||
|
@ -166,7 +166,7 @@ class Sentencizer(Pipe):
|
|||
"""
|
||||
path = util.ensure_path(path)
|
||||
path = path.with_suffix(".json")
|
||||
srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
|
||||
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
|
||||
|
@ -178,4 +178,5 @@ class Sentencizer(Pipe):
|
|||
path = path.with_suffix(".json")
|
||||
cfg = srsly.read_json(path)
|
||||
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
||||
self.overwrite = cfg.get("overwrite", self.overwrite)
|
||||
return self
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Optional, Callable
|
||||
|
||||
import srsly
|
||||
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
||||
|
@ -11,8 +12,11 @@ from ..language import Language
|
|||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"senter",
|
||||
assigns=["token.is_sent_start"],
|
||||
default_config={"model": DEFAULT_SENTER_MODEL},
|
||||
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_senter(nlp: Language, name: str, model: Model):
|
||||
return SentenceRecognizer(nlp.vocab, model, name)
|
||||
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
|
||||
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
|
||||
|
||||
|
||||
def senter_score(examples, **kwargs):
    """Score the sentence spans of a batch of examples.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to Scorer.score_spans.
    RETURNS (Dict[str, Any]): The scores produced by Scorer.score_spans for
        the attribute "sents", without the "sents_per_type" entry.
    """
    def has_sents(doc):
        # passed to Scorer.score_spans as its has_annotation callback
        return doc.has_annotation("SENT_START")

    sent_scores = Scorer.score_spans(
        examples, "sents", has_annotation=has_sents, **kwargs
    )
    # the per-type entry is removed before returning
    del sent_scores["sents_per_type"]
    return sent_scores
|
||||
|
||||
|
||||
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
    """Return the scoring function registered for sentence segmentation.

    The function is registered in the scorers registry under the name
    "spacy.senter_scorer.v1", so a config can refer to it through the
    "@scorers" key.

    RETURNS (Callable): The senter_score function.
    """
    return senter_score
|
||||
|
||||
|
||||
class SentenceRecognizer(Tagger):
|
||||
|
@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger):
|
|||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer
|
||||
"""
|
||||
def __init__(self, vocab, model, name="senter"):
|
||||
def __init__(
|
||||
self,
|
||||
vocab,
|
||||
model,
|
||||
name="senter",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=senter_score,
|
||||
):
|
||||
"""Initialize a sentence recognizer.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#init
|
||||
"""
|
||||
|
@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
self.cfg = {}
|
||||
self.cfg = {"overwrite": overwrite}
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger):
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
# Don't clobber existing sentence boundaries
|
||||
if doc.c[j].sent_start == 0:
|
||||
if doc.c[j].sent_start == 0 or overwrite:
|
||||
if tag_id == 1:
|
||||
doc.c[j].sent_start = 1
|
||||
else:
|
||||
|
@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger):
|
|||
|
||||
def add_label(self, label, values=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def score(self, examples, **kwargs):
    """Validate a batch of examples and score their sentence spans.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Additional settings, forwarded unchanged to Scorer.score_spans.
    RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans,
        without the "sents_per_type" entry.
    DOCS: https://spacy.io/api/sentencerecognizer#score
    """
    def has_sents(doc):
        # passed to Scorer.score_spans as its has_annotation callback
        return doc.has_annotation("SENT_START")

    # validate the input before scoring
    validate_examples(examples, "SentenceRecognizer.score")
    sent_scores = Scorer.score_spans(
        examples, "sents", has_annotation=has_sents, **kwargs
    )
    # the per-type entry is removed before returning
    del sent_scores["sents_per_type"]
    return sent_scores
|
||||
|
|
|
@ -104,6 +104,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
|
|||
"max_positive": None,
|
||||
"model": DEFAULT_SPANCAT_MODEL,
|
||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
|
@ -113,8 +114,9 @@ def make_spancat(
|
|||
suggester: Suggester,
|
||||
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
||||
spans_key: str,
|
||||
threshold: float = 0.5,
|
||||
max_positive: Optional[int] = None,
|
||||
scorer: Optional[Callable],
|
||||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
|
@ -144,9 +146,28 @@ def make_spancat(
|
|||
threshold=threshold,
|
||||
max_positive=max_positive,
|
||||
name=name,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
kwargs = dict(kwargs)
|
||||
attr_prefix = "spans_"
|
||||
key = kwargs["spans_key"]
|
||||
kwargs.setdefault("attr", f"{attr_prefix}{key}")
|
||||
kwargs.setdefault("allow_overlap", True)
|
||||
kwargs.setdefault(
|
||||
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
|
||||
)
|
||||
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
|
||||
return Scorer.score_spans(examples, **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.spancat_scorer.v1")
|
||||
def make_spancat_scorer():
|
||||
return spancat_score
|
||||
|
||||
|
||||
class SpanCategorizer(TrainablePipe):
|
||||
"""Pipeline component to label spans of text.
|
||||
|
||||
|
@ -163,8 +184,25 @@ class SpanCategorizer(TrainablePipe):
|
|||
spans_key: str = "spans",
|
||||
threshold: float = 0.5,
|
||||
max_positive: Optional[int] = None,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
) -> None:
|
||||
"""Initialize the span categorizer.
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||
During initialization and training, the component will look for
|
||||
spans on the reference document under the same key. Defaults to
|
||||
`"spans"`.
|
||||
threshold (float): Minimum probability to consider a prediction
|
||||
positive. Spans with a positive prediction will be saved on the Doc.
|
||||
Defaults to 0.5.
|
||||
max_positive (Optional[int]): Maximum number of labels to consider
|
||||
positive per span. Defaults to None, indicating no limit.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
|
||||
DOCS: https://spacy.io/api/spancategorizer#init
|
||||
"""
|
||||
|
@ -178,6 +216,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
self.suggester = suggester
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
|
@ -379,26 +418,6 @@ class SpanCategorizer(TrainablePipe):
|
|||
else:
|
||||
self.model.initialize()
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||
|
||||
DOCS: https://spacy.io/api/spancategorizer#score
|
||||
"""
|
||||
validate_examples(examples, "SpanCategorizer.score")
|
||||
self._validate_categories(examples)
|
||||
kwargs = dict(kwargs)
|
||||
attr_prefix = "spans_"
|
||||
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
|
||||
kwargs.setdefault("allow_overlap", True)
|
||||
kwargs.setdefault(
|
||||
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
|
||||
)
|
||||
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
|
||||
return Scorer.score_spans(examples, **kwargs)
|
||||
|
||||
def _validate_categories(self, examples: Iterable[Example]):
|
||||
# TODO
|
||||
pass
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Callable, Optional
|
||||
import numpy
|
||||
import srsly
|
||||
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
||||
|
@ -18,8 +19,11 @@ from ..parts_of_speech import X
|
|||
from ..errors import Errors, Warnings
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -41,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL},
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(nlp: Language, name: str, model: Model):
|
||||
def make_tagger(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
model: Model,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Construct a part-of-speech tagger component.
|
||||
|
||||
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
||||
|
@ -52,7 +62,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
|
|||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||
with the rows summing to 1).
|
||||
"""
|
||||
return Tagger(nlp.vocab, model, name)
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
|
||||
|
||||
|
||||
def tagger_score(examples, **kwargs):
|
||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.tagger_scorer.v1")
|
||||
def make_tagger_scorer():
|
||||
return tagger_score
|
||||
|
||||
|
||||
class Tagger(TrainablePipe):
|
||||
|
@ -60,13 +79,23 @@ class Tagger(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger
|
||||
"""
|
||||
def __init__(self, vocab, model, name="tagger"):
|
||||
def __init__(
|
||||
self,
|
||||
vocab,
|
||||
model,
|
||||
name="tagger",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=tagger_score,
|
||||
):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "tag".
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#init
|
||||
"""
|
||||
|
@ -74,8 +103,9 @@ class Tagger(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"labels": []}
|
||||
cfg = {"labels": [], "overwrite": overwrite}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -135,13 +165,13 @@ class Tagger(TrainablePipe):
|
|||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef Vocab vocab = self.vocab
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
# Don't clobber preset POS tags
|
||||
if doc.c[j].tag == 0:
|
||||
if doc.c[j].tag == 0 or overwrite:
|
||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
|
@ -289,15 +319,3 @@ class Tagger(TrainablePipe):
|
|||
self.cfg["labels"].append(label)
|
||||
self.vocab.strings.add(label)
|
||||
return 1
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by
|
||||
Scorer.score_token_attr for the attributes "tag".
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#score
|
||||
"""
|
||||
validate_examples(examples, "Tagger.score")
|
||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||
|
|
|
@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
|
|||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc
|
||||
from ..util import registry
|
||||
from ..vocab import Vocab
|
||||
|
||||
|
||||
|
@ -70,7 +71,11 @@ subword_features = true
|
|||
@Language.factory(
|
||||
"textcat",
|
||||
assigns=["doc.cats"],
|
||||
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
|
||||
default_config={
|
||||
"threshold": 0.5,
|
||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
"cats_score_desc": None,
|
||||
|
@ -86,7 +91,11 @@ subword_features = true
|
|||
},
|
||||
)
|
||||
def make_textcat(
|
||||
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
|
||||
nlp: Language,
|
||||
name: str,
|
||||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
|
@ -95,8 +104,23 @@ def make_textcat(
|
|||
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
||||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
|
||||
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
|
||||
|
||||
|
||||
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
return Scorer.score_cats(
|
||||
examples,
|
||||
"cats",
|
||||
multi_label=False,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@registry.scorers("spacy.textcat_scorer.v1")
|
||||
def make_textcat_scorer():
|
||||
return textcat_score
|
||||
|
||||
|
||||
class TextCategorizer(TrainablePipe):
|
||||
|
@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "textcat",
|
||||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_score,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for single-label classification.
|
||||
|
||||
|
@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe):
|
|||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_cats for the attribute "cats".
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
|
@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe):
|
|||
self._rehearsal_model = None
|
||||
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def labels(self) -> Tuple[str]:
|
||||
|
@ -353,26 +386,6 @@ class TextCategorizer(TrainablePipe):
|
|||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#score
|
||||
"""
|
||||
validate_examples(examples, "TextCategorizer.score")
|
||||
self._validate_categories(examples)
|
||||
kwargs.setdefault("threshold", self.cfg["threshold"])
|
||||
kwargs.setdefault("positive_label", self.cfg["positive_label"])
|
||||
return Scorer.score_cats(
|
||||
examples,
|
||||
"cats",
|
||||
labels=self.labels,
|
||||
multi_label=False,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _validate_categories(self, examples: Iterable[Example]):
|
||||
"""Check whether the provided examples all have single-label cats annotations."""
|
||||
for ex in examples:
|
||||
|
|
|
@ -5,10 +5,11 @@ from thinc.api import Model, Config
|
|||
from thinc.types import Floats2d
|
||||
|
||||
from ..language import Language
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..training import Example, validate_get_examples
|
||||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc
|
||||
from ..util import registry
|
||||
from ..vocab import Vocab
|
||||
from .textcat import TextCategorizer
|
||||
|
||||
|
@ -70,7 +71,11 @@ subword_features = true
|
|||
@Language.factory(
|
||||
"textcat_multilabel",
|
||||
assigns=["doc.cats"],
|
||||
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
|
||||
default_config={
|
||||
"threshold": 0.5,
|
||||
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
"cats_score_desc": None,
|
||||
|
@ -86,7 +91,11 @@ subword_features = true
|
|||
},
|
||||
)
|
||||
def make_multilabel_textcat(
|
||||
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
|
||||
nlp: Language,
|
||||
name: str,
|
||||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
|
@ -97,7 +106,23 @@ def make_multilabel_textcat(
|
|||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
"""
|
||||
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
|
||||
return MultiLabel_TextCategorizer(
|
||||
nlp.vocab, model, name, threshold=threshold, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
return Scorer.score_cats(
|
||||
examples,
|
||||
"cats",
|
||||
multi_label=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
|
||||
def make_textcat_multilabel_scorer():
|
||||
return textcat_multilabel_score
|
||||
|
||||
|
||||
class MultiLabel_TextCategorizer(TextCategorizer):
|
||||
|
@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
name: str = "textcat_multilabel",
|
||||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_multilabel_score,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for multi-label classification.
|
||||
|
||||
|
@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
self._rehearsal_model = None
|
||||
cfg = {"labels": [], "threshold": threshold}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
|
||||
def initialize( # type: ignore[override]
|
||||
self,
|
||||
|
@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#score
|
||||
"""
|
||||
validate_examples(examples, "MultiLabel_TextCategorizer.score")
|
||||
kwargs.setdefault("threshold", self.cfg["threshold"])
|
||||
return Scorer.score_cats(
|
||||
examples,
|
||||
"cats",
|
||||
labels=self.labels,
|
||||
multi_label=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _validate_categories(self, examples: Iterable[Example]):
|
||||
"""This component allows any type of single- or multi-label annotations.
|
||||
This method overwrites the more strict one from 'textcat'."""
|
||||
|
|
|
@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
|
|||
cdef public Vocab vocab
|
||||
cdef public object model
|
||||
cdef public object cfg
|
||||
cdef public object scorer
|
||||
|
|
|
@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
|
|||
beam_density=0.0,
|
||||
beam_update_prob=0.0,
|
||||
multitasks=tuple(),
|
||||
incorrect_spans_key=None
|
||||
incorrect_spans_key=None,
|
||||
scorer=None,
|
||||
):
|
||||
"""Create a Parser.
|
||||
|
||||
|
@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe):
|
|||
incorrect_spans_key (Optional[str]): Identifies spans that are known
|
||||
to be incorrect entity annotations. The incorrect entity annotations
|
||||
can be stored in the span group, under this key.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to None.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.name = name
|
||||
|
@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe):
|
|||
self.add_multitask_objective(multitask)
|
||||
|
||||
self._rehearsal_model = None
|
||||
self.scorer = scorer
|
||||
|
||||
def __getnewargs_ex__(self):
|
||||
"""This allows pickling the Parser and its keyword-only init arguments"""
|
||||
|
|
|
@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel):
|
|||
# fmt: off
|
||||
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
|
||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
|
||||
n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch")
|
||||
n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch")
|
||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||
corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||
|
|
|
@ -247,18 +247,21 @@ class Scorer:
|
|||
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
"""Return PRF scores per feat for a token attribute in UFEATS format.
|
||||
"""Return micro PRF and PRF scores per feat for a token attribute in
|
||||
UFEATS format.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute to score.
|
||||
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
|
||||
getter(token, attr) should return the value of the attribute for an
|
||||
individual token.
|
||||
missing_values (Set[Any]): Attribute values to treat as missing annotation
|
||||
in the reference annotation.
|
||||
RETURNS (dict): A dictionary containing the per-feat PRF scores under
|
||||
the key attr_per_feat.
|
||||
missing_values (Set[Any]): Attribute values to treat as missing
|
||||
annotation in the reference annotation.
|
||||
RETURNS (dict): A dictionary containing the micro PRF scores under the
|
||||
key attr_micro_p/r/f and the per-feat PRF scores under
|
||||
attr_per_feat.
|
||||
"""
|
||||
micro_score = PRFScore()
|
||||
per_feat = {}
|
||||
for example in examples:
|
||||
pred_doc = example.predicted
|
||||
|
@ -300,15 +303,22 @@ class Scorer:
|
|||
pred_per_feat[field] = set()
|
||||
pred_per_feat[field].add((gold_i, feat))
|
||||
for field in per_feat:
|
||||
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
|
||||
per_feat[field].score_set(
|
||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
|
||||
)
|
||||
score_key = f"{attr}_per_feat"
|
||||
if any([len(v) for v in per_feat.values()]):
|
||||
result = {k: v.to_dict() for k, v in per_feat.items()}
|
||||
return {score_key: result}
|
||||
result: Dict[str, Any] = {}
|
||||
if len(micro_score) > 0:
|
||||
result[f"{attr}_micro_p"] = micro_score.precision
|
||||
result[f"{attr}_micro_r"] = micro_score.recall
|
||||
result[f"{attr}_micro_f"] = micro_score.fscore
|
||||
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
|
||||
else:
|
||||
return {score_key: None}
|
||||
result[f"{attr}_micro_p"] = None
|
||||
result[f"{attr}_micro_r"] = None
|
||||
result[f"{attr}_micro_f"] = None
|
||||
result[f"{attr}_per_feat"] = None
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def score_spans(
|
||||
|
@ -545,7 +555,7 @@ class Scorer:
|
|||
|
||||
@staticmethod
|
||||
def score_links(
|
||||
examples: Iterable[Example], *, negative_labels: Iterable[str]
|
||||
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns PRF for predicted links on the entity level.
|
||||
To disentangle the performance of the NEL from the NER,
|
||||
|
@ -721,7 +731,7 @@ class Scorer:
|
|||
}
|
||||
|
||||
|
||||
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
|
||||
score_per_type = defaultdict(PRFScore)
|
||||
for eg in examples:
|
||||
|
|
|
@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
|
|||
from .typedefs cimport attr_t, hash_t
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0
|
||||
cpdef hash_t hash_string(str string) except 0
|
||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
||||
|
||||
cdef unicode decode_Utf8Str(const Utf8Str* string)
|
||||
cdef str decode_Utf8Str(const Utf8Str* string)
|
||||
|
||||
|
||||
ctypedef union Utf8Str:
|
||||
|
@ -25,5 +25,5 @@ cdef class StringStore:
|
|||
cdef vector[hash_t] keys
|
||||
cdef public PreshMap _map
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||
|
|
|
@ -33,7 +33,7 @@ def get_string_id(key):
|
|||
return hash_utf8(chars, len(chars))
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
cpdef hash_t hash_string(str string) except 0:
|
||||
chars = string.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
|
||||
|
@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
|||
return hash32(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef unicode decode_Utf8Str(const Utf8Str* string):
|
||||
cdef str decode_Utf8Str(const Utf8Str* string):
|
||||
cdef int i, length
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1].decode("utf8")
|
||||
|
@ -107,17 +107,17 @@ cdef class StringStore:
|
|||
def __getitem__(self, object string_or_id):
|
||||
"""Retrieve a string from a given hash, or vice versa.
|
||||
|
||||
string_or_id (bytes, unicode or uint64): The value to encode.
|
||||
string_or_id (bytes, str or uint64): The value to encode.
|
||||
Returns (str / uint64): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
if isinstance(string_or_id, str) and len(string_or_id) == 0:
|
||||
return 0
|
||||
elif string_or_id == 0:
|
||||
return ""
|
||||
elif string_or_id in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string_or_id]
|
||||
cdef hash_t key
|
||||
if isinstance(string_or_id, unicode):
|
||||
if isinstance(string_or_id, str):
|
||||
key = hash_string(string_or_id)
|
||||
return key
|
||||
elif isinstance(string_or_id, bytes):
|
||||
|
@ -135,14 +135,14 @@ cdef class StringStore:
|
|||
|
||||
def as_int(self, key):
|
||||
"""If key is an int, return it; otherwise, get the int value."""
|
||||
if not isinstance(key, basestring):
|
||||
if not isinstance(key, str):
|
||||
return key
|
||||
else:
|
||||
return self[key]
|
||||
|
||||
def as_string(self, key):
|
||||
"""If key is a string, return it; otherwise, get the string value."""
|
||||
if isinstance(key, basestring):
|
||||
if isinstance(key, str):
|
||||
return key
|
||||
else:
|
||||
return self[key]
|
||||
|
@ -153,7 +153,7 @@ cdef class StringStore:
|
|||
string (str): The string to add.
|
||||
RETURNS (uint64): The string's hash value.
|
||||
"""
|
||||
if isinstance(string, unicode):
|
||||
if isinstance(string, str):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_string(string)
|
||||
|
@ -189,7 +189,7 @@ cdef class StringStore:
|
|||
return True
|
||||
elif string in SYMBOLS_BY_STR:
|
||||
return True
|
||||
elif isinstance(string, unicode):
|
||||
elif isinstance(string, str):
|
||||
key = hash_string(string)
|
||||
else:
|
||||
string = string.encode("utf8")
|
||||
|
@ -269,7 +269,7 @@ cdef class StringStore:
|
|||
for string in strings:
|
||||
self.add(string)
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string):
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef bytes byte_string = py_string.encode("utf8")
|
||||
return self._intern_utf8(byte_string, len(byte_string))
|
||||
|
|
|
@ -5,9 +5,11 @@ from spacy.compat import pickle
|
|||
def test_pickle_single_doc():
|
||||
nlp = Language()
|
||||
doc = nlp("pickle roundtrip")
|
||||
doc._context = 3
|
||||
data = pickle.dumps(doc, 1)
|
||||
doc2 = pickle.loads(data)
|
||||
assert doc2.text == "pickle roundtrip"
|
||||
assert doc2._context == 3
|
||||
|
||||
|
||||
def test_list_of_docs_pickles_efficiently():
|
||||
|
|
|
@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
|
|||
|
||||
|
||||
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
|
||||
text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
|
||||
tokens = ca_tokenizer(text)
|
||||
assert len(tokens) == 15
|
||||
assert tokens[7].text == "aprox."
|
||||
text = "La Dra. Puig viu a la pl. dels Til·lers."
|
||||
doc = ca_tokenizer(text)
|
||||
assert [t.text for t in doc] == [
|
||||
"La",
|
||||
"Dra.",
|
||||
"Puig",
|
||||
"viu",
|
||||
"a",
|
||||
"la",
|
||||
"pl.",
|
||||
"d",
|
||||
"els",
|
||||
"Til·lers",
|
||||
".",
|
||||
]
|
||||
|
|
|
@ -2,7 +2,14 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
|
||||
"text,expected_tokens",
|
||||
[
|
||||
("d'un", ["d'", "un"]),
|
||||
("s'ha", ["s'", "ha"]),
|
||||
("del", ["d", "el"]),
|
||||
("cantar-te", ["cantar", "-te"]),
|
||||
("-hola", ["-", "hola"]),
|
||||
],
|
||||
)
|
||||
def test_contractions(ca_tokenizer, text, expected_tokens):
|
||||
"""Test that the contractions are split into two tokens"""
|
||||
|
|
|
@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
|
|||
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
|
||||
|
||||
tokens = ca_tokenizer(text)
|
||||
assert len(tokens) == 140
|
||||
assert len(tokens) == 146
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
("Perquè va anar-hi?", 4),
|
||||
("Perquè va anar-hi?", 5),
|
||||
("El cotxe dels veins.", 6),
|
||||
("“Ah no?”", 5),
|
||||
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
||||
("Van córrer aprox. 10km", 5),
|
||||
("Llavors perqué...", 3),
|
||||
("Vull parlar-te'n demà al matí", 8),
|
||||
("Vull explicar-t'ho demà al matí", 8),
|
||||
],
|
||||
)
|
||||
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
|
||||
|
|
|
@ -8,3 +8,17 @@ import pytest
|
|||
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
|
||||
test_lemma = ja_tokenizer(word)[0].lemma_
|
||||
assert test_lemma == lemma
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"word,norm",
|
||||
[
|
||||
("SUMMER", "サマー"),
|
||||
("食べ物", "食べ物"),
|
||||
("綜合", "総合"),
|
||||
("コンピュータ", "コンピューター"),
|
||||
],
|
||||
)
|
||||
def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
|
||||
test_norm = ja_tokenizer(word)[0].norm_
|
||||
assert test_norm == norm
|
||||
|
|
9
spacy/tests/lang/ja/test_morphologizer_factory.py
Normal file
9
spacy/tests/lang/ja/test_morphologizer_factory.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
import pytest
|
||||
from spacy.lang.ja import Japanese
|
||||
|
||||
|
||||
def test_ja_morphologizer_factory():
|
||||
pytest.importorskip("sudachipy")
|
||||
nlp = Japanese()
|
||||
morphologizer = nlp.add_pipe("morphologizer")
|
||||
assert morphologizer.cfg["extend"] is True
|
|
@ -1,3 +1,5 @@
|
|||
import pickle
|
||||
|
||||
from spacy.lang.ja import Japanese
|
||||
from ...util import make_tempdir
|
||||
|
||||
|
@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
|
|||
nlp_r.from_disk(d)
|
||||
assert nlp_bytes == nlp_r.to_bytes()
|
||||
assert nlp_r.tokenizer.split_mode == "B"
|
||||
|
||||
|
||||
def test_ja_tokenizer_pickle(ja_tokenizer):
|
||||
b = pickle.dumps(ja_tokenizer)
|
||||
ja_tokenizer_re = pickle.loads(b)
|
||||
assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()
|
||||
|
|
|
@ -34,22 +34,22 @@ SENTENCE_TESTS = [
|
|||
]
|
||||
|
||||
tokens1 = [
|
||||
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
|
||||
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
|
||||
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
|
||||
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
|
||||
]
|
||||
tokens2 = [
|
||||
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
|
||||
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
|
||||
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
|
||||
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
|
||||
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
|
||||
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
|
||||
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
|
||||
DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
|
||||
]
|
||||
tokens3 = [
|
||||
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
|
||||
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
|
||||
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None),
|
||||
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
|
||||
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
|
||||
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
|
||||
]
|
||||
SUB_TOKEN_TESTS = [
|
||||
("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]])
|
||||
("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
|||
assert len(nlp_c(text)) == len_c
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
||||
)
|
||||
@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
|
||||
def test_ja_tokenizer_sub_tokens(
|
||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
||||
ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
|
||||
):
|
||||
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
|
||||
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
|
||||
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
|
||||
|
||||
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
|
||||
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
|
||||
assert ja_tokenizer(text).user_data.get("sub_tokens") is None
|
||||
assert nlp_a(text).user_data.get("sub_tokens") is None
|
||||
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
|
||||
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
|
||||
|
||||
|
@ -132,16 +130,24 @@ def test_ja_tokenizer_sub_tokens(
|
|||
[
|
||||
(
|
||||
"取ってつけた",
|
||||
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
|
||||
("トッ", "テ", "ツケ", "タ"),
|
||||
(["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
|
||||
(["トッ"], ["テ"], ["ツケ"], ["タ"]),
|
||||
),
|
||||
(
|
||||
"2=3",
|
||||
([], [], []),
|
||||
(["ニ"], ["_"], ["サン"])
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_ja_tokenizer_inflections_reading_forms(
|
||||
ja_tokenizer, text, inflections, reading_forms
|
||||
):
|
||||
assert ja_tokenizer(text).user_data["inflections"] == inflections
|
||||
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
|
||||
tokens = ja_tokenizer(text)
|
||||
test_inflections = [tt.morph.get("Inflection") for tt in tokens]
|
||||
assert test_inflections == list(inflections)
|
||||
test_readings = [tt.morph.get("Reading") for tt in tokens]
|
||||
assert test_readings == list(reading_forms)
|
||||
|
||||
|
||||
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
|
||||
|
|
24
spacy/tests/lang/ko/test_serialize.py
Normal file
24
spacy/tests/lang/ko/test_serialize.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
import pickle
|
||||
|
||||
from spacy.lang.ko import Korean
|
||||
from ...util import make_tempdir
|
||||
|
||||
|
||||
def test_ko_tokenizer_serialize(ko_tokenizer):
|
||||
tokenizer_bytes = ko_tokenizer.to_bytes()
|
||||
nlp = Korean()
|
||||
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||||
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "tokenizer"
|
||||
ko_tokenizer.to_disk(file_path)
|
||||
nlp = Korean()
|
||||
nlp.tokenizer.from_disk(file_path)
|
||||
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||
|
||||
|
||||
def test_ko_tokenizer_pickle(ko_tokenizer):
|
||||
b = pickle.dumps(ko_tokenizer)
|
||||
ko_tokenizer_re = pickle.loads(b)
|
||||
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
|
|||
("www.google.com", True),
|
||||
("google.com", True),
|
||||
("sydney.com", True),
|
||||
("2girls1cup.org", True),
|
||||
("1abc2def.org", True),
|
||||
("http://stupid", True),
|
||||
("www.hi", True),
|
||||
("example.com/example", True),
|
||||
("dog", False),
|
||||
("1.2", False),
|
||||
("1.a", False),
|
||||
|
|
24
spacy/tests/lang/th/test_serialize.py
Normal file
24
spacy/tests/lang/th/test_serialize.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
import pickle
|
||||
|
||||
from spacy.lang.th import Thai
|
||||
from ...util import make_tempdir
|
||||
|
||||
|
||||
def test_th_tokenizer_serialize(th_tokenizer):
|
||||
tokenizer_bytes = th_tokenizer.to_bytes()
|
||||
nlp = Thai()
|
||||
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||||
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "tokenizer"
|
||||
th_tokenizer.to_disk(file_path)
|
||||
nlp = Thai()
|
||||
nlp.tokenizer.from_disk(file_path)
|
||||
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||
|
||||
|
||||
def test_th_tokenizer_pickle(th_tokenizer):
|
||||
b = pickle.dumps(th_tokenizer)
|
||||
th_tokenizer_re = pickle.loads(b)
|
||||
assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()
|
|
@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length):
|
|||
("10.000", True),
|
||||
("1000", True),
|
||||
("999,0", True),
|
||||
("ሐደ", True),
|
||||
("ሓደ", True),
|
||||
("ክልተ", True),
|
||||
("ትሪልዮን", True),
|
||||
("ከልቢ", False),
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import pickle
|
||||
|
||||
from spacy.lang.vi import Vietnamese
|
||||
from ...util import make_tempdir
|
||||
|
||||
|
@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
|
|||
nlp_r.from_disk(d)
|
||||
assert nlp_bytes == nlp_r.to_bytes()
|
||||
assert nlp_r.tokenizer.use_pyvi is False
|
||||
|
||||
|
||||
def test_vi_tokenizer_pickle(vi_tokenizer):
|
||||
b = pickle.dumps(vi_tokenizer)
|
||||
vi_tokenizer_re = pickle.loads(b)
|
||||
assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()
|
||||
|
|
|
@ -32,24 +32,6 @@ def pattern_dicts():
|
|||
]
|
||||
|
||||
|
||||
@registry.misc("attribute_ruler_patterns")
|
||||
def attribute_ruler_patterns():
|
||||
return [
|
||||
{
|
||||
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
|
||||
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
||||
},
|
||||
# one pattern sets the lemma
|
||||
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
|
||||
# another pattern sets the morphology
|
||||
{
|
||||
"patterns": [[{"ORTH": "test"}]],
|
||||
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
|
||||
"index": 0,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tag_map():
|
||||
return {
|
||||
|
@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
nlp.remove_pipe("attribute_ruler")
|
||||
|
||||
# initialize with patterns from misc registry
|
||||
@registry.misc("attribute_ruler_patterns")
|
||||
def attribute_ruler_patterns():
|
||||
return [
|
||||
{
|
||||
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
|
||||
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
||||
},
|
||||
# one pattern sets the lemma
|
||||
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
|
||||
# another pattern sets the morphology
|
||||
{
|
||||
"patterns": [[{"ORTH": "test"}]],
|
||||
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
|
||||
"index": 0,
|
||||
},
|
||||
]
|
||||
|
||||
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
||||
"patterns": {"@misc": "attribute_ruler_patterns"}
|
||||
}
|
||||
|
@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
|
|||
assert scores["lemma_acc"] == pytest.approx(0.2)
|
||||
# no morphs are set
|
||||
assert scores["morph_acc"] is None
|
||||
nlp.remove_pipe("attribute_ruler")
|
||||
|
||||
# test with custom scorer
|
||||
@registry.misc("weird_scorer.v1")
|
||||
def make_weird_scorer():
|
||||
def weird_scorer(examples, weird_score, **kwargs):
|
||||
return {"weird_score": weird_score}
|
||||
|
||||
return weird_scorer
|
||||
|
||||
ruler = nlp.add_pipe(
|
||||
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
|
||||
)
|
||||
ruler.initialize(lambda: [], patterns=pattern_dicts)
|
||||
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
|
||||
assert scores["weird_score"] == 0.12345
|
||||
assert "token_acc" in scores
|
||||
assert "lemma_acc" not in scores
|
||||
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
|
||||
assert scores["weird_score"] == 0.23456
|
||||
|
||||
|
||||
def test_attributeruler_rule_order(nlp):
|
||||
|
|
|
@ -8,6 +8,7 @@ from spacy.language import Language
|
|||
from spacy.tests.util import make_tempdir
|
||||
from spacy.morphology import Morphology
|
||||
from spacy.attrs import MORPH
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_label_types():
|
||||
|
@ -137,6 +138,41 @@ def test_overfitting_IO():
|
|||
assert [str(t.morph) for t in doc] == gold_morphs
|
||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||
|
||||
# Test overwrite+extend settings
|
||||
# (note that "" is unset, "_" is set and empty)
|
||||
morphs = ["Feat=V", "Feat=N", "_"]
|
||||
doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
|
||||
orig_morphs = [str(t.morph) for t in doc]
|
||||
orig_pos_tags = [t.pos_ for t in doc]
|
||||
morphologizer = nlp.get_pipe("morphologizer")
|
||||
|
||||
# don't overwrite or extend
|
||||
morphologizer.cfg["overwrite"] = False
|
||||
doc = morphologizer(doc)
|
||||
assert [str(t.morph) for t in doc] == orig_morphs
|
||||
assert [t.pos_ for t in doc] == orig_pos_tags
|
||||
|
||||
# overwrite and extend
|
||||
morphologizer.cfg["overwrite"] = True
|
||||
morphologizer.cfg["extend"] = True
|
||||
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
|
||||
doc = morphologizer(doc)
|
||||
assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]
|
||||
|
||||
# extend without overwriting
|
||||
morphologizer.cfg["overwrite"] = False
|
||||
morphologizer.cfg["extend"] = True
|
||||
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
|
||||
doc = morphologizer(doc)
|
||||
assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]
|
||||
|
||||
# overwrite without extending
|
||||
morphologizer.cfg["overwrite"] = True
|
||||
morphologizer.cfg["extend"] = False
|
||||
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
|
||||
doc = morphologizer(doc)
|
||||
assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]
|
||||
|
||||
# Test with unset morph and partial POS
|
||||
nlp.remove_pipe("morphologizer")
|
||||
nlp.add_pipe("morphologizer")
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
import pytest
|
||||
import pickle
|
||||
from thinc.api import get_current_ops
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.strings import StringStore
|
||||
from spacy.vectors import Vectors
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
|
|||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
def test_pickle_vocab(strings, lex_attr):
|
||||
vocab = Vocab(strings=strings)
|
||||
ops = get_current_ops()
|
||||
vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
|
||||
vocab.vectors = vectors
|
||||
vocab[strings[0]].norm_ = lex_attr
|
||||
vocab_pickled = pickle.dumps(vocab)
|
||||
vocab_unpickled = pickle.loads(vocab_pickled)
|
||||
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
|
||||
assert vocab_unpickled.vectors.mode == "floret"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import pytest
|
||||
from click import NoSuchOption
|
||||
from packaging.specifiers import SpecifierSet
|
||||
from spacy.training import docs_to_json, offsets_to_biluo_tags
|
||||
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||
|
@ -491,19 +492,27 @@ def test_string_to_list_intify(value):
|
|||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
|
||||
def test_download_compatibility():
|
||||
model_name = "en_core_web_sm"
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||
spec = SpecifierSet("==" + about.__version__)
|
||||
spec.prereleases = False
|
||||
if about.__version__ in spec:
|
||||
model_name = "en_core_web_sm"
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
|
||||
def test_validate_compatibility_table():
|
||||
model_pkgs, compat = get_model_pkgs()
|
||||
spacy_version = get_minor_version(about.__version__)
|
||||
current_compat = compat.get(spacy_version, {})
|
||||
assert len(current_compat) > 0
|
||||
assert "en_core_web_sm" in current_compat
|
||||
spec = SpecifierSet("==" + about.__version__)
|
||||
spec.prereleases = False
|
||||
if about.__version__ in spec:
|
||||
model_pkgs, compat = get_model_pkgs()
|
||||
spacy_version = get_minor_version(about.__version__)
|
||||
current_compat = compat.get(spacy_version, {})
|
||||
assert len(current_compat) > 0
|
||||
assert "en_core_web_sm" in current_compat
|
||||
|
||||
|
||||
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])
|
||||
|
|
|
@ -8,7 +8,7 @@ from spacy.vocab import Vocab
|
|||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.util import registry, ignore_error, raise_error
|
||||
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
||||
import spacy
|
||||
from thinc.api import CupyOps, NumpyOps, get_current_ops
|
||||
|
||||
|
@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
|
|||
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
|
||||
"""Test the error handling of nlp.pipe with input as tuples"""
|
||||
Language.component("my_evil_component", func=evil_component)
|
||||
ops = get_current_ops()
|
||||
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||
nlp = English()
|
||||
nlp.add_pipe("my_evil_component")
|
||||
texts = [
|
||||
("TEXT 111", 111),
|
||||
("TEXT 222", 222),
|
||||
("TEXT 333", 333),
|
||||
("TEXT 342", 342),
|
||||
("TEXT 666", 666),
|
||||
]
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts, as_tuples=True))
|
||||
nlp.set_error_handler(warn_error)
|
||||
logger = logging.getLogger("spacy")
|
||||
with mock.patch.object(logger, "warning") as mock_warning:
|
||||
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
|
||||
# HACK/TODO? the warnings in child processes don't seem to be
|
||||
# detected by the mock logger
|
||||
if n_process == 1:
|
||||
mock_warning.assert_called()
|
||||
assert mock_warning.call_count == 2
|
||||
assert len(tuples) + mock_warning.call_count == len(texts)
|
||||
assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
|
||||
assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
|
||||
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
||||
"""Test the error handling of a component's pipe method"""
|
||||
|
@ -512,6 +544,55 @@ def test_spacy_blank():
|
|||
assert nlp.meta["name"] == "my_custom_model"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"lang,target",
|
||||
[
|
||||
("en", "en"),
|
||||
("fra", "fr"),
|
||||
("fre", "fr"),
|
||||
("iw", "he"),
|
||||
("mo", "ro"),
|
||||
("mul", "xx"),
|
||||
("no", "nb"),
|
||||
("pt-BR", "pt"),
|
||||
("xx", "xx"),
|
||||
("zh-Hans", "zh"),
|
||||
("zh-Hant", None),
|
||||
("zxx", None),
|
||||
],
|
||||
)
|
||||
def test_language_matching(lang, target):
|
||||
"""
|
||||
Test that we can look up languages by equivalent or nearly-equivalent
|
||||
language codes.
|
||||
"""
|
||||
assert find_matching_language(lang) == target
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"lang,target",
|
||||
[
|
||||
("en", "en"),
|
||||
("fra", "fr"),
|
||||
("fre", "fr"),
|
||||
("iw", "he"),
|
||||
("mo", "ro"),
|
||||
("mul", "xx"),
|
||||
("no", "nb"),
|
||||
("pt-BR", "pt"),
|
||||
("xx", "xx"),
|
||||
("zh-Hans", "zh"),
|
||||
],
|
||||
)
|
||||
def test_blank_languages(lang, target):
|
||||
"""
|
||||
Test that we can get spacy.blank in various languages, including codes
|
||||
that are defined to be equivalent or that match by CLDR language matching.
|
||||
"""
|
||||
nlp = spacy.blank(lang)
|
||||
assert nlp.lang == target
|
||||
|
||||
|
||||
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
||||
def test_language_init_invalid_vocab(value):
|
||||
err_fragment = "invalid value"
|
||||
|
@ -540,6 +621,32 @@ def test_language_source_and_vectors(nlp2):
|
|||
assert nlp.vocab.vectors.to_bytes() == vectors_bytes
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_pass_doc_to_pipeline(nlp, n_process):
|
||||
texts = ["cats", "dogs", "guinea pigs"]
|
||||
docs = [nlp.make_doc(text) for text in texts]
|
||||
assert not any(len(doc.cats) for doc in docs)
|
||||
doc = nlp(docs[0])
|
||||
assert doc.text == texts[0]
|
||||
assert len(doc.cats) > 0
|
||||
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
|
||||
docs = nlp.pipe(docs, n_process=n_process)
|
||||
assert [doc.text for doc in docs] == texts
|
||||
assert all(len(doc.cats) for doc in docs)
|
||||
|
||||
|
||||
def test_invalid_arg_to_pipeline(nlp):
|
||||
str_list = ["This is a text.", "This is another."]
|
||||
with pytest.raises(ValueError):
|
||||
nlp(str_list) # type: ignore
|
||||
assert len(list(nlp.pipe(str_list))) == 2
|
||||
int_list = [1, 2, 3]
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(int_list)) # type: ignore
|
||||
with pytest.raises(ValueError):
|
||||
nlp(int_list) # type: ignore
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
|
||||
)
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user